// about | summary | refs | log | blame | commit | diff
// path: root/src/rtl/modexpa7_systolic_multiplier.v
// blob: 32ed5433a0a45501413ff50a5958911f0cc08c99 (plain) (tree)



























































                                                                                                              




                                                                          
 



                                                                                 










































































                                                                                                          
                                                                  





                                                                      
                         
                         
                   
                            
                    
                         
















                                                                                                                                                                      
                         
                 













                                                                                                                            
                                 










                                                                


                                                                                                    


                                                                                       




                                                                                        




                                                          
                                                         
                          
                                                






                                                                     

                                                              
                                                                





                                                                                  


                          
                    
                         
                                 

















                                                                                                                     

                                                              

                                         








                                                                                                              

                                   

                                                                                         


                                                            
        




                                                   
 




                                                                                      

                                                           
 
                                 


















                                                                                                                                
                            

                                                 
 






                                           

                                                                                                    
                        
 
 
                   
                                               
                    


                             
                                 
                






                                                                                                           

                                        
         
 
                   
                                                
                    




                                                                         
                        

                   
 



































                                                                                                      
                                                                                       

                                                             

                                                       


                                                            












                                                            




                              
                   






















                                                                                                                                                                                                      
                                                                                                                                                                      
                           



                                                                                                                                                                                                       
                          
                                                                                                                                                                        










                                                                        
//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// modexpa7_systolic_multiplier
//
// Control wrapper around modexpa7_systolic_multiplier_array. A rising edge
// on `ena` while the FSM is idle starts one operation:
//
//   1. n_num_words is latched and `rdy` is cleared;
//   2. operand B is read word by word through b_bram_addr / b_bram_out and
//      repacked into SYSTOLIC_ARRAY_LENGTH parallel "loader" memories, so
//      that the array can later fetch SYSTOLIC_ARRAY_LENGTH words of B per
//      read (FSM_STATE_LOAD_*);
//   3. the processing element array is enabled for one cycle and the FSM
//      waits for its ready flag (FSM_STATE_MULT_*);
//   4. `rdy` is set again in FSM_STATE_STOP and the FSM returns to idle.
//
// NOTE(review): in this revision n_bram_addr, n_coeff_bram_addr,
// r_bram_addr, r_bram_in and r_bram_wr are declared but never driven, and
// n_bram_out / n_coeff_bram_out are never read. The read port of the
// internal product memory (p_addr_ext_rd / p_data_out) is not used either.
// This looks like work in progress - confirm before integrating.
//
module modexpa7_systolic_multiplier #
	(
			//
			// This sets the address widths of memory buffers. Internal data
			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
			// 2 ** 6 = 64.
			//
		parameter	OPERAND_ADDR_WIDTH		= 4,
		
			//
			// Width of the per-batch word counter (load_mult_cnt): the loader
			// shifts in 2 ** SYSTOLIC_ARRAY_POWER words of B before every
			// parallel write. Presumably this is log2 of the number of
			// processing elements, i.e. SYSTOLIC_ARRAY_LENGTH from the
			// included settings equals 2 ** SYSTOLIC_ARRAY_POWER - TODO confirm.
			//
		parameter	SYSTOLIC_ARRAY_POWER		= 2
	)
	(
		input											clk,
		input											rst_n,		// asynchronous, active-low reset

		input											ena,			// rising edge starts an operation
		output										rdy,			// high while idle, low while busy

		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,			// driven by the multiplier array
		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,			// driven by the loader below
		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,			// NOTE(review): not driven here
		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,	// NOTE(review): not driven here
		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,			// NOTE(review): not driven here

		input		[                32-1:0]	a_bram_out,
		input		[                32-1:0]	b_bram_out,
		input		[                32-1:0]	n_bram_out,				// NOTE(review): not read here
		input		[                32-1:0]	n_coeff_bram_out,		// NOTE(review): not read here

		output	[                32-1:0]	r_bram_in,				// NOTE(review): not driven here
		output										r_bram_wr,				// NOTE(review): not driven here

			// used below as the address of the last operand word (see
			// bram_addr_last), i.e. it appears to hold "word count - 1"
		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
	);
	
		
		/*
		 * Include Settings
		 *
		 * SYSTOLIC_ARRAY_LENGTH and SYSTOLIC_CNTR_WIDTH used below are not
		 * declared in this file, so they are expected to come from these
		 * include files.
		 */
	`include "pe/modexpa7_primitive_switch.v"
	`include "modexpa7_settings.v"
		

		/*
		 * FSM Declaration
		 */
	localparam	[ 7: 0]	FSM_STATE_IDLE				= 8'h00;
	
	localparam	[ 7: 0]	FSM_STATE_LOAD_START		= 8'h11;
	localparam	[ 7: 0]	FSM_STATE_LOAD_SHIFT		= 8'h12;
	localparam	[ 7: 0]	FSM_STATE_LOAD_WRITE		= 8'h13;
	localparam	[ 7: 0]	FSM_STATE_LOAD_FINAL		= 8'h14;

	localparam	[ 7: 0]	FSM_STATE_MULT_START		= 8'h21;
	localparam	[ 7: 0]	FSM_STATE_MULT_CRUNCH	= 8'h22;
	localparam	[ 7: 0]	FSM_STATE_MULT_FINAL		= 8'h23;
	
	localparam	[ 7: 0]	FSM_STATE_STOP				= 8'hFF;
	
		/*
		 * FSM State / Next State
		 */
	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
	reg	[ 7: 0]	fsm_next_state;


		/*
		 * Enable Delay and Trigger
		 */
   reg ena_dly = 1'b0;
	
		// delay enable by one clock cycle
   always @(posedge clk) ena_dly <= ena;

		// trigger new operation when enable goes high
   wire ena_trig = ena && !ena_dly;
	
	
		/*
		 * Ready Flag Logic
		 */
	reg rdy_reg = 1'b1;
	assign rdy = rdy_reg;

   always @(posedge clk or negedge rst_n)
		
			// reset flag
		if (rst_n == 1'b0) rdy_reg <= 1'b1;
		else begin
		
				// clear flag when operation is started
			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
			
				// set flag after operation is finished
			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;			
			
		end
		
		
		/*
		 * Parameters Latch
		 */
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;
	reg	[OPERAND_ADDR_WIDTH  :0]	p_num_words_latch;

		// save number of words in n when new operation starts
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
			n_num_words_latch <= n_num_words;
			
			
		/*
		 * Counters
		 *
		 * load_mult_cnt counts the words shifted into the current batch,
		 * load_syst_cnt counts the batches already written to the loader
		 * memories.
		 */
			
		// handy values
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};

		// a batch is complete when the low counter is all ones; the last batch
		// index is taken from the upper bits of the latched word count
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};	
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
	
		// counter
	reg	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt;
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt;
	
		// handy increment value and stop flag
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_next	= load_mult_cnt + 1'b1;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_next	= load_syst_cnt + 1'b1;

	wire										load_mult_cnt_done	= (load_mult_cnt == load_mult_cnt_last) ? 1'b1 : 1'b0;
	wire										load_syst_cnt_done	= (load_syst_cnt == load_syst_cnt_last) ? 1'b1 : 1'b0;
			
		
		/*
		 * Loader Count Logic
		 */
	always @(posedge clk) begin
		//
		case (fsm_state)
				// start of a load: clear both counters
			FSM_STATE_LOAD_START:	{load_syst_cnt, load_mult_cnt} <= {load_syst_cnt_zero, load_mult_cnt_zero};
			//
				// one more word shifted in (wraps to zero after the last word)
			FSM_STATE_LOAD_SHIFT:	load_mult_cnt <= load_mult_cnt_next;
				// one more batch written; hold the value on the last batch
			FSM_STATE_LOAD_WRITE:	load_syst_cnt <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
		endcase
		//
	end
			
				
		/*
		 * Wide Operand Loader
		 */
	
		/*
		 * The loader consists of SYSTOLIC_ARRAY_LENGTH small memories that
		 * share one write address, one write enable and one read address.
		 * Words of B arrive one per cycle and are collected in the shift
		 * register loader_din[]. Once a full batch has been shifted in
		 * (FSM_STATE_LOAD_SHIFT), all elements of loader_din[] are written
		 * simultaneously, one per memory, at address loader_addr_wr
		 * (FSM_STATE_LOAD_WRITE). The read side is addressed by the
		 * multiplier array via loader_addr_rd and returns a whole batch at
		 * once as the concatenated bus pe_b_wide.
		 */
	
	
		// loader input
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_wr;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_rd;
	reg										loader_wren;
	reg	[                 32-1:0]	loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
	
		// loader output
	wire	[                 32-1:0]	loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];
	
		// array_input
		// NOTE(review): pe_a_wide is declared but neither driven nor used in
		// this module; the array's pe_a_wide port is fed directly with
		// a_bram_out replicated SYSTOLIC_ARRAY_LENGTH times (see instance below)
	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	pe_a_wide;
	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	pe_b_wide;
			
		// generate parallelized loader		
	genvar i;
	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
		//
		begin : gen_bram_1rw_1ro_readfirst_loader
			//
			bram_1rw_1ro_readfirst #
			(
				.MEM_WIDTH		(32),
				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH)
			)
			bram_loader
			(
				.clk		(clk),
				.a_addr	(loader_addr_wr),
				.a_wr		(loader_wren),
				.a_in		(loader_din[i]),
				.a_out	(),
				.b_addr	(loader_addr_rd),
				.b_out	(loader_dout[i])
			);
			//
				// memory i supplies bits [32*i+31 : 32*i] of the wide bus
			assign pe_b_wide[32 * (i + 1) - 1 -: 32] = loader_dout[i];
			//
		end
		//
	endgenerate
			
				
		/*
		 * Block Memory Addresses
		 */
		
		/*
		 * Two address widths are used: operand memories are addressed with
		 * OPERAND_ADDR_WIDTH bits, while the internal memory bram_p has one
		 * extra address bit (the "_ext" signals), i.e. twice the depth.
		 * Presumably this is because the product is twice as long as the
		 * operands - TODO confirm against the multiplier array.
		 */
		
		// the very first addresses
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
	
		// the very last addresses
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {n_num_words_latch};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {n_num_words_latch, 1'b1};

		// address registers
		// NOTE(review): p_addr_ext_rd is never assigned in this module
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
	wire	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_wr;
	reg	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_rd;
		
		// handy increment values
		// (p_addr_ext_rd_next used to be derived from b_addr, which was a
		// copy-paste slip: it must follow its own address register)
	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next				= b_addr        + 1'b1;
	wire	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_rd_next	= p_addr_ext_rd + 1'b1;
	
		// write enables
	wire	p_wren;
	
		// data buses
	wire	[31: 0]	p_data_in;
	wire	[31: 0]	p_data_out;
	
		// handy stop flags
		// NOTE(review): neither flag is used in this module yet
	wire	b_addr_done        = (b_addr        == bram_addr_last)     ? 1'b1 : 1'b0;
	wire	p_addr_ext_rd_done = (p_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;

		// delayed addresses
		// (b_addr delayed by one clock cycle; the shift logic compares it with
		// the last valid address when it samples b_bram_out)
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
	
	always @(posedge clk) b_addr_dly <= b_addr;

				
		// map registers to top-level ports
	assign b_bram_addr = b_addr;


		// internal memory written by the multiplier array (port a); its read
		// port (b) is wired to p_addr_ext_rd / p_data_out
	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
	bram_p
	(	.clk(clk),
		.a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(),
		.b_addr(p_addr_ext_rd), .b_out(p_data_out)
	);

				
		/*
		 * Loader Data Input 
		 */
	integer j;
	
		// shift logic
	always @(posedge clk)
		//
		case (fsm_state)
			//
			FSM_STATE_LOAD_SHIFT: begin
		
						// update the rightmost part of loader buffer; words read
						// from beyond the last operand address are replaced by zeros
				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
				
						// shift the loader buffer to the left
				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_din[j-1] <= loader_din[j];
					
			end
			//			
		endcase


		/*
		 * Load Write Enable Logic
		 *
		 * Registered from the next state, so the write enable is high exactly
		 * while the FSM is in FSM_STATE_LOAD_WRITE.
		 */
	always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_LOAD_WRITE:	loader_wren <= 1'b1;
			default:						loader_wren <= 1'b0;
		endcase


		/*
		 * Loader Address Update Logic
		 *
		 * Mirrors the load_syst_cnt update: cleared at the start of a load,
		 * advanced after every batch write, held after the last one.
		 */

	always @(posedge clk)
		//
		case (fsm_state)
		
			FSM_STATE_LOAD_START:
				//
				loader_addr_wr <= load_syst_cnt_zero;
				
			FSM_STATE_LOAD_WRITE:
				//
				loader_addr_wr <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
					
		endcase
	

		/*
		 * Memory Address Control Logic
		 *
		 * Keyed on the next state: b_addr is cleared on the transition into
		 * FSM_STATE_LOAD_START and incremented on every transition into
		 * FSM_STATE_LOAD_SHIFT; it holds its value otherwise.
		 */
	always @(posedge clk) begin
		//
		case (fsm_next_state)
			FSM_STATE_LOAD_START:	b_addr <= bram_addr_zero;
			FSM_STATE_LOAD_SHIFT:	b_addr <= b_addr_next;
		endcase
		//
	end


		/*
		 * Multiplier Array
		 */
	reg	pe_array_ena;
	wire	pe_array_rdy;

		// enable is registered from the next state, so it is high only while
		// the FSM is in FSM_STATE_MULT_START
	always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_MULT_START:	pe_array_ena <= 1'b1;
			default:						pe_array_ena <= 1'b0;
		endcase
		
		// extended word count handed to the array: the latched count with a
		// one appended as the new least significant bit
	always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_MULT_START:	p_num_words_latch <= {n_num_words_latch, 1'b1};
		endcase
			
	
	modexpa7_systolic_multiplier_array #
	(
		.OPERAND_ADDR_WIDTH		(OPERAND_ADDR_WIDTH),
		.SYSTOLIC_ARRAY_POWER	(SYSTOLIC_ARRAY_POWER)
	)
	systolic_pe_array
	(
		.clk					(clk),
		.rst_n				(rst_n),

		.ena					(pe_array_ena),
		.rdy					(pe_array_rdy),

		.loader_addr_rd	(loader_addr_rd),
		
			// the same word of A is presented to every 32-bit lane
		.pe_a_wide			({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}),
		.pe_b_wide			(pe_b_wide),
		
		.a_bram_addr		(a_bram_addr),
		
		.p_bram_addr		(p_addr_ext_wr),
		.p_bram_in			(p_data_in),
		.p_bram_wr			(p_wren),

		.n_num_words		(n_num_words_latch),
		.p_num_words		(p_num_words_latch)
	);
	
		
			
		/*
		 * FSM Process
		 */
	always @(posedge clk or negedge rst_n)
		//
		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
		else						fsm_state <= fsm_next_state;
	
	
		/*
		 * FSM Transition Logic
		 *
		 * The default assignment sends any unlisted state value to
		 * FSM_STATE_STOP, from where the FSM returns to FSM_STATE_IDLE.
		 */
	always @* begin
		//
		fsm_next_state = FSM_STATE_STOP;
		//
		case (fsm_state)
			//
			FSM_STATE_IDLE:				if (ena_trig)					fsm_next_state = FSM_STATE_LOAD_START;
												else								fsm_next_state = FSM_STATE_IDLE;
			//
			FSM_STATE_LOAD_START:											fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_SHIFT:		if (load_mult_cnt_done)		fsm_next_state = FSM_STATE_LOAD_WRITE;
												else								fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_WRITE:		if (load_syst_cnt_done)		fsm_next_state = FSM_STATE_LOAD_FINAL;
												else								fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_FINAL:											fsm_next_state = FSM_STATE_MULT_START;
			//
			FSM_STATE_MULT_START:											fsm_next_state = FSM_STATE_MULT_CRUNCH;
			FSM_STATE_MULT_CRUNCH:		if (pe_array_rdy)				fsm_next_state = FSM_STATE_MULT_FINAL;
												else								fsm_next_state = FSM_STATE_MULT_CRUNCH;
			FSM_STATE_MULT_FINAL:											fsm_next_state = FSM_STATE_STOP;
			//
			FSM_STATE_STOP:													fsm_next_state = FSM_STATE_IDLE;
			//
		endcase
		//
	end


endmodule

//======================================================================
// End of file
//======================================================================