aboutsummaryrefslogblamecommitdiff
path: root/src/rtl/modexpa7_systolic_multiplier.v
blob: a1e141ed65ca2bcc43549d1d5e0d64157559c656 (plain) (tree)



























































                                                                                                              




                                                                          
 



                                                                                 
















































































                                                                                                          
                         
                         
                   
                            
                    
                         
















                                                                                                                                                                      
                         
                 













                                                                                                                            
                                 







































                                                                                                                               
                    
                         
                                 
































                                                                                                                     
 
                                 


















                                                                                                                                
                            

                                                 
 








                                              
                                   







                                                                         
                        
 
 
                   
                                               
                    















                                                                                                                   
         
 
                   
                                                
                    




                                                                         
                        

                   
 














































                                                                                                                                                                                                      
//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Systolic Montgomery multiplier (work in progress).
//
// What this revision of the module actually does:
//
//   * waits in IDLE for a rising edge on 'ena', latches 'n_num_words' and
//     drops 'rdy';
//   * runs the operand loader: words of B are read sequentially through the
//     b_bram port, collected in a SYSTOLIC_ARRAY_LENGTH-word shift buffer and
//     written in parallel into SYSTOLIC_ARRAY_LENGTH small block memories;
//   * passes through STOP back to IDLE and raises 'rdy' again.
//
// The MULT_* states are declared, but the transition logic never enters them,
// and the A / N / N_COEFF / R memory ports are not used by any logic yet
// (the corresponding outputs are tied to zero below).
//
module modexpa7_systolic_multiplier #
	(
			//
			// This sets the address widths of memory buffers. Internal data
			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
			// 2 ** 6 = 64.
			//
		parameter	OPERAND_ADDR_WIDTH		= 4,
		
			//
			// Width of the loader's shift counter (load_mult_cnt): the loader
			// does 2 ** SYSTOLIC_ARRAY_POWER shifts between two parallel writes.
			// The same number of low bits is dropped from n_num_words to get the
			// last address of the loader memories.
			//
			// NOTE(review): SYSTOLIC_ARRAY_LENGTH and SYSTOLIC_CNTR_WIDTH are
			// not declared in this file, they presumably come from
			// modexpa7_settings.v as 2 ** SYSTOLIC_ARRAY_POWER and
			// OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER -- confirm there.
			//
		parameter	SYSTOLIC_ARRAY_POWER		= 2
	)
	(
		input											clk,
		input											rst_n,				// asynchronous, active-low

		input											ena,					// operation starts on rising edge
		output										rdy,					// high when idle / finished

		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,

		input		[                32-1:0]	a_bram_out,
		input		[                32-1:0]	b_bram_out,
		input		[                32-1:0]	n_bram_out,
		input		[                32-1:0]	n_coeff_bram_out,

		output	[                32-1:0]	r_bram_in,
		output										r_bram_wr,

			// used as the index of the last operand word (see bram_addr_last)
		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
	);
	
		
		/*
		 * Include Settings
		 */
	`include "pe/modexpa7_primitive_switch.v"
	`include "modexpa7_settings.v"
		

		/*
		 * FSM Declaration
		 */
	localparam	[ 7: 0]	FSM_STATE_IDLE				= 8'h00;
	
	localparam	[ 7: 0]	FSM_STATE_LOAD_START		= 8'h11;
	localparam	[ 7: 0]	FSM_STATE_LOAD_SHIFT		= 8'h12;
	localparam	[ 7: 0]	FSM_STATE_LOAD_WRITE		= 8'h13;
	localparam	[ 7: 0]	FSM_STATE_LOAD_FINAL		= 8'h14;

		// declared for the multiplication phase, not reachable yet
	localparam	[ 7: 0]	FSM_STATE_MULT_START		= 8'h21;
	localparam	[ 7: 0]	FSM_STATE_MULT_CRUNCH	= 8'h22;
	localparam	[ 7: 0]	FSM_STATE_MULT_FINAL		= 8'h23;
	
	localparam	[ 7: 0]	FSM_STATE_STOP				= 8'hFF;
	
		/*
		 * FSM State / Next State
		 */
	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
	reg	[ 7: 0]	fsm_next_state;


		/*
		 * Enable Delay and Trigger
		 */
	reg ena_dly = 1'b0;
	
		// delay enable by one clock cycle
	always @(posedge clk) ena_dly <= ena;

		// trigger new operation when enable goes high
	wire ena_trig = ena && !ena_dly;
	
	
		/*
		 * Ready Flag Logic
		 */
	reg rdy_reg = 1'b1;
	assign rdy = rdy_reg;

	always @(posedge clk or negedge rst_n)
		
			// reset flag
		if (rst_n == 1'b0) rdy_reg <= 1'b1;
		else begin
		
				// clear flag when operation is started
			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
			
				// set flag after operation is finished
			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;			
			
		end
		
		
		/*
		 * Parameters Latch
		 */
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;

		// save number of words in n when new operation starts, so that
		// changes of the input during operation are ignored
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
			n_num_words_latch <= n_num_words;
			
			
		/*
		 * Counters
		 */
		 
		/*
		 * load_mult_cnt counts shifts of the loader buffer inside one group,
		 * load_syst_cnt counts groups, i.e. addresses of the loader memories.
		 */
			
		// handy values
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};

		// the last group index is the upper part of the latched word count
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};	
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
	
		// counter
	reg	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt;
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt;
	
		// handy increment value and stop flag
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_next	= load_mult_cnt + 1'b1;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_next	= load_syst_cnt + 1'b1;

	wire										load_mult_cnt_done	= (load_mult_cnt == load_mult_cnt_last) ? 1'b1 : 1'b0;
	wire										load_syst_cnt_done	= (load_syst_cnt == load_syst_cnt_last) ? 1'b1 : 1'b0;
			
		
		/*
		 * Loader Count Logic
		 */
	always @(posedge clk) begin
		//
		case (fsm_state)
				// clear both counters before the first group
			FSM_STATE_LOAD_START:	{load_syst_cnt, load_mult_cnt} <= {load_syst_cnt_zero, load_mult_cnt_zero};
			//
				// the shift counter wraps to zero after the last shift of a
				// group, so it doesn't have to be cleared between groups
			FSM_STATE_LOAD_SHIFT:	load_mult_cnt <= load_mult_cnt_next;
				// the group counter stops at the last group
			FSM_STATE_LOAD_WRITE:	load_syst_cnt <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
		endcase
		//
	end
			
				
		/*
		 * Wide Operand Loader
		 */
	
		/*
		 * The loader consists of SYSTOLIC_ARRAY_LENGTH independent memories,
		 * each of them is 32 bits wide and has SYSTOLIC_CNTR_WIDTH address
		 * bits. Operand words arrive one at a time and are collected in the
		 * loader_din[] shift buffer. After 2 ** SYSTOLIC_ARRAY_POWER shifts
		 * the whole buffer is written into all the memories at once using
		 * the same address, then the address is incremented and the next
		 * group of words is collected.
		 */
	
	
		// loader input
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr[0:SYSTOLIC_ARRAY_LENGTH-1];
	reg										loader_wren[0:SYSTOLIC_ARRAY_LENGTH-1];
	reg	[                 32-1:0]	loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
	
		// loader output (not consumed by any logic in this module yet)
	wire	[                 32-1:0]	loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];
			
		// generate parallelized loader		
	genvar i;
	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
		//
		begin : gen_bram_1rw_readfirst_loader
			//
			bram_1rw_readfirst #
			(
				.MEM_WIDTH		(32),
				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH)
			)
			bram_loader
			(
				.clk		(clk),
				.a_addr	(loader_addr[i]),
				.a_wr		(loader_wren[i]),
				.a_in		(loader_din[i]),
				.a_out	(loader_dout[i])
			);
			//
		end
		//
	endgenerate
			
				
		/*
		 * Block Memory Addresses
		 */
		
		/*
		 * Two sets of boundary addresses are declared: the normal ones are
		 * OPERAND_ADDR_WIDTH bits wide, the "extended" ones have one more bit
		 * and thus span twice as many words. Only the normal ones are used
		 * by the loader, the extended ones are not referenced yet.
		 */
		
		// the very first addresses
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
	
		// the very last addresses
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {n_num_words_latch};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {n_num_words_latch, 1'b1};

		// address registers
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
		
		// handy increment values
	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next		= b_addr       + 1'b1;
	
		// handy stop flags (not referenced by the loader)
	wire	b_addr_done			= (b_addr      == bram_addr_last)     ? 1'b1 : 1'b0;

		// delayed addresses
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
		
	always @(posedge clk) b_addr_dly <= b_addr;

				
		// map registers to top-level ports
	assign b_bram_addr = b_addr;
	
		// the following outputs are not driven by any logic in this module
		// yet, tie them to constant zero instead of leaving them floating (in
		// particular, r_bram_wr must never look asserted to the result memory)
	assign a_bram_addr			= {OPERAND_ADDR_WIDTH{1'b0}};
	assign n_bram_addr			= {OPERAND_ADDR_WIDTH{1'b0}};
	assign n_coeff_bram_addr	= {OPERAND_ADDR_WIDTH{1'b0}};
	assign r_bram_addr			= {OPERAND_ADDR_WIDTH{1'b0}};
	assign r_bram_in				= {32{1'b0}};
	assign r_bram_wr				= 1'b0;

				
		/*
		 * Loader Data Input 
		 */
		 
		// every clocked process below has its own loop index, a loop
		// variable must not be shared between concurrent always blocks
	integer j_din;
	integer j_wren;
	integer j_addr;
	
		// shift logic
	always @(posedge clk)
		//
		case (fsm_state)
			//
			FSM_STATE_LOAD_SHIFT: begin
		
						// update the rightmost part of loader buffer, words beyond
						// the last operand word are replaced with zeroes
						// NOTE(review): the comparison uses the delayed address,
						// this presumably matches one cycle of read latency of
						// the external memory -- confirm
				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
				
						// shift the loader buffer to the left
				for (j_din=1; j_din<SYSTOLIC_ARRAY_LENGTH; j_din=j_din+1)
					loader_din[j_din-1] <= loader_din[j_din];
					
			end
			//			
		endcase


		/*
		 * Load Write Enable Logic
		 */
		 
		// decoded from the next state, so the registered write enables are
		// high exactly while the FSM is in LOAD_WRITE
	always @(posedge clk)
		//
		case (fsm_next_state)
		
			FSM_STATE_LOAD_WRITE:
				//
				for (j_wren=0; j_wren<SYSTOLIC_ARRAY_LENGTH; j_wren=j_wren+1)
					loader_wren[j_wren] <= 1'b1;
					
			default:
				//
				for (j_wren=0; j_wren<SYSTOLIC_ARRAY_LENGTH; j_wren=j_wren+1)
					loader_wren[j_wren] <= 1'b0;
					
		endcase


		/*
		 * Loader Address Update Logic
		 */

		// all the loader memories share the same address, it follows
		// load_syst_cnt (both are updated in the same states with the same
		// values)
	always @(posedge clk)
		//
		case (fsm_state)
		
			FSM_STATE_LOAD_START:
				//
				for (j_addr=0; j_addr<SYSTOLIC_ARRAY_LENGTH; j_addr=j_addr+1)
					loader_addr[j_addr] <= load_syst_cnt_zero;
					
			FSM_STATE_LOAD_WRITE:
				//
				for (j_addr=0; j_addr<SYSTOLIC_ARRAY_LENGTH; j_addr=j_addr+1)
					loader_addr[j_addr] <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
					
		endcase
	

		/*
		 * Memory Address Control Logic
		 */
		 
		// decoded from the next state: the read address is cleared on entry
		// to LOAD_START and advances whenever the FSM is about to be in
		// LOAD_SHIFT, otherwise it is held
	always @(posedge clk) begin
		//
		case (fsm_next_state)
			FSM_STATE_LOAD_START:	b_addr <= bram_addr_zero;
			FSM_STATE_LOAD_SHIFT:	b_addr <= b_addr_next;
		endcase
		//
	end


		
			
		/*
		 * FSM Process
		 */
	always @(posedge clk or negedge rst_n)
		//
		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
		else						fsm_state <= fsm_next_state;
	
	
		/*
		 * FSM Transition Logic
		 */
	always @* begin
		//
			// default assignment, it prevents latch inference and sends the
			// FSM from any state not listed below (including the MULT_* states)
			// to STOP
		fsm_next_state = FSM_STATE_STOP;
		//
		case (fsm_state)
			//
			FSM_STATE_IDLE:				if (ena_trig)					fsm_next_state = FSM_STATE_LOAD_START;
												else								fsm_next_state = FSM_STATE_IDLE;
			//
			FSM_STATE_LOAD_START:											fsm_next_state = FSM_STATE_LOAD_SHIFT;
				// keep shifting until the group is complete
			FSM_STATE_LOAD_SHIFT:		if (load_mult_cnt_done)		fsm_next_state = FSM_STATE_LOAD_WRITE;
												else								fsm_next_state = FSM_STATE_LOAD_SHIFT;
				// start the next group unless the last one was just written
			FSM_STATE_LOAD_WRITE:		if (load_syst_cnt_done)		fsm_next_state = FSM_STATE_LOAD_FINAL;
												else								fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_FINAL:											fsm_next_state = FSM_STATE_STOP;
			//
			//FSM_STATE_MULT_START:
			//FSM_STATE_MULT_CRUNCH:
			//FSM_STATE_MULT_FINAL:
			//
			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
			//
		endcase
		//
	end


endmodule

//======================================================================
// End of file
//======================================================================