aboutsummaryrefslogblamecommitdiff
path: root/src/rtl/modexpa7_n_coeff.v
blob: c2d7c9d6336377610334ffaba7a82a91cb761b06 (plain) (tree)








































                                                                                         

                                                                                              
                           
                                                       
          
                                                                                                                                                                    
 
                                                                                                                                                                        
 
                                                                                                                      
 
                                                                                                                                 
 
                                                                                                                                                                           
 
                                                                                                                                      















                                                                 
                                                                 




                                                                 
 
                                                                 


                                          


                                                    

                                            
                       
                                                      
                                         


                                                                  
         


















                                                                                                         




                                                                   
                                                                               



                                                          
                 

                                  






                                                                                   
                  
                                                         
                 

                                                                                                        
 
                                 
                                                                        



                                                        
                                                                                                           
                                
                         
                   
                                        
                   




                                                                                             



                                  













                                                                                                  
                                                        
                                                        
                                                         

                                                              

                                   
                                   


                                         

                                    
                                    

                                     
         





                              
                 
                                                                                                  
 
                                                                                 
                                                                                                  
                                                                                 
                                                                                                                 
















                                                                                                               
         







                                                                                                          
         
                                                                











                                                                              

                   
                          
                   









                                                                                                                       
         
                                                            
 







                                                                                                    
                                   
                                                                                                   
                                                                                                   





                                                                                                           
         
                   








                                                                                
                         
                                   
                                       




                                                      




                              




                                                                    




                                  
                 
                                         








                                                


                                                         
                 





                                                                     
                              

                                                                                   
 




















































                                                                                                                
         

                                       
                 






                                                                                     
         



















                                                                                               
         



                                                                                                        
                              

                                                                   
         








                                                               
         


























































                                                                                                                                                                        
                                    
                     
                                      
                                                                                  
                           


                                                                                                       
                        
                     


                                                                                  



                                                                                  
                        
                     
                                      
                                                                                  
                                                                               












































                                                                                                                 



                        

                                                   
                                    
                     
                   
                                      
                                          


                                                                        

                                                                                        
                     
                                                         
                                                                                                             
                                                                                        

                     
                                                         



























                                                                                                              
                                          
                                                                                              
                        
                   
            


                                            
                                    
                     
                                          




                                                                                                                
                        
                     
                                          


                                                                                                       
                        







                                                                                                 
                                          
                                                                                                  
                        

                      
                                      
                                                                                                  
                        
                      
                                      
                                                                                                         
                        
                           

                                          
                                                                                                                      
                        
                   
            
                   
                               





                                                                                             

                                        








                                                                                                                                                                         
                                                                                                                                                           
                                                                                                                                                           
                                                                                                                                                           


                                                                                                                                                                           
                                                                                                                                                           
                                                                                                                                                           


                                                                                                                                                                           
                                                                                                                                                           
                                                                                                                                                           
                                                                                                                                                           
                                                                                                                   
                                                                                                                                                                           
                                                                                                                                                                           
                         









                                                                                                                                                 
//======================================================================
//
// modexpa7_n_coeff.v
// -----------------------------------------------------------------------------
// Montgomery modulus-dependent coefficient calculation block.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Interface summary (as implemented further down in this file):
//
//   * A rising edge on `ena`, seen while the FSM is idle, clears `rdy`;
//     `rdy` is set again once the FSM reaches its STOP state and is also
//     forced high by the asynchronous active-low reset.
//   * The modulus is read through n_bram_addr / n_bram_out.
//   * The result is written through n_coeff_bram_addr / n_coeff_bram_in /
//     n_coeff_bram_wr.
//   * n_num_words is sampled into an internal latch when a new operation
//     starts.
//
module modexpa7_n_coeff #
	(
			//
			// This sets the address widths of memory buffers. Internal data
			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
			// 2 ** 6 = 64.
			//
		parameter	OPERAND_ADDR_WIDTH = 6
	)
	(
		input											clk,						// clock
		input											rst_n,					// active-low reset

		input											ena,						// enable input
		output										rdy,						// ready output

		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,			// modulus memory address
		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,	// modulus coefficient memory address

		input		[                32-1:0]	n_bram_out,				// modulus memory output

		output	[                32-1:0]	n_coeff_bram_in,		// modulus coefficient memory input
		output										n_coeff_bram_wr,		// modulus coefficient memory write enable

		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words				// number of words in modulus
	);
	
		//
		// FSM Declaration
		//
	localparam	[7:0]
			/* nothing in progress */
		FSM_STATE_IDLE		= 8'h00,
			/* INIT_* group */
		FSM_STATE_INIT_1	= 8'hA1,
		FSM_STATE_INIT_2	= 8'hA2,
		FSM_STATE_INIT_3	= 8'hA3,
		FSM_STATE_INIT_4	= 8'hA4,
		FSM_STATE_INIT_5	= 8'hA5,
			/* CALC_* group */
		FSM_STATE_CALC_1	= 8'hB1,
		FSM_STATE_CALC_2	= 8'hB2,
		FSM_STATE_CALC_3	= 8'hB3,
		FSM_STATE_CALC_4	= 8'hB4,
		FSM_STATE_CALC_5	= 8'hB5,
			/* SAVE_* group */
		FSM_STATE_SAVE_1	= 8'hC1,
		FSM_STATE_SAVE_2	= 8'hC2,
		FSM_STATE_SAVE_3	= 8'hC3,
		FSM_STATE_SAVE_4	= 8'hC4,
		FSM_STATE_SAVE_5	= 8'hC5,
			/* operation finished */
		FSM_STATE_STOP		= 8'hFF;


		//
		// Current state (initialised to IDLE) and the state that will be
		// entered on the next clock edge
		//
	reg	[7:0]	fsm_state = FSM_STATE_IDLE;
	reg	[7:0]	fsm_next_state;


		//
		// Enable Delay and Trigger
		//
   reg ena_dly = 1'b0;		// enable input as seen one clock cycle ago

		/* keep a one-cycle-old copy of the enable input */
   always @(posedge clk) begin
		ena_dly <= ena;
	end

		/* high only while enable is set and its delayed copy is still clear */
   wire ena_trig;
   assign ena_trig = ena & ~ena_dly;
	
	
		//
		// Ready Flag Logic
		//
	reg rdy_reg = 1'b1;		// ready after power-up
	assign rdy = rdy_reg;

   always @(posedge clk or negedge rst_n) begin
		//
		if (!rst_n)
				/* asynchronous reset forces the flag high */
			rdy_reg <= 1'b1;
		else
			//
			case (fsm_state)
					/* idle: flag drops in the cycle a start trigger is seen */
				FSM_STATE_IDLE:	rdy_reg <= ~ena_trig;
					/* stop: operation is over, raise the flag again */
				FSM_STATE_STOP:	rdy_reg <= 1'b1;
			endcase
		//
	end
		
		
		//
		// Parameters Latch
		//
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;

		/* capture n_num_words on the clock edge that takes the FSM into INIT_1 */
	always @(posedge clk) begin
		//
		if (fsm_next_state == FSM_STATE_INIT_1) begin
			n_num_words_latch <= n_num_words;
		end
		//
	end

		
		//
		// Cycle Counters
		//
		
		/*
		 * Maybe we can cheat and skip calculation of entire T every time.
		 * During the first 32 cycles we only need the first word of T,
		 * during the following 64 cycles the second word, etc. Needs
		 * further investigation...
		 *
		 */
		 
		/*
		 * Cycle counter: OPERAND_ADDR_WIDTH word-index bits on top of
		 * 5 bit-index bits (32 bits per word).
		 */
	reg	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt;
		
	wire	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt_zero = {{OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};

		/*
		 * The terminal count is derived from the latched word count, i.e.
		 * from the same value that bounds all the memory addresses, so the
		 * number of cycles cannot change if the n_num_words input is
		 * modified while an operation is in progress.
		 */
	wire	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt_last = {n_num_words_latch, 5'b11110};
	wire	[OPERAND_ADDR_WIDTH+4:0]	cyc_cnt_next = cyc_cnt + 1'b1;

		/* handy flag: the counter has reached its terminal value */
	wire	cyc_cnt_done = (cyc_cnt == cyc_cnt_last) ? 1'b1 : 1'b0;

		/* the counter only changes on the clock edge that enters CALC_1 */
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_CALC_1)
			//
			case (fsm_state)
					/* coming from initialisation: start from zero */
				FSM_STATE_INIT_5:	cyc_cnt <= cyc_cnt_zero;
					/* coming from the save phase: advance, saturating at the terminal count */
				FSM_STATE_SAVE_5:	cyc_cnt <= !cyc_cnt_done ? cyc_cnt_next : cyc_cnt;
					/* any other source state: hold */
				default:				;
			endcase
			
			
		//
		// Handy Address Values
		//
		
		/* lowest buffer address (all zeroes) */
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero;
	assign bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
	
		/* highest buffer address in use, taken from the latched n_num_words */
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last;
	assign bram_addr_last = n_num_words_latch;
		
		
		//
		// Block Memories
		//
		
		/*
		 * This module uses 8 block memories:
		 *
		 * N       - external input, stores modulus
		 * R       - internal, stores intermediate result
		 * B       - internal, stores current bit mask (see high-level algorithm)
		 * T       - internal, stores the product R * NN (see high-level algorithm)
		 * NN      - internal, stores the quantity ~N + 1 (see high-level algorithm)
		 * RR      - internal, stores a copy of R (see high-level algorithm)
		 * RB      - internal, stores the sum R + B (see high-level algorithm)
		 * N_COEFF - external output, stores the calculated modulus-dependent coefficient
		 *
		 */
		
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	t_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	nn_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	rr_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	rb_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
	
	reg	[31: 0]	r_data_in;
	reg	[31: 0]	b_data_in;
	reg	[31: 0]	t_data_in;
	reg	[31: 0]	nn_data_in;
	reg	[31: 0]	rr_data_in;
	reg	[31: 0]	rb_data_in;
	reg	[31: 0]	n_coeff_data_in;
	
	wire	[31: 0]	r_data_out;
	wire	[31: 0]	b_data_out;
	wire	[31: 0]	t_data_out;
	wire	[31: 0]	nn_data_out;
	wire	[31: 0]	rr_data_out;
	wire	[31: 0]	rb_data_out;
	
	reg	r_wren;
	reg	b_wren;
	reg	t_wren;
	reg	nn_wren;
	reg	rr_wren;
	reg	rb_wren;
	reg	n_coeff_wren;
		
	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out));

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_b (.clk(clk), .a_addr(b_addr), .a_wr(b_wren), .a_in(b_data_in), .a_out(b_data_out));

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_nn (.clk(clk), .a_addr(nn_addr), .a_wr(nn_wren), .a_in(nn_data_in), .a_out(nn_data_out));		

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_t (.clk(clk), .a_addr(t_addr), .a_wr(t_wren), .a_in(t_data_in), .a_out(t_data_out));

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_rb (.clk(clk), .a_addr(rb_addr), .a_wr(rb_wren), .a_in(rb_data_in), .a_out(rb_data_out));

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_rr (.clk(clk), .a_addr(rr_addr), .a_wr(rr_wren), .a_in(rr_data_in), .a_out(rr_data_out));
			
		/* every address plus one (wraps at the address width) */
	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next, n_coeff_addr_next;
	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next, b_addr_next, t_addr_next;
	wire	[OPERAND_ADDR_WIDTH-1:0]	nn_addr_next, rr_addr_next, rb_addr_next;

	assign n_addr_next			= n_addr       + 1'b1;
	assign r_addr_next			= r_addr       + 1'b1;
	assign b_addr_next			= b_addr       + 1'b1;
	assign t_addr_next			= t_addr       + 1'b1;
	assign nn_addr_next			= nn_addr      + 1'b1;
	assign rr_addr_next			= rr_addr      + 1'b1;
	assign rb_addr_next			= rb_addr      + 1'b1;
	assign n_coeff_addr_next	= n_coeff_addr + 1'b1;
	
		/* set while the corresponding address equals the last address */
	wire	n_addr_done				= (n_addr       == bram_addr_last);
	wire	r_addr_done				= (r_addr       == bram_addr_last);
	wire	b_addr_done				= (b_addr       == bram_addr_last);
	wire	t_addr_done				= (t_addr       == bram_addr_last);
	wire	nn_addr_done			= (nn_addr      == bram_addr_last);
	wire	rr_addr_done			= (rr_addr      == bram_addr_last);
	wire	rb_addr_done			= (rb_addr      == bram_addr_last);
	wire	n_coeff_addr_done		= (n_coeff_addr == bram_addr_last);
	
		/* drive the external memory ports from the internal registers */
	assign n_bram_addr			= n_addr;
	assign n_coeff_bram_addr	= n_coeff_addr;
	assign n_coeff_bram_in		= n_coeff_data_in;
	assign n_coeff_bram_wr		= n_coeff_wren;


		//
		// Delayed Flags
		//
	reg	rb_addr_done_dly;
	
		/* rb_addr_done as it was one clock cycle earlier */
	always @(posedge clk) begin
		rb_addr_done_dly <= rb_addr_done;
	end
	
	
		//
		// Adder1
		//
		
		/*
		 * This adder is used to calculate NN = ~N + 1.
		 *
		 */
	wire	[31:0]	add1_s;					// adder sum
	wire				add1_c_out;				// adder carry out
	wire				add1_c_in;				// adder carry in (masked feedback of add1_c_out)
	reg				add1_b_lsb;				// bit 0 of the B operand, upper bits are tied to zero
	reg				add1_c_in_mask;		// blocks the carry feedback while set
	reg				add1_c_in_mask_dly;	// add1_c_in_mask delayed by one clock cycle
	
		/* carry out is fed back as carry in unless the mask is set */
	assign add1_c_in = ~add1_c_in_mask & add1_c_out;

		/*
		 * Both the "+1" operand bit and the carry mask are asserted for the
		 * cycle following a transition into INIT_2, i.e. for the very first
		 * word of N; the delayed mask copy is used later.
		 */
	always @(posedge clk) begin
		add1_b_lsb         <= (fsm_next_state == FSM_STATE_INIT_2);
		add1_c_in_mask     <= (fsm_next_state == FSM_STATE_INIT_2);
		add1_c_in_mask_dly <= add1_c_in_mask;
	end
	
	modexpa7_adder32 add1_inst
	(
		.clk		(clk),
		.ce		(1'b1),								// always enabled
		.a			(~n_bram_out),						// ~N
		.b			({{31{1'b0}}, add1_b_lsb}),	// 1 or 0
		.c_in		(add1_c_in),
		.s			(add1_s),
		.c_out	(add1_c_out)
	);
	
	
		//
		// Adder2
		//
		
		/*
		 * This adder is used to calculate RB = R + B.
		 *
		 */
	wire	[31: 0]	add2_s;			// sum output
	reg				add2_c_in;		// carry input
	wire				add2_c_out;		// carry output
			
		/*
		 * Operand A is the read output of the R memory. Operand B is taken
		 * from b_data_in, i.e. from the register that holds the word being
		 * written into the B memory, not from the B memory read output.
		 * The carry input is a register (add2_c_in) that is loaded elsewhere
		 * in this module.
		 */
	modexpa7_adder32 add2_inst
	(
		.clk		(clk),
		.ce		(1'b1),				// always enabled
		.a			(r_data_out),		// R
		.b			(b_data_in),		// B (write-side register)
		.c_in		(add2_c_in),
		.s			(add2_s),
		.c_out	(add2_c_out)
	);


		//
		// Multiplier
		//
		
		/*
		 * This multiplier is used to calculate T = R * NN.
		 *
		 */
		 
	reg	[31: 0]	pe_a;
	reg	[31: 0]	pe_b;
	reg	[31: 0]	pe_t;
	reg	[31: 0]	pe_c_in;
	wire	[31: 0]	pe_p;
	wire	[31: 0]	pe_c_out;
		
	modexpa7_systolic_pe pe_mul_inst
	(
		.clk		(clk),
		.a			(pe_a),
		.b			(pe_b),
		.t			(pe_t),
		.c_in		(pe_c_in),
		.p			(pe_p),
		.c_out	(pe_c_out)
	);


		//
		// Multiplier Latency Compensation Logic
		//
		
	localparam SYSTOLIC_PE_LATENCY = 4;
	
		/*
		 * Shift register with SYSTOLIC_PE_LATENCY+1 stages, used to match
		 * the data propagation delay; its top bit is the "done" flag.
		 */
	reg [SYSTOLIC_PE_LATENCY:0] pe_latency;

	wire pe_latency_done;
	assign pe_latency_done = pe_latency[SYSTOLIC_PE_LATENCY];
	
		/* cleared while in CALC_1, otherwise a one is shifted in every cycle */
	always @(posedge clk) begin
		//
		if (fsm_state == FSM_STATE_CALC_1)
			pe_latency <= {(SYSTOLIC_PE_LATENCY+1){1'b0}};
		else
			pe_latency <= {pe_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b1};
		//
	end


		//
		// Adder2 Output Delay
		//
	reg	[31: 0]	add2_s_dly[1:SYSTOLIC_PE_LATENCY-1];
	reg				add2_c_out_dly[1:SYSTOLIC_PE_LATENCY+2];	

		/*
		 * Module-level loop index. The two delay lines below no longer use
		 * it, but the declaration is retained for any other block of this
		 * module that still references it.
		 */
	integer i;

		/*
		 * Delay the adder sum by SYSTOLIC_PE_LATENCY-1 clock cycles.
		 *
		 * The first stage is loaded explicitly, so the loop starts at the
		 * second stage and never touches the non-existent element [0].
		 * The loop index is local to the named block and therefore is not
		 * shared with any other always block.
		 */
	always @(posedge clk) begin : add2_s_delay_line
		//
		integer k;
		//
		add2_s_dly[1] <= add2_s;
		//
		for (k=2; k<SYSTOLIC_PE_LATENCY; k=k+1)
			add2_s_dly[k] <= add2_s_dly[k-1];
		//
	end
		
		/*
		 * Delay the adder carry by SYSTOLIC_PE_LATENCY+2 clock cycles,
		 * same structure as the sum delay line.
		 */
	always @(posedge clk) begin : add2_c_out_delay_line
		//
		integer k;
		//
		add2_c_out_dly[1] <= add2_c_out;
		//
		for (k=2; k<=(SYSTOLIC_PE_LATENCY+2); k=k+1)
			add2_c_out_dly[k] <= add2_c_out_dly[k-1];
		//
	end

		/*
		 * Carry input register of adder2. It is reloaded on entry to CALC_3
		 * while NN points at its first word: zero for the first word of R,
		 * the fully delayed carry output for all the other words.
		 */
	always @(posedge clk) begin
		//
		if (fsm_next_state == FSM_STATE_CALC_3) begin
			//
			if (nn_addr == bram_addr_zero) begin
				add2_c_in <= (r_addr == bram_addr_zero) ?
					1'b0 : add2_c_out_dly[SYSTOLIC_PE_LATENCY+2];
			end
			//
		end
		//
	end
			
		//
		// Multiplier Output Delay
		//
	reg	[31: 0]	pe_c_out_dly[1:3];

		/*
		 * Three-stage delay line for the multiplier carry output.
		 *
		 * The first stage is loaded explicitly, so the loop never touches
		 * the non-existent element [0]; the loop index is local to the named
		 * block and is not shared with any other always block.
		 */
	always @(posedge clk) begin : pe_c_out_delay_line
		//
		integer k;
		//
		pe_c_out_dly[1] <= pe_c_out;
		//
		for (k=2; k<=3; k=k+1)
			pe_c_out_dly[k] <= pe_c_out_dly[k-1];
		//
	end


		//
		// Multiplier Operand Loader
		//
	always @(posedge clk) begin
		//
		// by default the multiplier inputs are "don't care" ...
		//
		pe_a    <= {32{1'bX}};
		pe_b    <= {32{1'bX}};
		pe_t    <= {32{1'bX}};
		pe_c_in <= {32{1'bX}};
		//
		// ... and get real operands only on entry to CALC_3
		//
		if (fsm_next_state == FSM_STATE_CALC_3) begin
				/* operand words come from R and NN */
			pe_a    <= r_data_out;
			pe_b    <= nn_data_out;
				/* nothing to accumulate while NN is at its first word */
			pe_t    <= (nn_addr == bram_addr_zero) ? {32{1'b0}} : t_data_out;
				/* no carry into the first word of R */
			pe_c_in <= (r_addr  == bram_addr_zero) ? {32{1'b0}} : pe_c_out_dly[3];
		end
		//
	end
	
	
		//
		// B Shift Carry Logic
		//
		
		/*
		 * B value is repeatedly shifted to the left, so we need carry logic
		 * to save the MSB of the current output word and feed into the LSB
		 * of the next input word.
		 *
		 */
		 
	reg	b_data_out_carry;		// bit carried from one word of B into the next
	
	always @(posedge clk) begin
		//
		if (fsm_next_state == FSM_STATE_CALC_2) begin
				/* nothing is carried into the very first word */
			if ((nn_addr == bram_addr_zero) && (b_addr == bram_addr_zero))
				b_data_out_carry <= 1'b0;
			//
		end else if (fsm_next_state == FSM_STATE_CALC_3) begin
				/* remember the MSB of the current word for the next one */
			if (nn_addr == bram_addr_zero)
				b_data_out_carry <= b_data_out[31];
			//
		end
		//
	end
		
		
		//
		// R Update Flag
		//
	reg	flag_update_r;
	
		/*
		 * Position of the target bit of T: the upper part of the next cycle
		 * counter value selects the word, the lower five bits select the bit.
		 */
	wire	[                   4:0]	flag_addr_bit;
	wire	[OPERAND_ADDR_WIDTH-1:0]	flag_addr_word;

	assign {flag_addr_word, flag_addr_bit} = cyc_cnt_next;
	
		/* sample the target bit while its word is being written into T */
	always @(posedge clk) begin
		//
		if (t_wren) begin
			if (t_addr == flag_addr_word)
				flag_update_r <= t_data_in[flag_addr_bit];
		end
		//
	end
	
	
		//
		// Block Memory Address Logic
		//

	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr_calc1;
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_calc1;
	reg	[OPERAND_ADDR_WIDTH-1:0]	t_addr_calc1;
	reg	[OPERAND_ADDR_WIDTH-1:0]	nn_addr_calc1;
	reg	[OPERAND_ADDR_WIDTH-1:0]	rr_addr_calc1;
	reg	[OPERAND_ADDR_WIDTH-1:0]	rb_addr_calc1;
	
		/* how to update R duing CALC_1 state */
	always @*
		//
		if (fsm_state == FSM_STATE_INIT_5)					r_addr_calc1 <= bram_addr_zero;
		else begin
			if (r_addr < (n_num_words_latch - nn_addr))	r_addr_calc1 <= r_addr_next;
			else														r_addr_calc1 <= bram_addr_zero;
		end

		/* how to update B, RR, RB during CALC_1 state */
	always @* begin
		//
		if ((fsm_state == FSM_STATE_INIT_5)	|| (fsm_state == FSM_STATE_SAVE_5)) begin
				/* a new pass starts: rewind all three pointers */
			b_addr_calc1  = bram_addr_zero;
			rr_addr_calc1 = bram_addr_zero;
			rb_addr_calc1 = bram_addr_zero;
			//
		end else if (nn_addr == bram_addr_zero) begin
				/* NN is at its first word: advance, but stop at the last address */
			b_addr_calc1  = b_addr_done  ? b_addr  : b_addr_next;
			rr_addr_calc1 = rr_addr_done ? rr_addr : rr_addr_next;
			rb_addr_calc1 = rb_addr_done ? rb_addr : rb_addr_next;
			//
		end else begin
				/* otherwise keep the pointers where they are */
			b_addr_calc1  = b_addr;
			rr_addr_calc1 = rr_addr;
			rb_addr_calc1 = rb_addr;
			//
		end
		//
	end

		/* how to update T during CALC_1 state */
	always @* begin
		//
		// normally T simply advances to its next word
		//
		t_addr_calc1 = t_addr_next;
		//
		if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5))
				/* a new pass starts: rewind */
			t_addr_calc1 = bram_addr_zero;
		else if (r_addr == (n_num_words_latch - nn_addr))
				/* R has reached its bound: jump to the word after the current NN index */
			t_addr_calc1 = nn_addr_next;
		//
	end

		/* how to update NN during CALC_1 state */
	always @* begin
		//
		if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5)) begin
				/* a new pass starts: rewind */
			nn_addr_calc1 = bram_addr_zero;
		end else if (r_addr == (n_num_words_latch - nn_addr)) begin
				/* R has reached its bound: move on to the next word of NN */
			nn_addr_calc1 = nn_addr_next;
		end else begin
				/* otherwise stay on the current word */
			nn_addr_calc1 = nn_addr;
		end
		//
	end


		//
		// Address Update Logic
		//
		/*
		 * All address registers are updated here, keyed on the state the FSM
		 * is about to enter (fsm_next_state). A register keeps its value in
		 * every state that is not listed in its case statement.
		 */
	always @(posedge clk) begin
		//
		// N
		//
		// rewound on entry to INIT_1, then advanced through INIT_2..INIT_5,
		// stopping at the last address
		//
		case (fsm_next_state)
			FSM_STATE_INIT_1:		n_addr <= bram_addr_zero;
			//
			FSM_STATE_INIT_2,
			FSM_STATE_INIT_3,
			FSM_STATE_INIT_4,
			FSM_STATE_INIT_5:		n_addr <= !n_addr_done ? n_addr_next : n_addr;
		endcase
		//
		// R
		//
		// rewound on entry to INIT_4 and SAVE_3, advanced unconditionally in
		// INIT_5 and SAVE_4/SAVE_5, driven by r_addr_calc1 in CALC_1
		//
		case (fsm_next_state)
			FSM_STATE_INIT_4:		r_addr <= bram_addr_zero;
			FSM_STATE_INIT_5:		r_addr <= r_addr_next;
			FSM_STATE_CALC_1:		r_addr <= r_addr_calc1;
			FSM_STATE_SAVE_3:		r_addr <= bram_addr_zero;
			//
			FSM_STATE_SAVE_4,
			FSM_STATE_SAVE_5:		r_addr <= r_addr_next;	
		endcase
		//
		// B
		//
		case (fsm_next_state)
			FSM_STATE_INIT_4:		b_addr <= bram_addr_zero;
			FSM_STATE_INIT_5:		b_addr <= b_addr_next;
			FSM_STATE_CALC_1:		b_addr <= b_addr_calc1;
		endcase
		//
		// T
		//
		// only ever changes on entry to CALC_1
		//
		case (fsm_next_state)			
			FSM_STATE_CALC_1:		t_addr <= t_addr_calc1;			
		endcase
		//
		// NN
		//
		case (fsm_next_state)
			FSM_STATE_INIT_4:		nn_addr <= bram_addr_zero;
			FSM_STATE_INIT_5:		nn_addr <= nn_addr_next;
			FSM_STATE_CALC_1:		nn_addr <= nn_addr_calc1;
		endcase
		//
		// RR
		//
		// rewound on entry to SAVE_1, then advanced through SAVE_2..SAVE_4,
		// stopping at the last address
		//
		case (fsm_next_state)			
			FSM_STATE_CALC_1:		rr_addr <= rr_addr_calc1;
			FSM_STATE_SAVE_1:		rr_addr <= bram_addr_zero;
			//
			FSM_STATE_SAVE_2,
			FSM_STATE_SAVE_3,
			FSM_STATE_SAVE_4:		rr_addr <= !rr_addr_done ? rr_addr_next : rr_addr;	
		endcase		
		//
		// RB
		//
		// same schedule as RR
		//
		case (fsm_next_state)			
			FSM_STATE_CALC_1:		rb_addr <= rb_addr_calc1;			
			FSM_STATE_SAVE_1:		rb_addr <= bram_addr_zero;
			//
			FSM_STATE_SAVE_2,
			FSM_STATE_SAVE_3,
			FSM_STATE_SAVE_4:		rb_addr <= !rb_addr_done ? rb_addr_next : rb_addr;
		endcase		
		//
		// N_COEFF
		//
		// NOTE(review): the increment is taken from r_addr_next, not from
		// n_coeff_addr_next. Both registers are rewound on entry to SAVE_3
		// and advanced on entry to SAVE_4/SAVE_5, so the two values
		// presumably coincide there -- confirm against the FSM transition
		// logic before changing this.
		//
		case (fsm_next_state)			
			FSM_STATE_SAVE_3:		n_coeff_addr <= bram_addr_zero;
			//
			FSM_STATE_SAVE_4,
			FSM_STATE_SAVE_5:		n_coeff_addr <= r_addr_next;
		endcase
		//
	end


		//
		// Block Memory Write Enable Logic
		//
	always @(posedge clk) begin
		//
		// every write enable is low unless the state entered on this clock
		// edge says otherwise (the later assignment wins)
		//
		r_wren       <= 1'b0;
		b_wren       <= 1'b0;
		t_wren       <= 1'b0;
		nn_wren      <= 1'b0;
		rr_wren      <= 1'b0;
		rb_wren      <= 1'b0;
		n_coeff_wren <= 1'b0;
		//
		case (fsm_next_state)
			//
			// INIT_4, INIT_5: R, B and NN are written
			//
			FSM_STATE_INIT_4,
			FSM_STATE_INIT_5: begin
				r_wren  <= 1'b1;
				b_wren  <= 1'b1;
				nn_wren <= 1'b1;
			end
			//
			// CALC_3: B is written only while NN is at its first word
			//
			FSM_STATE_CALC_3:
				b_wren <= (nn_addr == bram_addr_zero);
			//
			// CALC_5: T is always written, RR and RB only while NN is at
			// its first word
			//
			FSM_STATE_CALC_5: begin
				t_wren  <= 1'b1;
				rr_wren <= (nn_addr == bram_addr_zero);
				rb_wren <= (nn_addr == bram_addr_zero);
			end
			//
			// SAVE_3..SAVE_5: R is always written, N_COEFF only once the
			// cycle counter has reached its terminal value
			//
			FSM_STATE_SAVE_3,
			FSM_STATE_SAVE_4,
			FSM_STATE_SAVE_5: begin
				r_wren       <= 1'b1;
				n_coeff_wren <= cyc_cnt_done;
			end
			//
		endcase
		//
	end
	
	
		//
		// Block Memory Input Logic
		//
	always @(posedge clk) begin
		//
		// All write data buses are registered decodes of fsm_next_state.
		// Every bus is first defaulted to don't-care (X); the
		// state-centric case below then drives real data only in the
		// states listed (the last non-blocking assignment to a signal
		// wins).
		//
		r_data_in			<= {32{1'bX}};
		b_data_in			<= {32{1'bX}};
		t_data_in			<= {32{1'bX}};
		nn_data_in			<= {32{1'bX}};
		rr_data_in			<= {32{1'bX}};
		rb_data_in			<= {32{1'bX}};
		n_coeff_data_in	<= {32{1'bX}};
		//
		case (fsm_next_state)
			//
			// INIT_4, INIT_5: R and B receive the zero-extended
			// add1_c_in_mask_dly bit, NN receives add1_s
			//
			FSM_STATE_INIT_4,
			FSM_STATE_INIT_5: begin
				r_data_in	<= {{31{1'b0}}, add1_c_in_mask_dly};
				b_data_in	<= {{31{1'b0}}, add1_c_in_mask_dly};
				nn_data_in	<= add1_s;
			end
			//
			// CALC_3: while NN address is zero, B receives its own
			// output shifted left by one bit with b_data_out_carry
			// shifted in at the bottom; otherwise don't-care
			//
			FSM_STATE_CALC_3: begin
				b_data_in	<= (nn_addr == bram_addr_zero) ?
					{b_data_out[30:0], b_data_out_carry} : {32{1'bX}};
			end
			//
			// CALC_5: T receives pe_p, RR receives the R output,
			// RB receives the last stage of the add2_s delay line
			//
			FSM_STATE_CALC_5: begin
				t_data_in	<= pe_p;
				rr_data_in	<= r_data_out;
				rb_data_in	<= add2_s_dly[SYSTOLIC_PE_LATENCY-1];
			end
			//
			// SAVE_3..SAVE_5: R and N_COEFF both receive either the RB
			// or the RR output, selected by flag_update_r
			//
			FSM_STATE_SAVE_3,
			FSM_STATE_SAVE_4,
			FSM_STATE_SAVE_5: begin
				r_data_in			<= flag_update_r ? rb_data_out : rr_data_out;
				n_coeff_data_in	<= flag_update_r ? rb_data_out : rr_data_out;
			end
			//
		endcase
		//
	end

	
		//
		// FSM Process
		//
	always @(posedge clk or negedge rst_n) begin
		//
		// State register: asynchronous active-low reset forces IDLE,
		// otherwise the computed next state is loaded on every rising
		// clock edge.
		//
		if (!rst_n)
			fsm_state <= FSM_STATE_IDLE;
		else
			fsm_state <= fsm_next_state;
		//
	end
	
	
		//
		// FSM Transition Logic
		//
	always @* begin
		//
		// Fallback for any state value not listed below: go to STOP.
		// Assigning it up front also guarantees fsm_next_state is
		// driven on every path through this combinational block.
		//
		fsm_next_state = FSM_STATE_STOP;
		//
		case (fsm_state)
			//
			// IDLE: wait for ena_trig
			//
			FSM_STATE_IDLE: begin
				fsm_next_state = FSM_STATE_IDLE;
				if (ena_trig)
					fsm_next_state = FSM_STATE_INIT_1;
			end
			//
			// INIT_1..INIT_4 are single-cycle steps, INIT_5 repeats
			// until nn_addr_done
			//
			FSM_STATE_INIT_1:	fsm_next_state = FSM_STATE_INIT_2;
			FSM_STATE_INIT_2:	fsm_next_state = FSM_STATE_INIT_3;
			FSM_STATE_INIT_3:	fsm_next_state = FSM_STATE_INIT_4;
			FSM_STATE_INIT_4:	fsm_next_state = FSM_STATE_INIT_5;
			FSM_STATE_INIT_5: begin
				fsm_next_state = FSM_STATE_INIT_5;
				if (nn_addr_done)
					fsm_next_state = FSM_STATE_CALC_1;
			end
			//
			// CALC_1..CALC_3 are single-cycle steps, CALC_4 repeats
			// until pe_latency_done, CALC_5 loops back to CALC_1 until
			// nn_addr_done
			//
			FSM_STATE_CALC_1:	fsm_next_state = FSM_STATE_CALC_2;
			FSM_STATE_CALC_2:	fsm_next_state = FSM_STATE_CALC_3;
			FSM_STATE_CALC_3:	fsm_next_state = FSM_STATE_CALC_4;
			FSM_STATE_CALC_4: begin
				fsm_next_state = FSM_STATE_CALC_4;
				if (pe_latency_done)
					fsm_next_state = FSM_STATE_CALC_5;
			end
			FSM_STATE_CALC_5: begin
				fsm_next_state = FSM_STATE_CALC_1;
				if (nn_addr_done)
					fsm_next_state = FSM_STATE_SAVE_1;
			end
			//
			// SAVE_1..SAVE_3 are single-cycle steps, SAVE_4 repeats
			// until rb_addr_done_dly, SAVE_5 either finishes (STOP) when
			// cyc_cnt_done is set or starts another CALC pass
			//
			FSM_STATE_SAVE_1:	fsm_next_state = FSM_STATE_SAVE_2;
			FSM_STATE_SAVE_2:	fsm_next_state = FSM_STATE_SAVE_3;
			FSM_STATE_SAVE_3:	fsm_next_state = FSM_STATE_SAVE_4;
			FSM_STATE_SAVE_4: begin
				fsm_next_state = FSM_STATE_SAVE_4;
				if (rb_addr_done_dly)
					fsm_next_state = FSM_STATE_SAVE_5;
			end
			FSM_STATE_SAVE_5: begin
				fsm_next_state = FSM_STATE_CALC_1;
				if (cyc_cnt_done)
					fsm_next_state = FSM_STATE_STOP;
			end
			//
			// STOP: single cycle, then back to IDLE
			//
			FSM_STATE_STOP:		fsm_next_state = FSM_STATE_IDLE;
			//
		endcase
		//
	end


endmodule

//======================================================================
// End of file
//======================================================================