aboutsummaryrefslogblamecommitdiff
path: root/src/rtl/modexpa7_systolic_multiplier.v
blob: 40ccac857887f63e6c44ed23a74d2b0a4783b55e (plain) (tree)


























































                                                                                                              

                                                                                                                     
 




                                                                          
 



                                                                                 



























                                                                                                          









                                                                         

                                                                                 
         

                  
                                                          


                                                   
                                       






































                                                                                                        

                                                                                                          





                                                                      
                         



                                                               
                         
                 
                   








                                                                                
                                                                                  















                                                                                                                                     
                            
                    
                         
















                                                                                                                                                                      
                         
                 













                                                                                                                            
                                 










                                                                


                                                                                                    


                                                                                       




                                                                                        




                                                          
                                                         
                          
                                                






                                                                     

                                                              
                                                                





                                                                                  


                          
                    
                         
                                 















                                                                                                                     
                                     
                                                        
                                                       
                                                       
                                                              








                                                               

                                         







                                                                                                                


                                 





                         


                                   










                                     

                                   







                                                                                                               


                                                            




                                                                     
        





                                                                                   


                                                   
























                                                                                        
 





                                                                                    

                                                                                      
                
                           













                                                                                         
          
 
                                 











                                                   

                                                                                      
                                














                                                                                                                                   


                                                                         
                            

                                                 
 






                                           

                                                                                                    
                        
 
 
                   
                                               
                    


                             
                                 
                






                                                                                                           

                                        
 
                   







                                                                     
                                                
                    

                                   







































                                                                                                                                         
                        

                   
 















                                                                                                      






                                                                                                                
                         


                                           










                                                                                                         













                                                                       

                                                                                              

                                                  
                                                                                       

                                                             
                                                  
                 


                                                            




                                                            










                                                                                                                      
 









                                                 
 









                                                                                                                       
 












                                                        
 






                                                                                    
 
































































































                                                                                                                                                                                           


 


                              
                   
                                              
                   
                                                                    

                                                                                             
                               
                   
                                               


















                                                                                                                                                                                                      
                                                                                                                                                                      
                           


                                                                                                                                                                                                       













                                                                                                                                                                                                       
                          
                                                                                                                                                                        










                                                                        
//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

module modexpa7_systolic_multiplier #
	(
			//
			// This sets the address widths of memory buffers. Internal data
			// width is 32 bits, so for e.g. 2048-bit operands buffers must store
			// 2048 / 32 = 64 words, and these need 6-bit address bus, because
			// 2 ** 6 = 64.
			//
		parameter	OPERAND_ADDR_WIDTH		= 4,
		
			//
			// Explain.
			//
		parameter	SYSTOLIC_ARRAY_POWER		= 2
	)
	(
		input											clk,
		input											rst_n,

		input											ena,
		output										rdy,

		input											reduce_only,

		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,

		input		[                32-1:0]	a_bram_out,
		input		[                32-1:0]	b_bram_out,
		input		[                32-1:0]	n_bram_out,
		input		[                32-1:0]	n_coeff_bram_out,

		output	[                32-1:0]	r_bram_in,
		output										r_bram_wr,

		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
	);
	
		
		/*
		 * Include Settings
		 */
	`include "pe/modexpa7_primitive_switch.v"
	`include "modexpa7_settings.v"
		

		/*
		 * FSM Declaration
		 */
	localparam	[ 7: 0]	FSM_STATE_IDLE				= 8'h00;
	
	localparam	[ 7: 0]	FSM_STATE_LOAD_START		= 8'h11;
	localparam	[ 7: 0]	FSM_STATE_LOAD_SHIFT		= 8'h12;
	localparam	[ 7: 0]	FSM_STATE_LOAD_WRITE		= 8'h13;
	localparam	[ 7: 0]	FSM_STATE_LOAD_FINAL		= 8'h14;

	localparam	[ 7: 0]	FSM_STATE_MULT_START		= 8'h21;
	localparam	[ 7: 0]	FSM_STATE_MULT_CRUNCH	= 8'h22;
	localparam	[ 7: 0]	FSM_STATE_MULT_FINAL		= 8'h23;
	
	localparam	[ 7: 0]	FSM_STATE_ADD_START		= 8'h31;
	localparam	[ 7: 0]	FSM_STATE_ADD_CRUNCH		= 8'h32;
	localparam	[ 7: 0]	FSM_STATE_ADD_UNLOAD		= 8'h33;
	localparam	[ 7: 0]	FSM_STATE_SUB_UNLOAD		= 8'h34;
	localparam	[ 7: 0]	FSM_STATE_ADD_FINAL		= 8'h35;
	
	localparam	[ 7: 0]	FSM_STATE_SAVE_START		= 8'h41;
	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE		= 8'h42;
	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL		= 8'h43;
	
	localparam	[ 7: 0]	FSM_STATE_STOP				= 8'hFF;
	
	
		/*
		 * FSM State / Next State / Previous State
		 */
	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
	reg	[ 7: 0]	fsm_next_state;
	//reg	[ 7: 0]	fsm_prev_state;


		/*
		 * Enable Delay and Trigger
		 */
   reg ena_dly = 1'b0;
	
		// delay enable by one clock cycle
   always @(posedge clk) ena_dly <= ena;

		// trigger new operation when enable goes high
   wire ena_trig = ena && !ena_dly;
	
	
		/*
		 * Ready Flag Logic
		 */
	reg rdy_reg = 1'b1;
	assign rdy = rdy_reg;

   always @(posedge clk or negedge rst_n)
		
			// reset flag
		if (rst_n == 1'b0) rdy_reg <= 1'b1;
		else begin
		
				// clear flag when operation is started
			if (fsm_state == FSM_STATE_IDLE)	rdy_reg <= ~ena_trig;
			
				// set flag after operation is finished
			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;			
			
		end
		
		
		/*
		 * Parameters Latch
		 */
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;
	reg	[OPERAND_ADDR_WIDTH  :0]	p_num_words_latch;
	reg										reduce_only_latch;

		// save number of words in n when new operation starts
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
			n_num_words_latch <= n_num_words;
			
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
			reduce_only_latch <= reduce_only;
			
		
		/*
		 * Multiplication Phase
		 */
	localparam	[ 1: 0]	MULT_PHASE_A_B				= 2'd1;
	localparam	[ 1: 0]	MULT_PHASE_AB_N_COEFF	= 2'd2;
	localparam	[ 1: 0]	MULT_PHASE_Q_N				= 2'd3;
	localparam	[ 1: 0]	MULT_PHASE_STALL			= 2'd0;
	
	reg	[ 1: 0]	mult_phase;
	
	wire	mult_phase_ab   = (mult_phase == MULT_PHASE_A_B)   ? 1'b1 : 1'b0;
	wire	mult_phase_done = (mult_phase == MULT_PHASE_STALL) ? 1'b1 : 1'b0;
	
   always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_LOAD_START:	if (ena_trig)	mult_phase <= MULT_PHASE_A_B;
			FSM_STATE_MULT_FINAL:
				case (mult_phase)
					MULT_PHASE_A_B:						mult_phase <= MULT_PHASE_AB_N_COEFF;
					MULT_PHASE_AB_N_COEFF:				mult_phase <= MULT_PHASE_Q_N;
					MULT_PHASE_Q_N:						mult_phase <= MULT_PHASE_STALL;
				endcase
		endcase
	
			
		/*
		 * Counters
		 */
			
		// handy values
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};

	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};	
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
	
		// counter
	reg	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt;
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt;
	
		// handy increment value and stop flag
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	load_mult_cnt_next	= load_mult_cnt + 1'b1;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	load_syst_cnt_next	= load_syst_cnt + 1'b1;

	wire										load_mult_cnt_done	= (load_mult_cnt == load_mult_cnt_last) ? 1'b1 : 1'b0;
	wire										load_syst_cnt_done	= (load_syst_cnt == load_syst_cnt_last) ? 1'b1 : 1'b0;
			
		
		/*
		 * Loader Count Logic
		 */
	always @(posedge clk) begin
		//
		case (fsm_state)
			FSM_STATE_LOAD_START:	{load_syst_cnt, load_mult_cnt} <= {load_syst_cnt_zero, load_mult_cnt_zero};
			//
			FSM_STATE_LOAD_SHIFT:	load_mult_cnt <= load_mult_cnt_next;
			FSM_STATE_LOAD_WRITE:	load_syst_cnt <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
		endcase
		//
	end
			
				
		/*
		 * Wide Operand Loader
		 */
	
		/*
		 * Explain how parallelized loader works here...
		 *
		 */
	
	
		// loader input
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_wr;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_rd;
	reg										loader_wren;
	reg	[                 32-1:0]	loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
	
		// loader output
	wire	[                 32-1:0]	loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];
	
		// array_input
	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	pe_a_wide;
	wire	[32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0]	pe_b_wide;
			
		// generate parallelized loader		
	genvar i;
	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
		//
		begin : gen_bram_1rw_1ro_readfirst_loader
			//
			bram_1rw_1ro_readfirst #
			(
				.MEM_WIDTH		(32),
				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH)
			)
			bram_loader
			(
				.clk		(clk),
				.a_addr	(loader_addr_wr),
				.a_wr		(loader_wren),
				.a_in		(loader_din[i]),
				.a_out	(),
				.b_addr	(loader_addr_rd),
				.b_out	(loader_dout[i])
			);
			//
			assign pe_b_wide[32 * (i + 1) - 1 -: 32] = loader_dout[i];
			//
		end
		//
	endgenerate
			
				
		/*
		 * Block Memory Addresses
		 */
		
		/*
		 * Explain why there are two memory sizes.
		 */
		
		// the very first addresses
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero			= {      {OPERAND_ADDR_WIDTH{1'b0}}};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero	= {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
	
		// the very last addresses
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = {n_num_words_latch};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {n_num_words_latch, 1'b1};

		// address registers
	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
	wire	[OPERAND_ADDR_WIDTH  :0]	p_addr_ext_wr;
	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_wr;
	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_rd;
	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_wr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_rd;
	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_wr;
	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd;
	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
		
		// handy increment values
	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next				= b_addr         + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next				= n_addr         + 1'b1;
	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_rd_next	= ab_addr_ext_rd + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_rd_next			= q_addr_rd      + 1'b1;
	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd_next	= qn_addr_ext_rd + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next				= s_addr         + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next			= sn_addr        + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next				= r_addr         + 1'b1;
	
		// write enables
	wire	p_wren;
	wire	ab_wren;
	wire	q_wren;
	wire	qn_wren;
	reg	s_wren;
	reg	sn_wren;
	reg	r_wren;
	
		// data buses
	wire	[31: 0]	p_data_in;
	wire	[31: 0]	ab_data_in;
	wire	[31: 0]	ab_data_out;
	wire	[31: 0]	q_data_in;
	wire	[31: 0]	q_data_out;
	wire	[31: 0]	qn_data_in;
	wire	[31: 0]	qn_data_out;
	wire	[31: 0]	s_data_in;
	wire	[31: 0]	s_data_out;
	wire	[31: 0]	sn_data_in;
	wire	[31: 0]	sn_data_out;
	wire	[31: 0]	r_data_in;
	
		// handy stop flags
	wire	b_addr_done				= (b_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
	wire	n_addr_done				= (n_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
	wire	ab_addr_ext_rd_done	= (ab_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
	wire	q_addr_rd_done			= (q_addr_rd      == bram_addr_last)     ? 1'b1 : 1'b0;
	wire	qn_addr_ext_rd_done	= (qn_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
	wire	s_addr_done				= (s_addr         == bram_addr_last)     ? 1'b1 : 1'b0;
	wire	sn_addr_done			= (sn_addr        == bram_addr_last)     ? 1'b1 : 1'b0;
	wire	r_addr_done				= (r_addr         == bram_addr_last)     ? 1'b1 : 1'b0;

		// delayed addresses
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;
	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_rd_dly;
	reg	[OPERAND_ADDR_WIDTH : 0]	qn_addr_ext_rd_dly1;
	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd_dly2;
	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_rd_dly3;
	
	always @(posedge clk) b_addr_dly				<= b_addr;
	always @(posedge clk) n_addr_dly				<= n_addr;
	always @(posedge clk) ab_addr_ext_rd_dly	<= ab_addr_ext_rd;
	always @(posedge clk) qn_addr_ext_rd_dly1 <= qn_addr_ext_rd;
	always @(posedge clk) qn_addr_ext_rd_dly2 <= qn_addr_ext_rd_dly1;
	always @(posedge clk) qn_addr_ext_rd_dly3 <= qn_addr_ext_rd_dly2;
				
		// map registers to top-level ports
	assign b_bram_addr = b_addr;
	assign n_bram_addr = n_addr;
	assign r_bram_addr = r_addr;

		// map
	assign ab_addr_ext_wr	= p_addr_ext_wr[OPERAND_ADDR_WIDTH  :0];
	assign q_addr_wr			= p_addr_ext_wr[OPERAND_ADDR_WIDTH-1:0];
	assign qn_addr_ext_wr	= p_addr_ext_wr[OPERAND_ADDR_WIDTH  :0];
	assign r_bram_wr			= r_wren;
	
	assign ab_data_in		= p_data_in;
	assign q_data_in		= p_data_in;
	assign qn_data_in		= p_data_in;
	assign r_bram_in		= r_data_in;
	
	assign ab_wren		= p_wren && (mult_phase == MULT_PHASE_A_B);
	assign q_wren		= p_wren && (mult_phase == MULT_PHASE_AB_N_COEFF);
	assign qn_wren		= p_wren && (mult_phase == MULT_PHASE_Q_N);
		

	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
	bram_ab
	(	.clk(clk),
		.a_addr(ab_addr_ext_wr), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(),
		.b_addr(ab_addr_ext_rd), .b_out(ab_data_out)
	);

	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_q
	(	.clk(clk),
		.a_addr(q_addr_wr), .a_wr(q_wren), .a_in(q_data_in), .a_out(),
		.b_addr(q_addr_rd), .b_out(q_data_out)
	);

	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
	bram_qn
	(	.clk(clk),
		.a_addr(qn_addr_ext_wr), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(),
		.b_addr(qn_addr_ext_rd), .b_out(qn_data_out)
	);

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_s
	(	.clk(clk),
		.a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)
	);

	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_sn
	(	.clk(clk),
		.a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out)
	);

				
		/*
		 * Loader Data Input 
		 */
	integer j;
	
		// shift logic
	always @(posedge clk)
		//
		case (fsm_state)
			//
			FSM_STATE_LOAD_SHIFT: begin
		
					// update the rightmost part of loader buffer
				case (mult_phase)
				
					MULT_PHASE_A_B:
						loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
							(b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
							
					MULT_PHASE_AB_N_COEFF:
						loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
							(ab_addr_ext_rd_dly <= {1'b0, bram_addr_last}) ? ab_data_out : {32{1'b0}};
							
					MULT_PHASE_Q_N:
						loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
							(n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
							
				endcase
				
					// shift the loader buffer to the left
				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_din[j-1] <= loader_din[j];
					
			end
			//			
		endcase


		/*
		 * Load Write Enable Logic
		 */
	always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_LOAD_WRITE:	loader_wren <= 1'b1;
			default:						loader_wren <= 1'b0;
		endcase


		/*
		 * Loader Address Update Logic
		 */

	always @(posedge clk)
		//
		case (fsm_state)
		
			FSM_STATE_LOAD_START:
				//
				loader_addr_wr <= load_syst_cnt_zero;
				
			FSM_STATE_LOAD_WRITE:
				//
				loader_addr_wr <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
					
		endcase

		/*
		 * Flag
		 */
	reg flag_select_s;
	
	assign r_data_in = flag_select_s ? s_data_out : sn_data_out;
	
	
		/*
		 * Memory Address Control Logic
		 */
	always @(posedge clk) begin
		//
		case (fsm_next_state)
		
			FSM_STATE_LOAD_START: begin
				ab_addr_ext_rd		<= bram_addr_ext_zero;
			end
								
			FSM_STATE_LOAD_SHIFT: begin
				ab_addr_ext_rd		<= ab_addr_ext_rd_next;
			end
		
			FSM_STATE_ADD_START: begin
				ab_addr_ext_rd		<= bram_addr_ext_zero;
				qn_addr_ext_rd		<= bram_addr_ext_zero;
			end
			
			FSM_STATE_ADD_CRUNCH: begin
				ab_addr_ext_rd		<= ab_addr_ext_rd_next;
				qn_addr_ext_rd		<= qn_addr_ext_rd_next;
			end
				
		endcase
		//
		case (fsm_next_state)
	
			FSM_STATE_LOAD_START: begin
				b_addr				<= bram_addr_zero;
				n_addr				<= bram_addr_zero;
			end
								
			FSM_STATE_LOAD_SHIFT: begin
				b_addr 				<= b_addr_next;
				n_addr				<= n_addr_next;
			end
					
			FSM_STATE_ADD_CRUNCH,
			FSM_STATE_ADD_UNLOAD: begin
				if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_last})			n_addr <= bram_addr_zero;
				else if (qn_addr_ext_rd_dly1 >  {1'b0, bram_addr_last})	n_addr <= n_addr_next;				
			end
			
		endcase
		//
	end


		/*
		 * Multiplier Array
		 */
	reg	pe_array_ena;
	wire	pe_array_rdy;

	always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_MULT_START:	pe_array_ena <= 1'b1;
			default:						pe_array_ena <= 1'b0;
		endcase
		
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_MULT_START)
			//
			case (mult_phase)
				MULT_PHASE_A_B:			p_num_words_latch <= {n_num_words_latch, 1'b1};
				MULT_PHASE_AB_N_COEFF:	p_num_words_latch <= {1'b0, n_num_words_latch};
				MULT_PHASE_Q_N:			p_num_words_latch <= {n_num_words_latch, 1'b1};
			endcase
			
	assign a_bram_addr = a_addr;
	assign n_coeff_bram_addr = a_addr;
	assign q_addr_rd = a_addr;
	
	reg	[31: 0]	a_data_out;
	
	always @*
		//
		case (mult_phase)
			MULT_PHASE_A_B:			a_data_out = a_bram_out;
			MULT_PHASE_AB_N_COEFF:	a_data_out = n_coeff_bram_out;
			MULT_PHASE_Q_N:			a_data_out = q_data_out;
			default:						a_data_out = {32{1'bX}};
		endcase
	
	modexpa7_systolic_multiplier_array #
	(
		.OPERAND_ADDR_WIDTH		(OPERAND_ADDR_WIDTH),
		.SYSTOLIC_ARRAY_POWER	(SYSTOLIC_ARRAY_POWER)
	)
	systolic_pe_array
	(
		.clk					(clk),
		.rst_n				(rst_n),

		.ena					(pe_array_ena),
		.rdy					(pe_array_rdy),

		.crt					(reduce_only_latch && mult_phase_ab),

		.loader_addr_rd	(loader_addr_rd),
		
		.pe_a_wide			({SYSTOLIC_ARRAY_LENGTH{a_data_out}}),
		.pe_b_wide			(pe_b_wide),
		
		.a_bram_addr		(a_addr),
		
		.p_bram_addr		(p_addr_ext_wr),
		.p_bram_in			(p_data_in),
		.p_bram_wr			(p_wren),

		.n_num_words		(n_num_words_latch),
		.p_num_words		(p_num_words_latch)
	);
	
		/*
		 * Adder
		 */
		 
	reg				add1_ce;					// clock enable
	wire	[31: 0]	add1_s;					// sum output
	wire				add1_c_in;				// carry input
	wire	[31: 0]	add1_a;					// A-input
	wire	[31: 0]	add1_b;					// B-input
	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
	wire				add1_c_out;				// carry output

	modexpa7_adder32 add1_inst
	(
		.clk		(clk),
		.ce		(add1_ce),
		.a			(add1_a),
		.b			(add1_b),
		.c_in		(add1_c_in),
		.s			(add1_s),
		.c_out	(add1_c_out)
	);

		/*
		 * Subtractor
		 */		 
	reg				sub1_ce;					// clock enable
	wire	[31: 0]	sub1_d;					// difference output
	wire				sub1_b_in;				// borrow input
	wire	[31: 0]	sub1_a;					// A-input
	wire	[31: 0]	sub1_b;					// B-input
	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
	wire				sub1_b_out;				// borrow output

	modexpa7_subtractor32 sub1_inst
	(
		.clk		(clk),
		.ce		(sub1_ce),
		.a			(sub1_a),
		.b			(sub1_b),
		.b_in		(sub1_b_in),
		.d			(sub1_d),
		.b_out	(sub1_b_out)
	);
	
		// add masking into carry feedback chain
	assign add1_c_in = add1_c_out & ~add1_c_in_mask;

		// add masking into borrow feedback chain
	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;

		// mask carry for the very first words of AB and QN
	always @(posedge clk)
		//
		add1_c_in_mask <= (fsm_state == FSM_STATE_ADD_START) ? 1'b1 : 1'b0;

		// mask borrow for the very first words of S and N
	always @(posedge clk)
		//
		sub1_b_in_mask <= add1_c_in_mask;
			
	
		// map adder inputs
	assign add1_a = ab_data_out;
	assign add1_b = qn_data_out;
	
		// map subtractor inputs
	assign sub1_a = add1_s;
	assign sub1_b = (qn_addr_ext_rd_dly2 <= {1'b0, bram_addr_last}) ? 32'd0 : n_bram_out;
	
		// clock enable
	always @(posedge clk) begin
		//
		case (fsm_state)
			FSM_STATE_ADD_START,
			FSM_STATE_ADD_CRUNCH:	add1_ce <= 1'b1;
			default:						add1_ce <= 1'b0;
		endcase
		//
		sub1_ce <= add1_ce;
		//
	end
		
		// map outputs
	assign s_data_in = add1_s;
	assign sn_data_in = sub1_d;
		
		// write enabled
	always @(posedge clk) begin
		//
		case (fsm_state)
			FSM_STATE_ADD_CRUNCH,
			FSM_STATE_ADD_UNLOAD:	s_wren <= 1'b1;
			default:						s_wren <= 1'b0;		
		endcase
		//
		case (fsm_state)
			FSM_STATE_ADD_CRUNCH,
			FSM_STATE_ADD_UNLOAD,
			FSM_STATE_SUB_UNLOAD,
			FSM_STATE_ADD_FINAL:		sn_wren <= s_wren;
			default:						sn_wren <= 1'b0;
		endcase
		//
		case (fsm_state)
			FSM_STATE_SAVE_START,
			FSM_STATE_SAVE_WRITE:	r_wren <= 1'b1;
			default:						r_wren <= 1'b0;
		endcase
		//
	end

		// ...
	always @(posedge clk) begin
		//
		case (fsm_state)
			FSM_STATE_ADD_CRUNCH,
			FSM_STATE_ADD_UNLOAD: begin
					if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_zero})			s_addr <= bram_addr_zero;
					else if (qn_addr_ext_rd_dly2 >  {1'b0, bram_addr_last})	s_addr <= s_addr_next;
				end
			FSM_STATE_ADD_FINAL:															s_addr <= bram_addr_zero;
			FSM_STATE_SAVE_START,
			FSM_STATE_SAVE_WRITE:														s_addr <= s_addr_next;
		endcase
		//
		case (fsm_state)
			FSM_STATE_ADD_CRUNCH,
			FSM_STATE_ADD_UNLOAD,
			FSM_STATE_SUB_UNLOAD: begin
					if (qn_addr_ext_rd_dly2 == {1'b0, bram_addr_zero})			sn_addr <= bram_addr_zero;
					else if (qn_addr_ext_rd_dly3 >  {1'b0, bram_addr_last})	sn_addr <= sn_addr_next;
				end
			FSM_STATE_ADD_FINAL:															sn_addr <= bram_addr_zero;
			FSM_STATE_SAVE_START,
			FSM_STATE_SAVE_WRITE:														sn_addr <= sn_addr_next;
		endcase
		//
		case (fsm_state)
			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
		endcase
		//
	end

		
		/*
		 * Flag Update Logic
		 */
	always @(posedge clk)
		//
		if (fsm_state == FSM_STATE_ADD_FINAL)
			flag_select_s <= sub1_b_out & ~add1_c_out;



			
		/*
		 * FSM Process
		 */
	always @(posedge clk or negedge rst_n)
		//
		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
		else						fsm_state <= fsm_next_state;

	//always @(posedge clk)
		//
		//fsm_prev_state <= fsm_state;
	
	
		/*
		 * FSM Transition Logic
		 */
	always @* begin
		//
		fsm_next_state = FSM_STATE_STOP;
		//
		case (fsm_state)
			//
			FSM_STATE_IDLE:				if (ena_trig)					fsm_next_state = FSM_STATE_LOAD_START;
												else								fsm_next_state = FSM_STATE_IDLE;
			//
			FSM_STATE_LOAD_START:											fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_SHIFT:		if (load_mult_cnt_done)		fsm_next_state = FSM_STATE_LOAD_WRITE;
												else								fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_WRITE:		if (load_syst_cnt_done)		fsm_next_state = FSM_STATE_LOAD_FINAL;
												else								fsm_next_state = FSM_STATE_LOAD_SHIFT;
			FSM_STATE_LOAD_FINAL:											fsm_next_state = FSM_STATE_MULT_START;
			//
			FSM_STATE_MULT_START:											fsm_next_state = FSM_STATE_MULT_CRUNCH;
			FSM_STATE_MULT_CRUNCH:		if (pe_array_rdy)				fsm_next_state = FSM_STATE_MULT_FINAL;
												else								fsm_next_state = FSM_STATE_MULT_CRUNCH;
			FSM_STATE_MULT_FINAL:		if (mult_phase_done)			fsm_next_state = FSM_STATE_ADD_START;
												else								fsm_next_state = FSM_STATE_LOAD_START;
			//
			FSM_STATE_ADD_START:												fsm_next_state = FSM_STATE_ADD_CRUNCH;
			FSM_STATE_ADD_CRUNCH:		if (ab_addr_ext_rd_done)	fsm_next_state = FSM_STATE_ADD_UNLOAD;
												else								fsm_next_state = FSM_STATE_ADD_CRUNCH;
			FSM_STATE_ADD_UNLOAD:											fsm_next_state = FSM_STATE_SUB_UNLOAD;
			FSM_STATE_SUB_UNLOAD:											fsm_next_state = FSM_STATE_ADD_FINAL;
			FSM_STATE_ADD_FINAL:												fsm_next_state = FSM_STATE_SAVE_START;
			//
			FSM_STATE_SAVE_START:											fsm_next_state = FSM_STATE_SAVE_WRITE;
			FSM_STATE_SAVE_WRITE:		if (s_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
												else								fsm_next_state = FSM_STATE_SAVE_WRITE;
			FSM_STATE_SAVE_FINAL:											fsm_next_state = FSM_STATE_STOP;
			//
			FSM_STATE_STOP:													fsm_next_state = FSM_STATE_IDLE;
			//
		endcase
		//
	end


endmodule

//======================================================================
// End of file
//======================================================================