aboutsummaryrefslogblamecommitdiff
path: root/src/rtl/modexpa7_systolic_multiplier.v
blob: 513b5aa28bd193431feb71cae625877817f6dc14 (plain) (tree)









































                                                                                         


                                                                                              
                           
                                                                     

                           
                                    
                           
                                                                    





















                                                                                                               
                                                                             

           
                 
                   
                                    
                   


                                                  



                                   
                                                                                                                 
 



                                                                                                 
 



                                                                                         
 



                                                                                                 
 



                                                                                                 
 








                                                                                                 
         



                                                                                                                 
                                                                                                                 

                   
                                          
                   

                                                    
 




                                            
         

                                                      
 


                                                                  
         




                                    
 













                                                                                                         
                 
                   
                                    
                   
                                                                    
 
                                                                                

                              



                                                              
                   
                                           
                   



                                                                                                                               
         








                                                                                                          
 



                                                                                                                                                                            
 


                                                                   
 

                   
                                                
                   



                                                                                              
         







                                                                                                                                                   
                   


                                                        

                                 






                                                                                                   
                        

                                 






                                                                                                                                                   
                        

                   


                   
                                  
                   
         



                                                                 
         



                                                                                        
         




                                                                                                                                    
         




                                                                                            
                   
                                                                                         
                                                                                             
                                  
                   


                                                           
                   
                                                      
                           

















                                                                                    
 
                   
                                          
                   
                 




















                                                                                                                      


                                                         
                 







                                                                                                       


                                                                                   








                                                                                                     


                                                                                     















                                                                  
                                     


                   





                               

                                                
                                    










                                                                                                   













                                                                               




                                                                                                       

            
         
                   
                                     
                   
 



                                    


                                    
 



                                     

                                     
 



                                    






                                      
 

                                                                                                           
 

                                                                                                  
         

                                                                                                           
 





                                                                                                       

                   
                                       
                   
                   
         
                                  


                                 

                                                      
                 









                                                                                                                                 
                 












                                                                                                                                             
                                 





                                                                                          
                        
                 
 
                                         

                              
                                      
                 











                                                                          

                        

                                                  


                                 
































                                                                                                                        
                        



















                                                                                       
 
                        
                   


            
                   
                                                         
                   






                                                             

                   
                                                                     
                   

                                                                                          
 













                                                                   
 
 







                                                               
 


                                                                         
 


                                                                                                             
         
                                                                                                                

                              
                                                              


                              

































                                                                                                                                
                            
                           


                        



                                    
                   


                                                                                                                               


                                                                                                                                                                                                                                                                                         


                                                                                                                             






























                                                                                                                                                                     

                         
                 
                   



                                                                                                           
                   



                                                                                                               
 
                   
            
                 
                                    
                   






                                                                                       
                   






                                                                                      
                   






                                                                                       





                                                                                                
                   

            


                              








                                                                
                         





                                                                                                           
                        
 

                 





                                                          
                 





                                                                                                                   


                              



                                                          
                 
















                                                                                                                    
 
                 
                           


















































                                                                                                                                             
                   

                   

                 





































































































































                                                                                                                         


                               





                                                                                             



                                        




                                                 

                                                                                                                                                 
                                                                                                                                                                                         








































                                                                                                                                                                                                                                   





                                                                                                                                                                                               
                           
                                                                                                                                                                 
 
                        

                   






                                                                         
//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

module modexpa7_systolic_multiplier #
	(
			//
			// This sets the address widths of memory buffers. Internal data
			// width is 32 bits, so e.g. for 2048-bit operands the buffers must
			// store 2048 / 32 = 64 words, and these need a 6-bit address bus,
			// because 2 ** 6 = 64.
			//
		parameter	OPERAND_ADDR_WIDTH		= 4,
		
			//
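			// Number of low-order bits of the operand word count that are
			// stripped to obtain the index of the last systolic cycle
			// (syst_cnt_last); also the width of the mult_cnt counter.
			// Presumably the systolic array has 2 ** SYSTOLIC_ARRAY_POWER
			// processing elements (SYSTOLIC_ARRAY_LENGTH itself comes from
			// the included settings file) - TODO confirm.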
			//
		parameter	SYSTOLIC_ARRAY_POWER		= 2
	)
	(
		input											clk,
		input											rst_n,

		input											ena,
		output										rdy,

		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,

		input		[                32-1:0]	a_bram_out,
		input		[                32-1:0]	b_bram_out,
		input		[                32-1:0]	n_bram_out,
		input		[                32-1:0]	n_coeff_bram_out,

		output	[                32-1:0]	r_bram_in,
		output										r_bram_wr,

		input		[OPERAND_ADDR_WIDTH-1:0]	ab_num_words
	);
	
		
		//
		// Include Settings
		//
	`include "pe/modexpa7_primitive_switch.v"
	`include "modexpa7_settings.v"
		

		//
		// FSM Declaration
		//
		// State codes are 8 bits wide. In the values below the upper nibble
		// groups the states by phase (1 = load B, 2 = load N_COEFF,
		// 3 = load N, 4 = multiply A*B, 5 = multiply AB*N_COEFF,
		// 6 = multiply Q*N, 7 = save) and the lower nibble numbers the steps
		// inside a phase. IDLE (8'h00) and STOP (8'hFF) sit outside of
		// that scheme.
		//
	localparam	[ 7: 0]	FSM_STATE_IDLE								= 8'h00;

		/* phase 1: B operand load steps */
	localparam	[ 7: 0]	FSM_STATE_LOAD_B_START					= 8'h11;
	localparam	[ 7: 0]	FSM_STATE_LOAD_B_SHIFT					= 8'h12;
	localparam	[ 7: 0]	FSM_STATE_LOAD_B_WRITE					= 8'h13;
	localparam	[ 7: 0]	FSM_STATE_LOAD_B_FINAL					= 8'h14;

		/* phase 2: N_COEFF load steps */
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_START			= 8'h21;
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_SHIFT			= 8'h22;
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_WRITE			= 8'h23;
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_COEFF_FINAL			= 8'h24;

		/* phase 3: N load steps */
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_START					= 8'h31;
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_SHIFT					= 8'h32;
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_WRITE					= 8'h33;
	localparam	[ 7: 0]	FSM_STATE_LOAD_N_FINAL					= 8'h34;

		/* phase 4: A * B multiplication steps */
	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_START				= 8'h41;
	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_CRUNCH				= 8'h42;
	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_RELOAD				= 8'h43;
	localparam	[ 7: 0]	FSM_STATE_MULT_A_B_FINAL				= 8'h44;

		/* phase 5: AB * N_COEFF multiplication steps */
	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_START		= 8'h51;
	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_CRUNCH		= 8'h52;
	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_RELOAD		= 8'h53;
	localparam	[ 7: 0]	FSM_STATE_MULT_AB_N_COEFF_FINAL		= 8'h54;

		/* phase 6: Q * N multiplication steps */
	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_START				= 8'h61;
	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_CRUNCH				= 8'h62;
	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_RELOAD				= 8'h63;
	localparam	[ 7: 0]	FSM_STATE_MULT_Q_N_FINAL				= 8'h64;
	
		/* phase 7: result save steps */
	localparam	[ 7: 0]	FSM_STATE_SAVE_START						= 8'h71;
	localparam	[ 7: 0]	FSM_STATE_SAVE_WRITE						= 8'h72;
	localparam	[ 7: 0]	FSM_STATE_SAVE_FINAL						= 8'h73;	
	
		/* terminal state; the ready flag logic sets rdy while the FSM is here */
	localparam	[ 7: 0]	FSM_STATE_STOP								= 8'hFF;
	
		//
		// FSM State / Next State
		//
	reg	[ 7: 0]	fsm_state = FSM_STATE_IDLE;
	reg	[ 7: 0]	fsm_next_state;


		//
		// Start Trigger (rising edge detector on ena)
		//

		/* registered copy of ena from the previous clock, starts out low */
	reg	ena_dly = 1'b0;

	always @(posedge clk)
		ena_dly <= ena;

		/* high only while ena is high and its registered copy is still low */
	wire	ena_trig;
	assign ena_trig = ena && !ena_dly;
	
	
		//
		// Ready Flag Logic
		//
		// rdy is high after reset. While the FSM sits in IDLE the flag
		// follows the inverted start trigger, so it drops when an operation
		// is started; it is raised again once the FSM reaches STOP. In all
		// other states the flag keeps its value.
		//
	reg rdy_reg = 1'b1;
	assign rdy = rdy_reg;

	always @(posedge clk or negedge rst_n) begin
		//
		if (!rst_n)
				/* asynchronous active-low reset: report ready */
			rdy_reg <= 1'b1;
		else
			case (fsm_state)
					/* clear flag when operation is started */
				FSM_STATE_IDLE:	rdy_reg <= !ena_trig;
					/* set flag after operation is finished */
				FSM_STATE_STOP:	rdy_reg <= 1'b1;
			endcase
		//
	end
		
		
		//
		// Parameters Latch
		//
	reg	[OPERAND_ADDR_WIDTH-1:0]	ab_num_words_latch;

		/* capture ab_num_words on the clock edge that takes the FSM into LOAD_B_START */
	always @(posedge clk) begin
		//
		case (fsm_next_state)
			FSM_STATE_LOAD_B_START:	ab_num_words_latch <= ab_num_words;
		endcase
		//
	end
			
			
		//
		// Systolic Cycle Counters
		//

		/* boundary values: all-zero index and the index of the last cycle,
		 * the latter being the latched word count without its
		 * SYSTOLIC_ARRAY_POWER low-order bits */
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

		/* init counter, its incremented value and its "reached last" flag */
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_init_next = syst_cnt_init + 1'b1;
	wire										syst_cnt_init_done = (syst_cnt_init == syst_cnt_last);

		/* load counter, its incremented value and its "reached last" flag */
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_next = syst_cnt_load + 1'b1;
	wire										syst_cnt_load_done = (syst_cnt_load == syst_cnt_last);

		/* unload counter, its incremented value and its "reached last" flag */
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload;
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_unload_next = syst_cnt_unload + 1'b1;
	wire										syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last);

		/* load counter as it was one clock ago */
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_load_dly;

	always @(posedge clk)
		syst_cnt_load_dly <= syst_cnt_load;


		//
		// Multiplier Iteration Counter
		//

		/* boundary values: all zeroes and all ones */
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};

		/* the counter itself, SYSTOLIC_ARRAY_POWER bits wide */
	reg	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt;

		/* incremented value (wraps around) and "counter is all ones" flag */
	wire	[SYSTOLIC_ARRAY_POWER-1:0]	mult_cnt_next = mult_cnt + 1'b1;
	wire										mult_cnt_done = (mult_cnt == mult_cnt_last);
			
			
		//
		// Initialization Counter Control Logic
		//
		// Drives mult_cnt and syst_cnt_init during the three operand load
		// phases (B, N_COEFF, N). Both counters keep their values in every
		// state that is not listed.
		//
	always @(posedge clk)
		//
		case (fsm_state)
			//
				/* a load phase begins: restart both counters */
			FSM_STATE_LOAD_B_START,
			FSM_STATE_LOAD_N_COEFF_START,
			FSM_STATE_LOAD_N_START: begin
				mult_cnt			<= mult_cnt_zero;
				syst_cnt_init	<= syst_cnt_zero;
			end
			//
				/* one more word was shifted into the loader buffer */
			FSM_STATE_LOAD_B_SHIFT,
			FSM_STATE_LOAD_N_COEFF_SHIFT,
			FSM_STATE_LOAD_N_SHIFT:
				mult_cnt <= mult_cnt_next;
			//
				/* loader buffer was written: advance, but stop at the last index */
			FSM_STATE_LOAD_B_WRITE,
			FSM_STATE_LOAD_N_COEFF_WRITE,
			FSM_STATE_LOAD_N_WRITE:
				syst_cnt_init <= syst_cnt_init_done ? syst_cnt_init : syst_cnt_init_next;
			//
		endcase
	
	
		//
		// Operand Loader
		//
	
		/*
		 * The loader consists of SYSTOLIC_ARRAY_LENGTH independent memories
		 * (instantiated below), one per element of the arrays declared here.
		 * Every memory has its own address, write enable and data input
		 * registers and its own data output. The address is split into two
		 * parts: the upper two bits select a bank, the lower
		 * SYSTOLIC_CNTR_WIDTH bits select a word inside the bank.
		 */
	
		/* loader banks (values of the upper two address bits) */
	localparam	[ 1: 0]	LOADER_ADDR_MSB_B				= 2'd0;
	localparam	[ 1: 0]	LOADER_ADDR_MSB_N_COEFF		= 2'd1;
	localparam	[ 1: 0]	LOADER_ADDR_MSB_N				= 2'd2;
	
		/* loader input: bank select, word index, write enable and write data per memory */
	reg	[                  2-1:0]	loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];	
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
	reg										loader_wren    [0:SYSTOLIC_ARRAY_LENGTH-1];
	reg	[                 32-1:0]	loader_din     [0:SYSTOLIC_ARRAY_LENGTH-1];
	
		/* loader output: read data per memory */
	wire	[                 32-1:0]	loader_dout    [0:SYSTOLIC_ARRAY_LENGTH-1];
			
		/* generate parallelized loader */
		
		//
		// The loader currently stores B, N_COEFF and N. It could be coded
		// another way, to initially store B, then AB, then Q. Some memory
		// could be saved that way. Maybe later...
		//
		
	genvar i;
		/*
		 * One bram_1rw_readfirst instance per systolic array element. Each
		 * instance is 32 bits wide and has SYSTOLIC_CNTR_WIDTH + 2 address
		 * bits; the address is the concatenation of the bank select
		 * (loader_addr_msb) and the word index (loader_addr_lsb) of that
		 * element. bram_1rw_readfirst is defined outside of this file;
		 * judging by its name it is a single-port read-first memory -
		 * TODO confirm.
		 */
	generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
		//
		begin : gen_bram_1rw_readfirst_loader
			//
			bram_1rw_readfirst #
			(
				.MEM_WIDTH		(32),
				.MEM_ADDR_BITS	(SYSTOLIC_CNTR_WIDTH + 2)
			)
			bram_loader
			(
				.clk		(clk),
					/* {bank, word index} */
				.a_addr	({loader_addr_msb[i], loader_addr_lsb[i]}),
				.a_wr		(loader_wren[i]),
				.a_in		(loader_din[i]),
				.a_out	(loader_dout[i])
			);
			//
		end
		//
	endgenerate
	

		//
		// Block Memory Addresses
		//

		/*
		 * Two address flavours are declared here: regular addresses are
		 * OPERAND_ADDR_WIDTH bits wide, "extended" ones (names with _ext)
		 * carry one extra bit. Every address register comes with its
		 * incremented value (*_next) and a flag telling that the register
		 * has reached the last address (*_done).
		 */

		/* first and last addresses for both flavours; the extended last
		 * address is the latched word count with a one appended as LSB */
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero     = {OPERAND_ADDR_WIDTH{1'b0}};
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_zero = {(OPERAND_ADDR_WIDTH+1){1'b0}};
	wire	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last     = ab_num_words_latch;
	wire	[OPERAND_ADDR_WIDTH  :0]	bram_addr_ext_last = {ab_num_words_latch, 1'b1};

		/* A */
	reg	[OPERAND_ADDR_WIDTH-1:0]	a_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	a_addr_next = a_addr + 1'b1;
	wire										a_addr_done = (a_addr == bram_addr_last);

		/* B */
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	b_addr_next = b_addr + 1'b1;
	wire										b_addr_done = (b_addr == bram_addr_last);

		/* N_COEFF */
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_next = n_coeff_addr + 1'b1;
	wire										n_coeff_addr_done = (n_coeff_addr == bram_addr_last);

		/* N */
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	n_addr_next = n_addr + 1'b1;
	wire										n_addr_done = (n_addr == bram_addr_last);

		/* AB (extended) */
	reg	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext;
	wire	[OPERAND_ADDR_WIDTH  :0]	ab_addr_ext_next = ab_addr_ext + 1'b1;
	wire										ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last);

		/* Q */
	reg	[OPERAND_ADDR_WIDTH-1:0]	q_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	q_addr_next = q_addr + 1'b1;
	wire										q_addr_done = (q_addr == bram_addr_last);

		/* QN (extended) */
	reg	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext;
	wire	[OPERAND_ADDR_WIDTH  :0]	qn_addr_ext_next = qn_addr_ext + 1'b1;
	wire										qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last);

		/* S */
	reg	[OPERAND_ADDR_WIDTH-1:0]	s_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	s_addr_next = s_addr + 1'b1;
	wire										s_addr_done = (s_addr == bram_addr_last);

		/* SN */
	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_addr_next = sn_addr + 1'b1;
	wire										sn_addr_done = (sn_addr == bram_addr_last);

		/* R */
	reg	[OPERAND_ADDR_WIDTH-1:0]	r_addr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	r_addr_next = r_addr + 1'b1;
	wire										r_addr_done = (r_addr == bram_addr_last);

		/* copies of the B, N_COEFF and N addresses delayed by one clock */
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_addr_dly;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_addr_dly;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_addr_dly;

	always @(posedge clk) begin
		b_addr_dly			<= b_addr;
		n_coeff_addr_dly	<= n_coeff_addr;
		n_addr_dly			<= n_addr;
	end
				
		/* top-level address outputs are driven straight from the address registers */
	assign	a_bram_addr       = a_addr,
				b_bram_addr       = b_addr,
				n_bram_addr       = n_addr,
				n_coeff_bram_addr = n_coeff_addr,
				r_bram_addr       = r_addr;


		//
		// Flag
		//
	reg	flag_select_s;
	
	
		//
		// Memory Address Control Logic
		//
		// Drives b_addr, n_coeff_addr, n_addr, r_addr and a_addr. The other
		// address registers declared above are not written in this block.
		//
		// Note that n_addr is assigned by two of the case statements below
		// (one keyed on fsm_next_state, one on fsm_state). Should both of
		// them fire on the same clock edge, the assignment from the later
		// statement wins, so the order of the case statements matters.
		//
	always @(posedge clk) begin
		//
		// operand load phases: addresses are updated one clock ahead, i.e.
		// based on the state the FSM is about to enter
		//
		case (fsm_next_state)
				/* entering a START state: rewind the source address */
			FSM_STATE_LOAD_B_START:				b_addr <= bram_addr_zero;
			FSM_STATE_LOAD_N_COEFF_START:		n_coeff_addr <= bram_addr_zero;
			FSM_STATE_LOAD_N_START:				n_addr <= bram_addr_zero;
			
				/* entering a SHIFT state: advance to the next source word */
			FSM_STATE_LOAD_B_SHIFT:				b_addr <= b_addr_next;
			FSM_STATE_LOAD_N_COEFF_SHIFT:		n_coeff_addr <= n_coeff_addr_next;
			FSM_STATE_LOAD_N_SHIFT:				n_addr <= n_addr_next;
		endcase
		//
		// Q * N phase: n_addr is rewound when qn_addr_ext is exactly the
		// zero-extended last address and incremented when qn_addr_ext is
		// beyond it; it is left alone while qn_addr_ext is below
		//
		case (fsm_state)
			FSM_STATE_MULT_Q_N_RELOAD: 
				if (qn_addr_ext == {1'b0, bram_addr_last})
					n_addr		<= bram_addr_zero;
				else if (qn_addr_ext > {1'b0, bram_addr_last})
					n_addr		<= n_addr_next;
			
		endcase
		//
		// save phase: result address is rewound in START and then
		// incremented on every clock spent in WRITE
		//
		case (fsm_state)
			FSM_STATE_SAVE_START:	r_addr <= bram_addr_zero;
			FSM_STATE_SAVE_WRITE:	r_addr <= r_addr_next;
		endcase
		//
		// A * B phase: a_addr is rewound when START is entered and advanced
		// whenever RELOAD is entered, but never past the last address
		//
		case (fsm_next_state)
			FSM_STATE_MULT_A_B_START:	a_addr <= bram_addr_zero;
			FSM_STATE_MULT_A_B_RELOAD:	a_addr <= !a_addr_done ? a_addr_next : a_addr;
		endcase
		//
	end
	
	
		//
		// Internal Memories
		//
		// Five 32-bit wide bram_1rw_readfirst instances: AB and QN use the
		// extended (OPERAND_ADDR_WIDTH+1 bit) addresses, Q, S and SN use the
		// regular (OPERAND_ADDR_WIDTH bit) ones. R has no memory in this
		// module, its data and write enable go to the r_bram_* ports.
		//

		/* memory inputs (s_data_in and sn_data_in are nets, the rest are registers) */
	reg	[31: 0]	ab_data_in;
	reg	[31: 0]	q_data_in;
	reg	[31: 0]	qn_data_in;
	wire	[31: 0]	s_data_in;
	wire	[31: 0]	sn_data_in;
	reg	[31: 0]	r_data_in;

		/* memory outputs */
	wire	[31: 0]	ab_data_out;
	wire	[31: 0]	q_data_out;
	wire	[31: 0]	qn_data_out;
	wire	[31: 0]	s_data_out;
	wire	[31: 0]	sn_data_out;

		/* write enables */
	reg	ab_wren;
	reg	q_wren;
	reg	qn_wren;
	reg	s_wren;
	reg	sn_wren;
	reg	r_wren;
	
		/* map result data and write enable to the top-level ports */
	assign r_bram_in = r_data_in;
	assign r_bram_wr = r_wren;

		/* AB: extended address range */
	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
	bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));

		/* Q: regular address range */
	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
	
		/* QN: extended address range */
	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
	bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));

		/* S: regular address range */
	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));

		/* SN: regular address range */
	bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));

	
		//
		// Wide Operand Loader
		//
	integer j;

		/*
		 * Shift logic. The loader_din registers form a shift register that
		 * is as long as the systolic array. In each of the three SHIFT
		 * states every word moves one position towards index 0 and a new
		 * word enters at the highest index. The new word is taken from the
		 * external memory of the operand being loaded, or is forced to zero
		 * when the delayed source address is beyond the last operand word.
		 */
	always @(posedge clk) begin
		//
			/* common part: shift the loader buffer to the left */
		case (fsm_state)
			FSM_STATE_LOAD_B_SHIFT,
			FSM_STATE_LOAD_N_COEFF_SHIFT,
			FSM_STATE_LOAD_N_SHIFT:
				//
				for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_din[j-1] <= loader_din[j];
		endcase
		//
			/* operand specific part: update the rightmost part of loader buffer */
		case (fsm_state)
			//
			FSM_STATE_LOAD_B_SHIFT:
				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
					(b_addr_dly > bram_addr_last) ? {32{1'b0}} : b_bram_out;
			//
			FSM_STATE_LOAD_N_COEFF_SHIFT:
				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
					(n_coeff_addr_dly > bram_addr_last) ? {32{1'b0}} : n_coeff_bram_out;
			//
			FSM_STATE_LOAD_N_SHIFT:
				loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
					(n_addr_dly > bram_addr_last) ? {32{1'b0}} : n_bram_out;
			//
		endcase
		//
	end
		

		/*
		 * Write enable logic. All loader memories share the same write
		 * enable value: it is registered high on the clock edge that takes
		 * the FSM into one of the three WRITE states and low otherwise.
		 */
	always @(posedge clk)
		//
		for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
			//
			case (fsm_next_state)
				FSM_STATE_LOAD_B_WRITE,
				FSM_STATE_LOAD_N_COEFF_WRITE,
				FSM_STATE_LOAD_N_WRITE:	loader_wren[j] <= 1'b1;
				default:						loader_wren[j] <= 1'b0;
			endcase

		/*
		 * Loader address update logic.
		 *
		 * All loader memories always receive the same address. The block has
		 * three parts:
		 *
		 *   1. word index while operands are being loaded (keyed on the
		 *      current state),
		 *   2. word index while the multiplier is running (keyed on the
		 *      next state),
		 *   3. bank select (keyed on the next state).
		 *
		 * Parts 1 and 2 both write loader_addr_lsb; if they ever fire on
		 * the same clock edge, part 2 wins because it comes later in the
		 * block, so the order of the case statements must be preserved.
		 */
	always @(posedge clk) begin
		//
		// part 1: word index during the load phases
		//
		case (fsm_state)
		
				/* a load phase begins: start from word zero */
			FSM_STATE_LOAD_B_START,
			FSM_STATE_LOAD_N_COEFF_START,
			FSM_STATE_LOAD_N_START:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_lsb[j] <= syst_cnt_zero;
					
				/* follow the init counter, which stops at the last index */
			FSM_STATE_LOAD_B_WRITE,
			FSM_STATE_LOAD_N_COEFF_WRITE,
			FSM_STATE_LOAD_N_WRITE:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
					
		endcase
		//
		// part 2: word index during the multiplication phases
		//
		case (fsm_next_state)
				/* entering START or RELOAD: start from word zero */
			FSM_STATE_MULT_A_B_START,
			FSM_STATE_MULT_AB_N_COEFF_START,
			FSM_STATE_MULT_Q_N_START,
			FSM_STATE_MULT_A_B_RELOAD,
			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
			FSM_STATE_MULT_Q_N_RELOAD:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_lsb[j] <= syst_cnt_zero;
													
				/*
				 * Entering CRUNCH: follow the load counter and hold its
				 * value once it has reached the last index. The hold value
				 * is taken from the load counter itself and not from the
				 * init counter, which belongs to the operand load phases
				 * and only happens to hold the same value if it was left
				 * at the last index.
				 */
			FSM_STATE_MULT_A_B_CRUNCH,
			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
			FSM_STATE_MULT_Q_N_CRUNCH:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
		endcase
		//
		// part 3: bank select, switched when a load or a multiplication
		// phase that works with the corresponding operand is entered
		//
		case (fsm_next_state)
		
				/* bank B: loading B, multiplying A * B */
			FSM_STATE_LOAD_B_START,
			FSM_STATE_MULT_A_B_START:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_msb[j] <= LOADER_ADDR_MSB_B;

				/* bank N_COEFF: loading N_COEFF, multiplying AB * N_COEFF */
			FSM_STATE_LOAD_N_COEFF_START,
			FSM_STATE_MULT_AB_N_COEFF_START:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
					
				/* bank N: loading N, multiplying Q * N */
			FSM_STATE_LOAD_N_START,
			FSM_STATE_MULT_Q_N_START:
				//
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
					loader_addr_msb[j] <= LOADER_ADDR_MSB_N;

		endcase
		//
	end
	
	
		//
		// Systolic Array of Processing Elements
		//
		// One 32-bit word per processing element for each signal:
		//   pe_a, pe_b, pe_t, pe_c_in - registers written by this module and
		//                               connected to the a / b / t / c_in ports
		//   pe_p, pe_c_out            - wires connected to the p / c_out ports
		//
	reg	[31: 0]	pe_a    [0:SYSTOLIC_ARRAY_LENGTH-1];
	reg	[31: 0]	pe_b    [0:SYSTOLIC_ARRAY_LENGTH-1];
	reg	[31: 0]	pe_t    [0:SYSTOLIC_ARRAY_LENGTH-1];
	reg	[31: 0]	pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
	wire	[31: 0]	pe_p    [0:SYSTOLIC_ARRAY_LENGTH-1];
	wire	[31: 0]	pe_c_out[0:SYSTOLIC_ARRAY_LENGTH-1];
	

		//
		// These can be turned into a FIFO (maybe later?)...
		//
		// Storage for the c_out and shifted p words captured while the array is
		// being unloaded; they are fed back into c_in and t on the next pass.
		//
		// Both memories are addressed as mem[cycle][pe] by all of their readers
		// and writers (cycle = syst_cnt_unload / syst_cnt_load_dly, pe = 0 ..
		// SYSTOLIC_ARRAY_LENGTH-1), so the first dimension has to span
		// SYSTOLIC_NUM_CYCLES and the second one SYSTOLIC_ARRAY_LENGTH. Declaring
		// them the other way round only works when the two parameters are equal.
		//
	reg	[31: 0]	pe_c_out_mem[0:SYSTOLIC_NUM_CYCLES-1][0:SYSTOLIC_ARRAY_LENGTH-1];
	reg	[31: 0]	pe_t_mem    [0:SYSTOLIC_NUM_CYCLES-1][0:SYSTOLIC_ARRAY_LENGTH-1];

	//
	// Instantiate SYSTOLIC_ARRAY_LENGTH processing elements; element i is wired
	// to the i-th word of each per-PE signal array.
	//
	generate
		for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1) begin : modexpa7_systolic_pe_multiplier
			//
			modexpa7_systolic_pe systolic_pe_inst (
				.clk   (clk),
				.a     (pe_a[i]),
				.b     (pe_b[i]),
				.t     (pe_t[i]),
				.c_in  (pe_c_in[i]),
				.p     (pe_p[i]),
				.c_out (pe_c_out[i])
			);
			//
		end
	endgenerate


		
			
			//
			// Shift Registers
			//
			// shreg_load, shreg_latency and shreg_unload are shifted left by one
			// position per clock in the *_CRUNCH states. The shreg_done_* wires tap
			// the bit that marks the end of the corresponding phase: bit
			// syst_cnt_last for load and unload, bit SYSTOLIC_PE_LATENCY for the
			// latency register. The shreg_now_* flags record which phase is active.
			//
	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_load;
	reg	[SYSTOLIC_PE_LATENCY  :0]	shreg_latency;
	reg	[SYSTOLIC_NUM_CYCLES-1:0]	shreg_unload;

	wire	shreg_done_load = shreg_load[syst_cnt_last];
	wire	shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
	wire	shreg_done_unload = shreg_unload[syst_cnt_last];

	reg										shreg_now_loading;
	reg										shreg_now_latency;
	reg										shreg_now_unloading;
	
	//
	// shreg_done_latency delayed by one clock cycle
	//
	reg	shreg_done_latency_dly;

	always @(posedge clk) begin
		shreg_done_latency_dly <= shreg_done_latency;
	end

	//
	// Shift register and phase flag control
	//
	always @(posedge clk) begin
		//
		case (fsm_state)
			//
			// a pass is about to begin: seed the load and latency registers with
			// a single 1 in bit 0, clear the unload register, set the phase flags
			//
			FSM_STATE_MULT_A_B_START,
			FSM_STATE_MULT_A_B_RELOAD,
			FSM_STATE_MULT_AB_N_COEFF_START,
			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
			FSM_STATE_MULT_Q_N_START,
			FSM_STATE_MULT_Q_N_RELOAD: begin
				//
				shreg_load		<= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
				shreg_latency	<= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
				shreg_unload	<= {SYSTOLIC_NUM_CYCLES{1'b0}};
				//
				shreg_now_loading		<= 1'b1;
				shreg_now_latency		<= 1'b1;
				shreg_now_unloading	<= 1'b0;
				//
			end
			//
			// pass in progress: shift everything left by one, the bit leaving the
			// latency register enters the unload register
			//
			FSM_STATE_MULT_A_B_CRUNCH,
			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
			FSM_STATE_MULT_Q_N_CRUNCH: begin
				//
				shreg_load		<= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
				shreg_latency	<= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
				shreg_unload	<= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
				//
				if (shreg_done_load)
					shreg_now_loading <= 1'b0;
				//
				// end of latency starts unloading and takes priority over
				// the end-of-unload condition
				//
				if (shreg_done_latency) begin
					shreg_now_latency		<= 1'b0;
					shreg_now_unloading	<= 1'b1;
				end else if (shreg_done_unload)
					shreg_now_unloading	<= 1'b0;
				//
			end
			//
			// any other state: no phase is active (the shift registers hold)
			//
			default: begin
				shreg_now_loading		<= 1'b0;
				shreg_now_latency		<= 1'b0;
				shreg_now_unloading	<= 1'b0;
			end
			//
		endcase
		//
	end
		
		
		
		
		
	//
	// Address counters for the AB, Q, QN, S and SN memories.
	//
	// The block is a sequence of independent case statements. ab_addr_ext and
	// q_addr are each assigned in two of them; when two statements match on the
	// same clock edge the later non-blocking assignment takes effect.
	//
	always @(posedge clk) begin
		//
		// keyed on the current state: *_START rewinds the counter(s) of the
		// running multiplication, *_RELOAD advances them by one step
		//
		case (fsm_state)
			FSM_STATE_MULT_A_B_START:				ab_addr_ext		<= bram_addr_ext_zero;
			FSM_STATE_MULT_AB_N_COEFF_START:		q_addr			<= bram_addr_zero;
			FSM_STATE_MULT_Q_N_START: begin		qn_addr_ext		<= bram_addr_ext_zero;
															ab_addr_ext		<= bram_addr_ext_zero;															
															end
			
			FSM_STATE_MULT_A_B_RELOAD:				ab_addr_ext		<= ab_addr_ext_next;
			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	q_addr			<= q_addr_next;
			FSM_STATE_MULT_Q_N_RELOAD: begin		qn_addr_ext		<= qn_addr_ext_next;
															ab_addr_ext		<= ab_addr_ext_next;
															end
		endcase
		//
		// s_addr / sn_addr: during Q*N they are rewound when qn_addr_ext equals
		// the zero-extended bram_addr_last or bram_addr_ext_last, and advanced
		// while qn_addr_ext lies strictly between the two; afterwards they are
		// advanced until their *_done flags are set
		//
		case (fsm_state)
		
			FSM_STATE_MULT_Q_N_RELOAD: begin
				if (qn_addr_ext == {1'b0, bram_addr_last}) begin
					s_addr		<= bram_addr_zero;
					sn_addr	<= bram_addr_zero;
				end
				
				if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
					s_addr <= s_addr_next;
					sn_addr <= sn_addr_next;
				end

				if (qn_addr_ext == bram_addr_ext_last) begin
					s_addr <= bram_addr_zero;
					sn_addr <= bram_addr_zero;
				end
			
			end
			
			FSM_STATE_MULT_Q_N_FINAL,
			FSM_STATE_SAVE_START,
			FSM_STATE_SAVE_WRITE: begin
				s_addr <= !s_addr_done ? s_addr_next : s_addr;
				sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
			end
			
		endcase
		
		//
		// keyed on the upcoming state: ab_addr_ext is rewound / advanced when the
		// AB * N_COEFF multiplication is about to enter START / RELOAD
		//
		case (fsm_next_state)
			FSM_STATE_MULT_AB_N_COEFF_START:		ab_addr_ext <= bram_addr_ext_zero;
			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	ab_addr_ext <= ab_addr_ext_next;
		endcase
		//
		// keyed on the upcoming state: q_addr is rewound / advanced (until
		// q_addr_done) when the Q * N multiplication is about to enter
		// START / RELOAD
		//
		case (fsm_next_state)
			FSM_STATE_MULT_Q_N_START:		q_addr <= bram_addr_zero;
			FSM_STATE_MULT_Q_N_RELOAD:		q_addr <= !q_addr_done ? q_addr_next : q_addr;
		endcase

		//
	end
		
	//
	// Write strobes and write data for the AB, Q, QN and R memories.
	//
	// Each output is first given its idle value (strobe low, data don't-care)
	// and then overridden when the matching state is active; the override is a
	// later non-blocking assignment in the same block, so it wins.
	//
	always @(posedge clk) begin
		//
		// idle values
		//
		ab_wren		<= 1'b0;
		ab_data_in	<= 32'hXXXXXXXX;
		q_wren		<= 1'b0;
		q_data_in	<= 32'hXXXXXXXX;
		qn_wren		<= 1'b0;
		qn_data_in	<= 32'hXXXXXXXX;
		r_wren		<= 1'b0;
		//
		// A * B: store the product word of PE 0 once the latency has elapsed
		//
		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
			ab_wren		<= shreg_done_latency_dly;
			ab_data_in	<= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
		end
		//
		// AB * N_COEFF
		//
		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
			q_wren		<= shreg_done_latency_dly;
			q_data_in	<= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
		end
		//
		// Q * N
		//
		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
			qn_wren		<= shreg_done_latency_dly;
			qn_data_in	<= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
		end
		//
		// result: write while saving, stop once r_addr_done is set
		//
		case (fsm_state)
			FSM_STATE_SAVE_START:	r_wren <= 1'b1;
			FSM_STATE_SAVE_WRITE:	r_wren <= ~r_addr_done;
		endcase
		//
	end
	
	
	//
	// Load counter: rewound when a pass is about to start or restart, advanced
	// while crunching until syst_cnt_load_done is set, then held.
	//
	always @(posedge clk) begin
		//
		case (fsm_next_state)
			//
			FSM_STATE_MULT_A_B_START,
			FSM_STATE_MULT_A_B_RELOAD,
			FSM_STATE_MULT_AB_N_COEFF_START,
			FSM_STATE_MULT_AB_N_COEFF_RELOAD,
			FSM_STATE_MULT_Q_N_START,
			FSM_STATE_MULT_Q_N_RELOAD: begin
				syst_cnt_load <= syst_cnt_zero;
			end
			//
			FSM_STATE_MULT_A_B_CRUNCH,
			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
			FSM_STATE_MULT_Q_N_CRUNCH: begin
				syst_cnt_load <= syst_cnt_load_done ? syst_cnt_load : syst_cnt_load_next;
			end
			//
		endcase
		//
	end

		
		
	//
	// Unload counter: rewound when the latency phase ends, then advanced while
	// unloading until syst_cnt_unload_done is set. Only active while crunching.
	//
	always @(posedge clk) begin
		//
		case (fsm_state)
			//
			FSM_STATE_MULT_A_B_CRUNCH,
			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
			FSM_STATE_MULT_Q_N_CRUNCH: begin
				//
				if (shreg_done_latency) begin
					syst_cnt_unload <= syst_cnt_zero;
				end else if (shreg_now_unloading) begin
					syst_cnt_unload <= syst_cnt_unload_done ? syst_cnt_unload : syst_cnt_unload_next;
				end
				//
			end
			//
		endcase
		//
	end
	
	//
	// Capture of PE outputs while the array is being unloaded.
	//
	// Both memories are addressed as mem[cycle][pe] here. c_out words are
	// stored as they are; p words are stored shifted down by one PE position,
	// and the topmost position of the previous cycle's row receives p of PE 0.
	// On the very first unload cycle there is no previous row, so the topmost
	// position of the last row is cleared instead.
	//
	always @(posedge clk) begin
		//
		case (fsm_state)
			//
			FSM_STATE_MULT_A_B_CRUNCH,
			FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
			FSM_STATE_MULT_Q_N_CRUNCH: begin
				//
				if (shreg_now_unloading) begin
					//
					// carries
					//
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						pe_c_out_mem[syst_cnt_unload][j] <= pe_c_out[j];
					//
					// products of PE 1 and above move down by one position
					//
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH-1; j=j+1)
						pe_t_mem[syst_cnt_unload][j] <= pe_p[j+1];
					//
					// topmost position
					//
					if (syst_cnt_unload > syst_cnt_zero)
						pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0];
					else
						pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0;
					//
				end
				//
			end
			//
		endcase
		//
	end

		
			//
			// PE input registers (T and C_IN can be moved to a separate code block)
			//
			// While a multiplication is crunching and the load phase is active,
			// every PE gets the same A word, its own B word from the loader and
			// the T / C_IN words stored for the current load cycle (zero on the
			// first pass). Outside the load phase the inputs are don't-care.
			//
	always @(posedge clk) begin
		//
		// A * B
		//
		if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
			//
			if (shreg_now_loading) begin
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) begin
					pe_a[j]		<= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
					pe_b[j]		<= loader_dout[j];
					pe_t[j]		<= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
					pe_c_in[j]	<= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
				end
			end else begin
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) begin
					pe_a[j]		<= 32'hXXXXXXXX;
					pe_b[j]		<= 32'hXXXXXXXX;
					pe_t[j]		<= 32'hXXXXXXXX;
					pe_c_in[j]	<= 32'hXXXXXXXX;
				end
			end
			//
		end
		//
		// AB * N_COEFF
		//
		if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
			//
			if (shreg_now_loading) begin
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) begin
					pe_a[j]		<= ab_data_out;
					pe_b[j]		<= loader_dout[j];
					pe_t[j]		<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
					pe_c_in[j]	<= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
				end
			end else begin
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) begin
					pe_a[j]		<= 32'hXXXXXXXX;
					pe_b[j]		<= 32'hXXXXXXXX;
					pe_t[j]		<= 32'hXXXXXXXX;
					pe_c_in[j]	<= 32'hXXXXXXXX;
				end
			end
			//
		end
		//
		// Q * N
		//
		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
			//
			if (shreg_now_loading) begin
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) begin
					pe_a[j]		<= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
					pe_b[j]		<= loader_dout[j];
					pe_t[j]		<= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
					pe_c_in[j]	<= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
				end
			end else begin
				for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) begin
					pe_a[j]		<= 32'hXXXXXXXX;
					pe_b[j]		<= 32'hXXXXXXXX;
					pe_t[j]		<= 32'hXXXXXXXX;
					pe_c_in[j]	<= 32'hXXXXXXXX;
				end
			end
			//
		end
		//
	end
		
		
		//
		// Adder
		//
		/*
		 * This adder is used to calculate S = AB + QN.
		 *
		 * It processes one 32-bit word per enabled clock; the carry out of one
		 * word is fed back as the carry into the next one, unless the mask
		 * flag is set.
		 *
		 */
	reg				add1_ce;					// clock enable
	reg	[31: 0]	add1_s;					// sum output
	wire				add1_c_in;				// carry input
	wire	[31: 0]	add1_a;					// A-input
	reg	[31: 0]	add1_b;					// B-input
	reg				add1_c_in_mask;		// flag to not carry anything into the very first word
	reg				add1_c_out;				// carry output
	
		/* add masking into carry feedback chain */
	assign add1_c_in = add1_c_out & ~add1_c_in_mask;

		/* mask carry for the very first word of N (disabled, the mask is driven by a separate always block) */
	//always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
	
	//
	// 32-bit add with carry in and carry out, registered; updates only when
	// add1_ce is set. The A operand is the word being written to QN.
	//
	assign add1_a = qn_data_in;

	always @(posedge clk) begin
		//
		if (add1_ce) begin
			{add1_c_out, add1_s} <= {1'b0, add1_a} + {1'b0, add1_b} + {32'd0, add1_c_in};
		end
		//
	end
	
	//
	// Adder control and B operand.
	//
	// Idle values are assigned first and overridden while Q * N is crunching:
	// the adder is enabled one clock after shreg_done_latency_dly, takes its
	// B operand from ab_data_out and has its carry input masked when
	// ab_addr_ext is zero.
	//
	always @(posedge clk) begin
		//
		add1_b			<= 32'hXXXXXXXX;
		add1_c_in_mask	<= 1'b0;
		add1_ce			<= 1'b0;
		//
		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
			add1_b			<= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
			add1_c_in_mask	<= shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero);
			add1_ce			<= shreg_done_latency_dly;
		end
		//
	end


	//
	// Write ports of the S and SN memories: the data inputs are the adder sum
	// and the subtractor difference, the write strobes are the respective
	// clock enables delayed by one clock.
	//
	// NOTE(review): sub1_d and sub1_ce are declared further down in this file
	// (in the subtractor section); some tools require declaration before use -
	// confirm that the target toolchain accepts this ordering.
	//
	assign s_data_in = add1_s;
	assign sn_data_in = sub1_d;
	
	always @(posedge clk) begin
		//
		s_wren <= add1_ce;
		sn_wren <= sub1_ce;
	end
		
		
		
		//
		// Subtractor
		//
		/*
		 * This subtractor is used to calculate SN = S - N.
		 *
		 * It processes one 32-bit word per enabled clock; the borrow out of one
		 * word is fed back as the borrow into the next one, unless the mask
		 * flag is set.
		 *
		 */
	reg				sub1_ce;					// clock enable
	reg	[31: 0]	sub1_d;					// difference output
	wire				sub1_b_in;				// borrow input
	wire	[31: 0]	sub1_a;					// A-input
	reg	[31: 0]	sub1_b;					// B-input
	reg				sub1_b_in_mask;		// flag to not borrow anything from the very first word
	reg				sub1_b_out;				// borrow output
	
		/* add masking into borrow feedback chain */
	assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
	
	//
	// 32-bit subtract with borrow in and borrow out, registered; updates only
	// when sub1_ce is set. The A operand is the adder's sum output.
	//
	assign sub1_a = add1_s;

	always @(posedge clk) begin
		//
		if (sub1_ce) begin
			{sub1_b_out, sub1_d} <= {1'b0, sub1_a} - {1'b0, sub1_b} - {32'd0, sub1_b_in};
		end
		//
	end
	
	//
	// Subtractor control and B operand.
	//
	// Idle values are assigned first and overridden while Q * N is crunching:
	// the subtractor follows the adder by one clock (add1_ce), takes its B
	// operand from n_bram_out, is only enabled while qn_addr_ext is above the
	// zero-extended q_addr, and has its borrow input masked when
	// qn_addr_ext - 1 equals the zero-extended bram_addr_last.
	//
	always @(posedge clk) begin
		//
		sub1_b			<= 32'hXXXXXXXX;
		sub1_b_in_mask	<= 1'b0;
		sub1_ce			<= 1'b0;
		//
		if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
			sub1_b			<= add1_ce ? n_bram_out : 32'hXXXXXXXX;
			sub1_b_in_mask	<= add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last});
			sub1_ce			<= add1_ce && (qn_addr_ext > {1'b0, q_addr});
		end
		//
	end


	//
	// s_data_in and s_wren are driven right after the adder, together with
	// sn_data_in and sn_wren. A second, identical continuous assignment and
	// always block used to be repeated at this point; they were removed because
	// a register may only be assigned from one always block in synthesizable
	// code, and the repeated continuous assignment was redundant.
	//
		
		

	//
	// Result selection flag, sampled in the final state of Q * N: set when the
	// subtractor ended with a borrow and the adder ended without a carry.
	//
	always @(posedge clk) begin
		//
		if (fsm_state == FSM_STATE_MULT_Q_N_FINAL) begin
			flag_select_s <= sub1_b_out & ~add1_c_out;
		end
		//
	end
		

	//
	// Result write data: while saving, forward S when flag_select_s is set and
	// SN otherwise; hold the previous value in all other states.
	//
	always @(posedge clk) begin
		//
		if ((fsm_state == FSM_STATE_SAVE_START) || (fsm_state == FSM_STATE_SAVE_WRITE)) begin
			r_data_in <= flag_select_s ? s_data_out : sn_data_out;
		end
		//
	end

		
			
		//
		// FSM Process
		//
		// State register with asynchronous, active-low reset to the idle state.
		//
	always @(posedge clk or negedge rst_n) begin
		//
		if (!rst_n)
			fsm_state <= FSM_STATE_IDLE;
		else
			fsm_state <= fsm_next_state;
		//
	end
	
	
		//
		// FSM Transition Logic
		//
		// Combinational next-state function. The sequence is: load B, load
		// N_COEFF, load N (each one START -> SHIFT/WRITE loop -> FINAL), then
		// the multiplications A*B, AB*N_COEFF and Q*N (each one START ->
		// CRUNCH/RELOAD loop -> FINAL), then saving the result, then STOP and
		// back to IDLE.
		//
	always @* begin
		//
		// default for any state not listed below
		//
		fsm_next_state = FSM_STATE_STOP;
		//
		case (fsm_state)

			// wait for the enable trigger
			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_LOAD_B_START;
												else							fsm_next_state = FSM_STATE_IDLE;
			//
			// load B: shift until mult_cnt_done, write, repeat until syst_cnt_init_done
			//
			FSM_STATE_LOAD_B_START:											fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
			FSM_STATE_LOAD_B_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_B_WRITE;
												else								fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
			FSM_STATE_LOAD_B_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_B_FINAL;
												else							fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
			FSM_STATE_LOAD_B_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
			//
			// load N_COEFF: same structure as loading B
			//
			FSM_STATE_LOAD_N_COEFF_START:											fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
			FSM_STATE_LOAD_N_COEFF_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
												else								fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
			FSM_STATE_LOAD_N_COEFF_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
												else							fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
			FSM_STATE_LOAD_N_COEFF_FINAL:										fsm_next_state = FSM_STATE_LOAD_N_START;
			//
			// load N: same structure as loading B
			//
			FSM_STATE_LOAD_N_START:											fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
			FSM_STATE_LOAD_N_SHIFT:		if (mult_cnt_done)			fsm_next_state = FSM_STATE_LOAD_N_WRITE;
												else								fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
			FSM_STATE_LOAD_N_WRITE:		if (syst_cnt_init_done)		fsm_next_state = FSM_STATE_LOAD_N_FINAL;
												else							fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
			FSM_STATE_LOAD_N_FINAL:										fsm_next_state = FSM_STATE_MULT_A_B_START;
			//
			// A * B: crunch until shreg_done_unload, reload, repeat until ab_addr_ext_done
			//
			FSM_STATE_MULT_A_B_START:									fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
			FSM_STATE_MULT_A_B_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
			FSM_STATE_MULT_A_B_RELOAD:	if (ab_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
												else							fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
			FSM_STATE_MULT_A_B_FINAL:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
			//
			// AB * N_COEFF: same structure, repeats until q_addr_done
			//
			FSM_STATE_MULT_AB_N_COEFF_START:									fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
			FSM_STATE_MULT_AB_N_COEFF_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
			FSM_STATE_MULT_AB_N_COEFF_RELOAD:	if (q_addr_done)	fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
															else							fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
			FSM_STATE_MULT_AB_N_COEFF_FINAL:									fsm_next_state = FSM_STATE_MULT_Q_N_START;
			//
			// Q * N: same structure, repeats until qn_addr_ext_done
			//
			FSM_STATE_MULT_Q_N_START:									fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
			FSM_STATE_MULT_Q_N_CRUNCH:	if (shreg_done_unload)	fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
			FSM_STATE_MULT_Q_N_RELOAD:	if (qn_addr_ext_done)	fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
															else							fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
			FSM_STATE_MULT_Q_N_FINAL:									fsm_next_state = FSM_STATE_SAVE_START;
			//
			// save the result: keep writing until r_addr_done
			//
			FSM_STATE_SAVE_START:										fsm_next_state = FSM_STATE_SAVE_WRITE;
			FSM_STATE_SAVE_WRITE:	if (r_addr_done)				fsm_next_state = FSM_STATE_SAVE_FINAL;
											else								fsm_next_state = FSM_STATE_SAVE_WRITE;
			FSM_STATE_SAVE_FINAL:										fsm_next_state = FSM_STATE_STOP;
			//
			// STOP lasts for one clock, then the FSM returns to IDLE
			//
			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;

		endcase
		//
	end


endmodule

//======================================================================
// End of file
//======================================================================