aboutsummaryrefslogblamecommitdiff
path: root/src/rtl/modexpa7_systolic_multiplier.v
blob: cb1c71667f22f8158df6954fd67ef36a963c7ecd (plain) (tree)

















































































































































































































































































































































































































                                                                                                                                                                                    
                                                         










                                                                                 
                                                        











                                                                                       
                                                         





























































































































































































                                                                                                                                                            
                                     









                                                     
                                          





















































































































































































































































                                                                                                                                                                                                                       
//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

module modexpa7_systolic_multiplier #
	(
			//
			// This sets the address widths of memory buffers. Internal data
			// width is 32 bits, so for e.g. 1024-bit operands buffers must store
			// 1024 / 32 = 32 words, and these need 5-bit address bus, because
			// 2 ** 5 = 32.
			//
		parameter	OPERAND_ADDR_WIDTH		= 5,
		
			//
			// This sets the size of the systolic array: each multiplier is
			// built from 2 ** SYSTOLIC_ARRAY_POWER processing elements, so
			// operands are handled in groups of that many 32-bit words. The
			// remaining OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER address bits
			// form the systolic cycle counter that selects the current group.
			//
		parameter	SYSTOLIC_ARRAY_POWER		= 3
	)
	(
			// clock (rising edge) and active-low asynchronous reset; the
			// reset only affects the FSM state and the ready flag
		input											clk,
		input											rst_n,

			// a rising edge on ena starts an operation while the core is
			// idle; rdy drops when the start is seen and returns high after
			// the result has been written out
		input											ena,
		output										rdy,

			// word addresses driven to the external operand/result memories
		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,

			// 32-bit read data returned by the operand memories
		input		[                32-1:0]	a_bram_out,
		input		[                32-1:0]	b_bram_out,
		input		[                32-1:0]	n_bram_out,
		input		[                32-1:0]	n_coeff_bram_out,

			// result word and its write strobe
		output	[                32-1:0]	r_bram_in,
		output										r_bram_wr,

			// sampled when an operation starts and then used as the index
			// of the last operand word (i.e. word count minus one)
		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
	);
	
	
		//
		// Constants
		//
		// SYSTOLIC_CNTR_WIDTH   - width of the systolic cycle counter
		// SYSTOLIC_ARRAY_LENGTH - number of processing elements per multiplier
		// SYSTOLIC_NUM_CYCLES   - number of word groups the buffers can hold
		//
	localparam	SYSTOLIC_CNTR_WIDTH		= OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
	localparam	SYSTOLIC_ARRAY_LENGTH	= 2 ** SYSTOLIC_ARRAY_POWER;
	localparam	SYSTOLIC_NUM_CYCLES		= 2 ** SYSTOLIC_CNTR_WIDTH;

		// Depth, in clock cycles, of the delay lines that follow data through
		// the processing elements. NOTE(review): presumably this has to match
		// the pipeline latency of modexpa7_systolic_pe -- confirm there.
	localparam	SYSTOLIC_PE_LATENCY		= 4;
	

		//
		// FSM Declaration
		//
		// INIT_*  - walk the operand addresses and fill the local buffers
		// PIPE_*  - CRUNCH runs one pass of the array, RELOAD advances the
		//           pass counters between passes
		// SAVE_*  - walk the result addresses and write the output words
		// STOP    - single cycle that re-asserts the ready flag
		//
	localparam	[ 3: 0]	FSM_STATE_IDLE					= 4'd0;
	localparam	[ 3: 0]	FSM_STATE_INIT_ZERO_ADDR	= 4'd1;
	localparam	[ 3: 0]	FSM_STATE_INIT_NEXT_ADDR	= 4'd2;
	localparam	[ 3: 0]	FSM_STATE_INIT_LAST_ADDR	= 4'd3;
	localparam	[ 3: 0]	FSM_STATE_PIPE_CRUNCH		= 4'd4;
	localparam	[ 3: 0]	FSM_STATE_PIPE_RELOAD		= 4'd5;
	localparam	[ 3: 0]	FSM_STATE_SAVE_ZERO_ADDR	= 4'd6;
	localparam	[ 3: 0]	FSM_STATE_SAVE_NEXT_ADDR	= 4'd7;
	localparam	[ 3: 0]	FSM_STATE_SAVE_LAST_ADDR	= 4'd8;
	localparam	[ 3: 0]	FSM_STATE_STOP					= 4'd9;
	
		// current state (registered) and combinationally decoded next state
	reg	[ 3: 0]	fsm_state = FSM_STATE_IDLE;
	reg	[ 3: 0]	fsm_next_state;

	
		//
		// Start Trigger (rising-edge detector on ena)
		//
	reg	ena_dly = 1'b0;		// ena as it was sampled on the previous clock edge

	always @(posedge clk)
		//
		ena_dly <= ena;

		// single-cycle pulse: ena is high now, but was still low one clock ago
	wire	ena_trig = ~ena_dly & ena;

		
		//
		// Parameter Capture
		//
		// The operand length is sampled on the clock edge that takes the FSM
		// into INIT_ZERO_ADDR, i.e. once at the start of every operation.
		//
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;

	wire										n_num_words_latch_ce =
		(fsm_next_state == FSM_STATE_INIT_ZERO_ADDR);

	always @(posedge clk) begin
		//
		if (n_num_words_latch_ce) n_num_words_latch <= n_num_words;
		//
	end


		//
		// Address Bounds (first and last word of an operand)
		//
	localparam	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};

	wire			[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last;
	assign bram_addr_last = n_num_words_latch;
	
	
		//
		// BRAM Addresses
		//
		// Registers behind the address output ports, plus the internal "s"
		// address that is used to read the result buffers back.
		//
	reg	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_reg;
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_reg;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_reg;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_reg;
	reg	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr_reg;
	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_reg;

	assign a_bram_addr       = a_bram_addr_reg;
	assign b_bram_addr       = b_bram_addr_reg;
	assign n_bram_addr       = n_bram_addr_reg;
	assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
	assign r_bram_addr       = r_bram_addr_reg;

	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr = s_bram_addr_reg;

		// incremented versions of every address
	wire	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_next       = a_bram_addr + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_next       = b_bram_addr + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_next       = n_bram_addr + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_next       = s_bram_addr + 1'b1;

		// end-of-operand flags for the load and the save address walks
	wire										b_bram_addr_done = (b_bram_addr == bram_addr_last);
	wire										s_bram_addr_done = (s_bram_addr == bram_addr_last);

		// addresses delayed by one clock
	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_dly;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_dly;
	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_dly;
	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_dly;

	always @(posedge clk) begin
		//
		b_bram_addr_dly       <= b_bram_addr;
		n_coeff_bram_addr_dly <= n_coeff_bram_addr;
		n_bram_addr_dly       <= n_bram_addr;
		s_bram_addr_dly       <= s_bram_addr;
		//
	end

		// B and N_COEFF are walked in lock-step while the buffers are loaded
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) begin
			b_bram_addr_reg       <= bram_addr_zero;
			n_coeff_bram_addr_reg <= bram_addr_zero;
		end else if (fsm_next_state == FSM_STATE_INIT_NEXT_ADDR) begin
			b_bram_addr_reg       <= b_bram_addr_next;
			n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
		end

		// the result buffers are walked while the output is being saved
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_SAVE_ZERO_ADDR)
			s_bram_addr_reg <= bram_addr_zero;
		else if (fsm_next_state == FSM_STATE_SAVE_NEXT_ADDR)
			s_bram_addr_reg <= s_bram_addr_next;

		// A moves on by one word per reload and stops at the last word
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_INIT_LAST_ADDR)
			a_bram_addr_reg <= bram_addr_zero;
		else if (fsm_next_state == FSM_STATE_PIPE_RELOAD)
			a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;


		
		
		//
		// Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
		//
		// Two one-hot shift registers of SYSTOLIC_PE_LATENCY+1 bits. Each is
		// loaded with bit 0 set and rotated left by one position per step,
		// so its top bit ("done") becomes set SYSTOLIC_PE_LATENCY steps
		// later. The "lsb" register is handled below; the "msb" register is
		// updated by a separate process further down in this module.
		//
	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};

	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb;
	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb;
	
		// rotate-left-by-one versions of both registers
	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb_next =
		{pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};

	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb_next =
		{pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};

		// the marker bit has reached the top position
	wire										pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
	wire										pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];

		// The lsb register is only touched on clock edges that lead into
		// PIPE_CRUNCH: it is reloaded when a new pass begins (coming from
		// INIT_LAST_ADDR or PIPE_RELOAD) and then advanced on every cycle of
		// the pass until the done bit is set, after which it is held.
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR,
				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_lsb <= pe_latency_start;
				FSM_STATE_PIPE_CRUNCH:		pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
														pe_latency_ab_lsb : pe_latency_ab_lsb_next;
			endcase

		//
		// Buffers
		//
		// Local copies of the B, N_COEFF and N operands, arranged as
		// [word group][processing element]: the upper address bits select
		// the group and the lower SYSTOLIC_ARRAY_POWER bits select the
		// element. All three are cleared in INIT_ZERO_ADDR and then filled
		// word by word in INIT_NEXT_ADDR / INIT_LAST_ADDR. The write index is
		// the address delayed by one clock; NOTE(review): presumably this
		// matches a one-cycle read latency of the external memories --
		// confirm against the BRAM model.
		//
		// i and j are loop indices shared by the for-loops in this module.
		//
	integer i, j;

	reg	[31: 0]	b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
	reg	[31: 0]	n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
	reg	[31: 0]	n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
	
		// B operand buffer
	always @(posedge clk)
		//
		case (fsm_state)
			FSM_STATE_INIT_ZERO_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						b_buf[i][j] <= 32'd0;

			FSM_STATE_INIT_NEXT_ADDR,
			FSM_STATE_INIT_LAST_ADDR:
				b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
		endcase

		// N_COEFF operand buffer
	always @(posedge clk)
		//
		case (fsm_state)
			FSM_STATE_INIT_ZERO_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						n_coeff_buf[i][j] <= 32'd0;

			FSM_STATE_INIT_NEXT_ADDR,
			FSM_STATE_INIT_LAST_ADDR:
				n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
		endcase

		// N operand buffer
	always @(posedge clk)
		//
		case (fsm_state)
			FSM_STATE_INIT_ZERO_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						n_buf[i][j] <= 32'd0;

			FSM_STATE_INIT_NEXT_ADDR,
			FSM_STATE_INIT_LAST_ADDR:
				n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
		endcase

		
	
		
	
	
		//
		// Cycle Counters
		//
		// mult_cnt_* count the passes of the individual pipeline stages
		// (AB, Q, QN and the final add/subtract stage S). They are one bit
		// wider than an operand address because they run up to
		// 2 * n_num_words + 1. syst_cnt selects the word group that is fed
		// into the array during a pass, and syst_cnt_dly is a copy of it
		// delayed by SYSTOLIC_PE_LATENCY clocks.
		//
	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab;
	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q;
	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn;
	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s;
	
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt;
	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];
	
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
	
		// All limits are derived from the latched operand length, not from
		// the live n_num_words input, so that every bound used during an
		// operation refers to the same value even if the input changes
		// while the core is busy.
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_half = {1'b0, n_num_words_latch};
	
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_last = {n_num_words_latch, 1'b1};
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

		// saturation flags
	wire										mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
	wire										mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0;
	wire										mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
	wire										mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0;
	
	wire										syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;

		// incremented values
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab_next = mult_cnt_ab + 1'b1;
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q_next = mult_cnt_q + 1'b1;
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn_next = mult_cnt_qn + 1'b1;
	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s_next = mult_cnt_s + 1'b1;
	
	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;

	
		// Word group counter: cleared when a pass starts (PIPE_CRUNCH is
		// entered from INIT_LAST_ADDR or PIPE_RELOAD), then incremented on
		// every cycle of the pass until it reaches syst_cnt_last, where it
		// is held.
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR,
				FSM_STATE_PIPE_RELOAD:		syst_cnt <= syst_cnt_zero;
				FSM_STATE_PIPE_CRUNCH:		syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
			endcase

		// AB pass counter: cleared before the first pass, incremented at
		// every reload and saturated at mult_cnt_last.
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_ab <= mult_cnt_zero;
				FSM_STATE_PIPE_RELOAD:		mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
			endcase

		// Q pass counter: same as above, but only starts to count once
		// mult_cnt_ab is non-zero, so it trails the AB counter by one pass.
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_q <= mult_cnt_zero;
				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
			endcase

		// QN pass counter: starts to count once mult_cnt_q is non-zero, so
		// it trails the Q counter by one pass.
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_qn <= mult_cnt_zero;
				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
			endcase
		
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_s <= mult_cnt_zero;
				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_qn : mult_cnt_s_next;
			endcase
		
		
	always @(posedge clk) begin
		syst_cnt_dly[0] <= syst_cnt;
		for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
			syst_cnt_dly[i] <= syst_cnt_dly[i-1];
	end
	
		//
		// Systolic Array
		//
		// Three rows of SYSTOLIC_ARRAY_LENGTH processing elements each. All
		// elements of a row share one "a" operand, while "b", "t" and "c_in"
		// are taken per element from two-dimensional register arrays that
		// are indexed by the current word group (syst_cnt).
		// NOTE(review): going by the signal names, the rows presumably
		// compute A*B, Q = AB*N_COEFF and Q*N of a Montgomery multiplication
		// -- confirm against the algorithm description.
		//
		
		// product and carry outputs of the three rows
	wire	[31: 0]	mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
	wire	[31: 0]	mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];

	wire	[31: 0]	mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
	wire	[31: 0]	mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];

	wire	[31: 0]	mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
	wire	[31: 0]	mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
	
		// shared "a" operands: the AB row takes words of A straight from
		// memory and is fed zeroes once mult_cnt_ab exceeds mult_cnt_half
	wire	[31: 0]	mul_ab_a	= (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
	reg	[31: 0]	mul_q_a_int;
	reg	[31: 0]	mul_q_a;
	reg	[31: 0]	mul_qn_a_int;
	reg	[31: 0]	mul_qn_a;
	
		// per-element "t" and carry-in storage of the three rows
	reg	[31: 0]	t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
	reg	[31: 0]	c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];

	reg	[31: 0]	t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
	reg	[31: 0]	c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];

	reg	[31: 0]	t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
	reg	[31: 0]	c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];

		// one processing element per row for every array position
	genvar syst;
	generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
		begin : gen_mul
	
				// AB row, multiplies by words of the B buffer
			modexpa7_systolic_pe mul_ab_inst
			(
				.clk		(clk),
				.a			(mul_ab_a),
				.b			(b_buf[syst_cnt][syst]),
				.t			(t_ab[syst_cnt][syst]),
				.c_in		(c_ab_in[syst_cnt][syst]),
				
				.p			(mul_ab_p[syst]),
				.c_out	(mul_ab_c_out[syst])
			);
			
				// Q row, multiplies by words of the N_COEFF buffer
			modexpa7_systolic_pe mul_q_inst
			(
				.clk		(clk),
				.a			(mul_q_a),
				.b			(n_coeff_buf[syst_cnt][syst]),
				.t			(t_q[syst_cnt][syst]),
				.c_in		(c_q_in[syst_cnt][syst]),
				
				.p			(mul_q_p[syst]),
				.c_out	(mul_q_c_out[syst])
			);
			

				// QN row, multiplies by words of the N buffer
			modexpa7_systolic_pe mul_qn_inst
			(
				.clk		(clk),
				.a			(mul_qn_a),
				.b			(n_buf[syst_cnt][syst]),
				.t			(t_qn[syst_cnt][syst]),
				.c_in		(c_qn_in[syst_cnt][syst]),
				
				.p			(mul_qn_p[syst]),
				.c_out	(mul_qn_c_out[syst])
			);
			
		end
	endgenerate
	
		//
		// c_ab
		//
		// Carry storage of the AB row: cleared in INIT_LAST_ADDR. During a
		// pass, once the latency marker has reached its done position, the
		// carry outputs of all elements are written back into the word
		// group selected by the delayed group counter.
		//
	always @(posedge clk)
		//
		case (fsm_state)
			
			FSM_STATE_INIT_LAST_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						c_ab_in[i][j] <= 32'd0;
						
			FSM_STATE_PIPE_CRUNCH:
				if (pe_latency_ab_lsb_done)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
		endcase
	
		//
		// c_q
		//
		// Same as c_ab for the Q row, but write-back only happens once
		// mult_cnt_ab is non-zero, i.e. not during the very first pass.
		//
	always @(posedge clk)
		//
		case (fsm_state)
			
			FSM_STATE_INIT_LAST_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						c_q_in[i][j] <= 32'd0;
						
			FSM_STATE_PIPE_CRUNCH:
				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
		endcase

		//
		// c_qn
		//
		// Same as c_ab for the QN row, but write-back only happens once
		// mult_cnt_q is non-zero.
		//
	always @(posedge clk)
		//
		case (fsm_state)
			
			FSM_STATE_INIT_LAST_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						c_qn_in[i][j] <= 32'd0;
						
			FSM_STATE_PIPE_CRUNCH:
				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
		endcase
		
		//
		// t_ab
		//
		// "t" storage of the AB row: cleared in INIT_LAST_ADDR. During a
		// pass the products are written back shifted down by one element
		// position: output j goes to position j-1 of the same word group,
		// and output 0 goes to the last position of the previous group
		// (for group 0 there is no previous group, so it is not stored
		// here).
		//
	always @(posedge clk)
		//
		case (fsm_state)
		
			FSM_STATE_INIT_LAST_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						t_ab[i][j] <= 32'd0;
						
			FSM_STATE_PIPE_CRUNCH:
				if (pe_latency_ab_lsb_done) begin
					if (syst_cnt_latency > syst_cnt_zero)
						t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
				end
				
		endcase


		//
		// t_q
		//
		// Same as t_ab for the Q row, enabled once mult_cnt_ab is non-zero.
		//
	always @(posedge clk)
		//
		case (fsm_state)
		
			FSM_STATE_INIT_LAST_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						t_q[i][j] <= 32'd0;
						
			FSM_STATE_PIPE_CRUNCH:
				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
					if (syst_cnt_latency > syst_cnt_zero)
						t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
				end
				
		endcase


		//
		// t_qn
		//
		// Same as t_ab for the QN row, enabled once mult_cnt_q is non-zero.
		//
	always @(posedge clk)
		//
		case (fsm_state)
		
			FSM_STATE_INIT_LAST_ADDR:
				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						t_qn[i][j] <= 32'd0;
						
			FSM_STATE_PIPE_CRUNCH:
				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
					if (syst_cnt_latency > syst_cnt_zero)
						t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
						t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
				end
				
		endcase

		//
		// Latency 2
		//
		// Second one-hot latency marker. It is reloaded at the start of
		// every pass, but only advances while the word group counter sits
		// at its last value (syst_cnt_done), and is held once its top bit
		// is set. That top bit is what the FSM checks before it leaves
		// PIPE_CRUNCH.
		//
	always @(posedge clk)
		//
		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
			//
			case (fsm_state)
				FSM_STATE_INIT_LAST_ADDR,
				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_msb <= pe_latency_start;
				FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)
					pe_latency_ab_msb <= pe_latency_ab_msb_done ?
														pe_latency_ab_msb : pe_latency_ab_msb_next;
			endcase


		//
		// Adder
		//
		// A 32-bit adder followed by a 32-bit subtractor, both processing
		// one word per pass. The adder sums a delayed copy of the lowest
		// AB output word and the lowest QN output word; the subtractor
		// takes the adder result and subtracts either zero or a word of N.
		// NOTE(review): this presumably forms S = AB + QN and SN = S - N of
		// a Montgomery reduction -- confirm against the algorithm notes.
		//
		
		// adder: clock enable, operand delay line (a0 -> a1 -> a2), second
		// operand, carry in/out and sum
	reg				pe_add_ce;
	reg	[31: 0]	pe_add_a0;
	reg	[31: 0]	pe_add_a1;
	reg	[31: 0]	pe_add_a2;
	reg	[31: 0]	pe_add_b0;

	reg				pe_add_c_in;
	wire	[31: 0]	pe_add_s;
	wire				pe_add_c_out;

		// subtractor: clock enable, operands, borrow in/out and difference
	reg				pe_sub_ce;
	reg	[31: 0]	pe_sub_a0;
	reg	[31: 0]	pe_sub_b0;

	reg				pe_sub_b_in;
	wire	[31: 0]	pe_sub_d;
	wire				pe_sub_b_out;
	
		// The adder is enabled one clock after the first word group of a
		// pass leaves the array, starting once mult_cnt_q is non-zero and
		// until mult_cnt_s has reached its final value.
	always @(posedge clk)
		pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;

		// The subtractor is enabled the same way once mult_cnt_qn is
		// non-zero.
	always @(posedge clk)
		pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);

		// carry chain of the adder: starts from zero while mult_cnt_qn is
		// still zero, afterwards the previous carry-out is fed back in
	always @(posedge clk)
		//
		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
			pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;

		// borrow chain of the subtractor: starts from zero while mult_cnt_s
		// is still zero, afterwards the previous borrow-out is fed back in
	always @(posedge clk)
		//
		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
			pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
	
	
	modexpa7_adder32 pe_add_inst
	(
		.clk		(clk),
		.ce		(pe_add_ce),
		.a			(pe_add_a2),
		.b			(pe_add_b0),
		.c_in		(pe_add_c_in),
		.s			(pe_add_s),
		.c_out	(pe_add_c_out)
	);

	modexpa7_subtractor32 pe_sub_inst
	(
		.clk		(clk),
		.ce		(pe_sub_ce),
		.a			(pe_sub_a0),
		.b			(pe_sub_b0),
		.b_in		(pe_sub_b_in),
		.d			(pe_sub_d),
		.b_out	(pe_sub_b_out)
	);
	
		// Operand capture. All registers below are updated in PIPE_CRUNCH
		// while the latency marker is done and the delayed group counter is
		// zero. The AB word passes through a three-stage delay line before
		// it reaches the adder.
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
			pe_add_a0 <= mul_ab_p[0];
			pe_add_a1 <= pe_add_a0;
			pe_add_a2 <= pe_add_a1;
		end

		// first subtractor operand is the adder result
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			pe_sub_a0 <= pe_add_s;

		// second adder operand is the lowest word of the QN row
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			pe_add_b0 <= mul_qn_p[0];
	
		// second subtractor operand is zero until mult_cnt_s has passed
		// mult_cnt_half, afterwards it is the word of N read from memory
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
	
	
		// N address: walked from zero while the buffers are loaded; during
		// reloads it is rewound to zero when mult_cnt_s equals
		// mult_cnt_half and incremented on every reload after that, so that
		// the subtractor receives consecutive words of N.
	always @(posedge clk)
		//
		case (fsm_next_state)
			FSM_STATE_INIT_ZERO_ADDR:	n_bram_addr_reg <= bram_addr_zero;
			FSM_STATE_INIT_NEXT_ADDR:	n_bram_addr_reg <= n_bram_addr_next;
			FSM_STATE_PIPE_RELOAD: begin
				if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
				if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
			end
		endcase
		
		
		//
		// Ready Flag
		//
		// High after reset and while idle, cleared in the cycle a start
		// trigger is seen in IDLE, set again when the FSM passes through
		// STOP. In all other states the flag keeps its value.
		//
	reg rdy_reg = 1'b1;

	assign rdy = rdy_reg;

	always @(posedge clk or negedge rst_n)
		//
		if (!rst_n)
			rdy_reg <= 1'b1;
		else
			case (fsm_state)
				FSM_STATE_IDLE:	rdy_reg <= ~ena_trig;
				FSM_STATE_STOP:	rdy_reg <= 1'b1;
			endcase
	

		//
		// Shared "a" operands of the Q and QN rows
		//
		// The lowest output word of the preceding row is captured during a
		// pass (latency marker done, delayed group counter at zero) into an
		// intermediate register, and handed over to the row's operand
		// register in PIPE_RELOAD, so the operand is stable during the
		// whole following pass.
		//
	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			mul_q_a_int <= mul_ab_p[0];

	always @(posedge clk)
		//
		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			mul_qn_a_int <= mul_q_p[0];

	always @(posedge clk)
		//
		if (fsm_state == FSM_STATE_PIPE_RELOAD)
			mul_q_a <= mul_q_a_int;	// TODO: Add masking! Maybe not needed after all?..

		// the QN operand is forced to zero once mult_cnt_qn has reached
		// mult_cnt_half
	always @(posedge clk)
		//
		if (fsm_state == FSM_STATE_PIPE_RELOAD)
			mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
	
		//
		// Debug
		//
	//always @(posedge clk) begin
		//
		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			//$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
		//
		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
			//$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
		//
		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
			//$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
		//
		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
			//$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
		//
	//end
		
		
		//
		// Result buffers: "s" stores adder output words, "sn" stores
		// subtractor output words. Both are written through port A during
		// the computation and read back through port B, at the shared
		// s_bram_addr, while the result is saved.
		// NOTE(review): bram_1rw_1ro_readfirst is defined outside this file;
		// port roles are taken from its port names -- confirm there.
		//
	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_rd;
	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
	reg										s_bram_en;
	
	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_rd;
	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr;
	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
	reg										sn_bram_en;
	
	assign s_bram_addr_rd = s_bram_addr;
	assign sn_bram_addr_rd = s_bram_addr;
	
	wire	[31: 0]	s_bram_din;
	wire	[31: 0]	s_bram_dout;
	
	wire	[31: 0]	sn_bram_din;
	wire	[31: 0]	sn_bram_dout;
	
	assign s_bram_din = pe_add_s;
	assign sn_bram_din = pe_sub_d;
	
		// write enables: one clock after the adder/subtractor enable, and
		// only once the corresponding pass counter has passed mult_cnt_half
	always @(posedge clk)
		//
		s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);

	always @(posedge clk)
		//
		sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
	
		// write addresses: rewound to zero when the pass counter equals
		// mult_cnt_half, then incremented after every write until the last
		// word is reached; if both conditions hold in the same cycle, the
		// second assignment (the increment) takes effect
	always @(posedge clk) begin
		//
		if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
		if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
	end

	always @(posedge clk) begin
		//
		if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
		if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
	end
	
	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_s (.clk(clk),
		.a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
		.b_addr(s_bram_addr_rd), .b_out(s_bram_dout));

	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
	bram_sn (.clk(clk),
		.a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
		.b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
		
		
	reg	r_bram_en;
	
	always @(posedge clk)
		//
		case (fsm_state)
			FSM_STATE_SAVE_ZERO_ADDR,
			FSM_STATE_SAVE_NEXT_ADDR:	r_bram_en <= 1'b1;
			default:							r_bram_en <= 1'b0;
			
		endcase
		
		
		
	reg	r_bram_wr_reg;
	
	assign r_bram_wr = r_bram_wr_reg;
	
	always @(posedge clk)
		//
		r_bram_wr_reg <= r_bram_en;
		
		
	wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;
		
		
	reg	[31: 0]	r_bram_in_reg;
	
	assign r_bram_in = r_bram_in_reg;

		always @(posedge clk)
			//
			if (r_bram_en)
				r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
	
	always @(posedge clk)
		//
		if (r_bram_en)
			r_bram_addr_reg <= s_bram_addr_dly;
	
	
		//
		// FSM Transition Logic
		//
		// State register with active-low asynchronous reset, followed by
		// the combinational next-state decoder. Any state value that has no
		// arm of its own falls through to STOP.
		//
	always @(posedge clk or negedge rst_n)
		//
		if (!rst_n)
			fsm_state <= FSM_STATE_IDLE;
		else
			fsm_state <= fsm_next_state;

	always @* begin
		//
		case (fsm_state)

				// wait for a start trigger
			FSM_STATE_IDLE: begin
				if (ena_trig)	fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
				else				fsm_next_state = FSM_STATE_IDLE;
			end

				// load operands until the last word address is reached
			FSM_STATE_INIT_ZERO_ADDR: begin
				fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
			end

			FSM_STATE_INIT_NEXT_ADDR: begin
				if (b_bram_addr_done)	fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
				else							fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
			end

			FSM_STATE_INIT_LAST_ADDR: begin
				fsm_next_state = FSM_STATE_PIPE_CRUNCH;
			end

				// stay in a pass until the last word group has been issued
				// and the second latency marker is done
			FSM_STATE_PIPE_CRUNCH: begin
				if (syst_cnt_done)
					fsm_next_state = pe_latency_ab_msb_done ?
						FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
				else
					fsm_next_state = FSM_STATE_PIPE_CRUNCH;
			end

				// either start the next pass or move on to saving
			FSM_STATE_PIPE_RELOAD: begin
				if (mult_cnt_s_done)	fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
				else						fsm_next_state = FSM_STATE_PIPE_CRUNCH;
			end

				// write the result until the last word address is reached
			FSM_STATE_SAVE_ZERO_ADDR: begin
				fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
			end

			FSM_STATE_SAVE_NEXT_ADDR: begin
				if (s_bram_addr_done)	fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
				else							fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
			end

			FSM_STATE_SAVE_LAST_ADDR: begin
				fsm_next_state = FSM_STATE_STOP;
			end

			FSM_STATE_STOP: begin
				fsm_next_state = FSM_STATE_IDLE;
			end

			default: begin
				fsm_next_state = FSM_STATE_STOP;
			end

		endcase
		//
	end


endmodule

//======================================================================
// End of file
//======================================================================