From 0b873507ad47e3046935dfc8b3f91d36bc21c7b0 Mon Sep 17 00:00:00 2001
From: "Pavel V. Shatov (Meister)" <meisterpaul1@yandex.ru>
Date: Tue, 27 Jun 2017 13:44:08 +0300
Subject: Added systolic modular multiplier w/ testbench.  * works in simulator
  * may have to change how internal operand buffer is pre-loaded    (shift
 register instead of wide mux?)  * code needs some cleanup

---
 src/rtl/modexpa7_systolic_multiplier.v | 876 +++++++++++++++++++++++++++++++++
 src/rtl/util/bram_1rw_1ro_readfirst.v  |  88 ++++
 src/rtl/util/bram_1rw_readfirst.v      |  75 +++
 3 files changed, 1039 insertions(+)
 create mode 100644 src/rtl/modexpa7_systolic_multiplier.v
 create mode 100644 src/rtl/util/bram_1rw_1ro_readfirst.v
 create mode 100644 src/rtl/util/bram_1rw_readfirst.v

(limited to 'src/rtl')

diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
new file mode 100644
index 0000000..0849b61
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -0,0 +1,876 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier #
+	(
+			//
+			// This sets the address widths of memory buffers. Internal data
+			// width is 32 bits, so for e.g. 1024-bit operands buffers must store
+			// 1024 / 32 = 32 words, and these need 5-bit address bus, because
+			// 2 ** 5 = 32.
+			//
+		parameter	OPERAND_ADDR_WIDTH		= 5,
+		
+			//
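+			// This sets the size of the systolic array: 2 ** SYSTOLIC_ARRAY_POWER multiplier PEs are instantiated, and the systolic cycle counter is OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER bits wide.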
+			//
+		parameter	SYSTOLIC_ARRAY_POWER		= 3
+	)
+	(
+		input											clk,
+		input											rst_n,
+
+		input											ena,
+		output										rdy,
+
+		output	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr,
+		output	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr,
+
+		input		[                32-1:0]	a_bram_out,
+		input		[                32-1:0]	b_bram_out,
+		input		[                32-1:0]	n_bram_out,
+		input		[                32-1:0]	n_coeff_bram_out,
+
+		output	[                32-1:0]	r_bram_in,
+		output										r_bram_wr,
+
+		input		[OPERAND_ADDR_WIDTH-1:0]	n_num_words
+	);
+	
+	
+		//
+		// Constants
+		//
+	localparam	SYSTOLIC_CNTR_WIDTH		= OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
+	localparam	SYSTOLIC_ARRAY_LENGTH	= 2 ** SYSTOLIC_ARRAY_POWER;
+	localparam	SYSTOLIC_NUM_CYCLES		= 2 ** SYSTOLIC_CNTR_WIDTH;
+
+	localparam	SYSTOLIC_PE_LATENCY		= 4;
+	
+
+		//
+		// FSM Declaration
+		//
+	localparam	[ 3: 0]	FSM_STATE_IDLE					= 4'd0;
+	localparam	[ 3: 0]	FSM_STATE_INIT_ZERO_ADDR	= 4'd1;
+	localparam	[ 3: 0]	FSM_STATE_INIT_NEXT_ADDR	= 4'd2;
+	localparam	[ 3: 0]	FSM_STATE_INIT_LAST_ADDR	= 4'd3;
+	localparam	[ 3: 0]	FSM_STATE_PIPE_CRUNCH		= 4'd4;
+	localparam	[ 3: 0]	FSM_STATE_PIPE_RELOAD		= 4'd5;
+	localparam	[ 3: 0]	FSM_STATE_SAVE_ZERO_ADDR	= 4'd6;
+	localparam	[ 3: 0]	FSM_STATE_SAVE_NEXT_ADDR	= 4'd7;
+	localparam	[ 3: 0]	FSM_STATE_SAVE_LAST_ADDR	= 4'd8;
+	localparam	[ 3: 0]	FSM_STATE_STOP					= 4'd9;
+	
+	reg	[ 3: 0]	fsm_state = FSM_STATE_IDLE;
+	reg	[ 3: 0]	fsm_next_state;
+
+	
+		//
+		// Enable Delay (Trigger)
+		//
+   reg ena_dly = 1'b0;
+   wire ena_trig = ena && !ena_dly;
+   always @(posedge clk) ena_dly <= ena;		
+
+		
+		//
+		// Parameters Latch
+		//
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_num_words_latch;
+
+	always @(posedge clk)
+		// Capture n_num_words on the clock edge that takes the FSM into INIT_ZERO_ADDR.
+		if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR)
+			n_num_words_latch <= n_num_words;
+
+
+		//
+		// Addresses
+		//
+	localparam	[OPERAND_ADDR_WIDTH-1:0]	bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
+	wire			[OPERAND_ADDR_WIDTH-1:0]	bram_addr_last = n_num_words_latch;
+	
+	
+		//
+		// BRAM Addresses
+		//
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_reg;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	r_bram_addr_reg;
+
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr = s_bram_addr_reg;
+	
+	reg	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_dly;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_dly;
+	
+	wire	[OPERAND_ADDR_WIDTH-1:0]	b_bram_addr_next       = b_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	a_bram_addr_next       = a_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	n_bram_addr_next       = n_bram_addr + 1'b1;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_next       = s_bram_addr + 1'b1;
+	
+	wire										b_bram_addr_done = 
+		(b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+
+	wire										s_bram_addr_done = 
+		(s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+	
+	assign b_bram_addr = b_bram_addr_reg;
+	assign a_bram_addr = a_bram_addr_reg;
+	assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
+	assign n_bram_addr = n_bram_addr_reg;
+	assign r_bram_addr = r_bram_addr_reg;
+
+	always @(posedge clk) b_bram_addr_dly <= b_bram_addr;
+	always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr;
+	always @(posedge clk) n_bram_addr_dly <= n_bram_addr;
+	always @(posedge clk) s_bram_addr_dly <= s_bram_addr;
+		
+	always @(posedge clk)
+		// B read address: cleared on entry to INIT_ZERO_ADDR, incremented while the FSM is heading into INIT_NEXT_ADDR.
+		case (fsm_next_state)
+			FSM_STATE_INIT_ZERO_ADDR:	b_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_INIT_NEXT_ADDR:	b_bram_addr_reg <= b_bram_addr_next;
+		endcase
+
+	always @(posedge clk)	// S read address: same zero/increment scheme, driven by the SAVE_* states
+		case (fsm_next_state)
+			FSM_STATE_SAVE_ZERO_ADDR:	s_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_SAVE_NEXT_ADDR:	s_bram_addr_reg <= s_bram_addr_next;
+		endcase
+
+	always @(posedge clk)
+		// A read address: cleared before the first pass, then advanced on each RELOAD and held once it equals bram_addr_last.
+		case (fsm_next_state)
+			FSM_STATE_INIT_LAST_ADDR:	a_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_PIPE_RELOAD:		a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
+		endcase
+
+	always @(posedge clk)
+		// N_COEFF read address: follows the same INIT_ZERO_ADDR / INIT_NEXT_ADDR schedule as the B address.
+		case (fsm_next_state)
+			FSM_STATE_INIT_ZERO_ADDR:	n_coeff_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_INIT_NEXT_ADDR:	n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
+		endcase
+
+
+		
+		
+		//
+		// Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
+		//
+	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
+
+	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb;
+	reg	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb;
+	
+	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_lsb_next =
+		{pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
+
+	wire	[SYSTOLIC_PE_LATENCY:0]		pe_latency_ab_msb_next =
+		{pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};
+
+	wire										pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
+	wire										pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];
+
+	always @(posedge clk)
+		// One-hot delay line: loaded with only bit 0 set when a pass starts (INIT_LAST_ADDR or RELOAD -> CRUNCH),
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// then rotated left each CRUNCH cycle until the top bit (pe_latency_ab_lsb_done) is set, after which it holds.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR,
+				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_lsb <= pe_latency_start;
+				FSM_STATE_PIPE_CRUNCH:		pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
+														pe_latency_ab_lsb : pe_latency_ab_lsb_next;
+			endcase
+
+		//
+		// Buffers
+		//
+	integer i, j;
+
+	reg	[31: 0]	b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT_ZERO_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						b_buf[i][j] <= 32'd0;
+
+			FSM_STATE_INIT_NEXT_ADDR,
+			FSM_STATE_INIT_LAST_ADDR:
+				b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
+		endcase
+
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT_ZERO_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						n_coeff_buf[i][j] <= 32'd0;
+
+			FSM_STATE_INIT_NEXT_ADDR,
+			FSM_STATE_INIT_LAST_ADDR:
+				n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
+		endcase
+
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_INIT_ZERO_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						n_buf[i][j] <= 32'd0;
+
+			FSM_STATE_INIT_NEXT_ADDR,
+			FSM_STATE_INIT_LAST_ADDR:
+				n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
+		endcase
+
+		
+	
+		
+	
+	
+		//
+		// Cycle Counters
+		//
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab;
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q;
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn;
+	reg	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s;
+	
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt;
+	reg	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];
+	
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+		// All limits derive from the latched word count, so they stay fixed even if the n_num_words input changes mid-run.
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_half = {1'b0, n_num_words_latch};
+		// mult_cnt_last = 2 * n_num_words_latch + 1; syst_cnt_last is the upper address bits of the latched count.
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_last = {n_num_words_latch, 1'b1};
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+	wire										mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
+	wire										mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0;
+	wire										mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
+	wire										mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0;
+	
+	wire										syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;
+
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_ab_next = mult_cnt_ab + 1'b1;
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_q_next = mult_cnt_q + 1'b1;
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_qn_next = mult_cnt_qn + 1'b1;
+	wire	[ OPERAND_ADDR_WIDTH  :0]	mult_cnt_s_next = mult_cnt_s + 1'b1;
+	
+	wire	[SYSTOLIC_CNTR_WIDTH-1:0]	syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
+
+	
+	always @(posedge clk)
+		// syst_cnt selects which buffer row is fed to the PEs: cleared when a pass starts,
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// then incremented each CRUNCH cycle and held once it equals syst_cnt_last.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR,
+				FSM_STATE_PIPE_RELOAD:		syst_cnt <= syst_cnt_zero;
+				FSM_STATE_PIPE_CRUNCH:		syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
+			endcase
+
+	always @(posedge clk)
+		// mult_cnt_ab counts passes of the mul_ab stage: cleared after INIT_LAST_ADDR,
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// incremented on every RELOAD -> CRUNCH transition, holding at mult_cnt_last.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_ab <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
+			endcase
+
+	always @(posedge clk)
+		// mult_cnt_q counts passes of the mul_q stage; it only advances once
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// mult_cnt_ab is non-zero, so it runs one pass behind mult_cnt_ab.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_q <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
+			endcase
+
+	always @(posedge clk)
+		// mult_cnt_qn counts passes of the mul_qn stage; it only advances once
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// mult_cnt_q is non-zero, so it runs one pass behind mult_cnt_q.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_qn <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
+			endcase
+		
+	always @(posedge clk)
+		// mult_cnt_s trails mult_cnt_qn: cleared after INIT_LAST_ADDR, it only advances once mult_cnt_qn is non-zero.
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// When saturated the counter holds its own value, matching the mult_cnt_ab / mult_cnt_q / mult_cnt_qn counters.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR:	mult_cnt_s <= mult_cnt_zero;
+				FSM_STATE_PIPE_RELOAD:		if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_s : mult_cnt_s_next;
+			endcase
+		
+		
+	always @(posedge clk) begin
+		syst_cnt_dly[0] <= syst_cnt;
+		for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
+			syst_cnt_dly[i] <= syst_cnt_dly[i-1];
+	end
+	
+		//
+		// Systolic Array
+		//
+	wire	[31: 0]	mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+	wire	[31: 0]	mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	wire	[31: 0]	mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+	wire	[31: 0]	mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	wire	[31: 0]	mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+	wire	[31: 0]	mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+	
+	wire	[31: 0]	mul_ab_a	= (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
+	reg	[31: 0]	mul_q_a_int;
+	reg	[31: 0]	mul_q_a;
+	reg	[31: 0]	mul_qn_a_int;
+	reg	[31: 0]	mul_qn_a;
+	
+	reg	[31: 0]	t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	reg	[31: 0]	t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	reg	[31: 0]	t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+	reg	[31: 0]	c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+	genvar syst;
+	generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
+		begin : gen_mul
+	
+			/*modexpa7_*/pe_mul mul_ab_inst
+			(
+				.clk		(clk),
+				.a			(mul_ab_a),
+				.b			(b_buf[syst_cnt][syst]),
+				.t			(t_ab[syst_cnt][syst]),
+				.c_in		(c_ab_in[syst_cnt][syst]),
+				
+				.p			(mul_ab_p[syst]),
+				.c_out	(mul_ab_c_out[syst])
+			);
+			
+			/*modexpa7_*/pe_mul mul_q_inst
+			(
+				.clk		(clk),
+				.a			(mul_q_a),
+				.b			(n_coeff_buf[syst_cnt][syst]),
+				.t			(t_q[syst_cnt][syst]),
+				.c_in		(c_q_in[syst_cnt][syst]),
+				
+				.p			(mul_q_p[syst]),
+				.c_out	(mul_q_c_out[syst])
+			);
+			
+
+			/*modexpa7_*/pe_mul mul_qn_inst
+			(
+				.clk		(clk),
+				.a			(mul_qn_a),
+				.b			(n_buf[syst_cnt][syst]),
+				.t			(t_qn[syst_cnt][syst]),
+				.c_in		(c_qn_in[syst_cnt][syst]),
+				
+				.p			(mul_qn_p[syst]),
+				.c_out	(mul_qn_c_out[syst])
+			);
+			
+		end
+	endgenerate
+	
+		//
+		// c_ab
+		//
+	always @(posedge clk)
+		// Carry storage for the mul_ab PEs, one word per (row, PE).
+		case (fsm_state)
+			// Clear every carry word before the first pass.
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_ab_in[i][j] <= 32'd0;
+			// Once the latency flag is set, store each PE's carry-out into the row addressed by the delayed counter.
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
+		endcase
+	
+		//
+		// c_q
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_q_in[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
+		endcase
+
+		//
+		// c_qn
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_qn_in[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
+		endcase
+		
+		//
+		// t_ab
+		//
+	always @(posedge clk)
+		// Partial-product storage for the mul_ab PEs; results are written back shifted down by one word position.
+		case (fsm_state)
+			// Clear every word before the first pass.
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_ab[i][j] <= 32'd0;
+			// PE j's product goes to slot j-1 of the same row; PE 0's product goes to the last slot of the previous row (not stored here for row 0).
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done) begin
+					if (syst_cnt_latency > syst_cnt_zero)
+						t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
+				end
+				
+		endcase
+
+
+		//
+		// t_q
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+		
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_q[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
+					if (syst_cnt_latency > syst_cnt_zero)
+						t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
+				end
+				
+		endcase
+
+
+		//
+		// t_qn
+		//
+	always @(posedge clk)
+		//
+		case (fsm_state)
+		
+			FSM_STATE_INIT_LAST_ADDR:
+				for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+					for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_qn[i][j] <= 32'd0;
+						
+			FSM_STATE_PIPE_CRUNCH:
+				if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
+					if (syst_cnt_latency > syst_cnt_zero)
+						t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
+					for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+						t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
+				end
+				
+		endcase
+
+		//
+		// Latency 2
+		//
+	always @(posedge clk)
+		// Second one-hot delay line: restarted with each pass, but only rotated once syst_cnt has
+		if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+			// reached syst_cnt_last; its top bit (pe_latency_ab_msb_done) is what lets the FSM leave CRUNCH.
+			case (fsm_state)
+				FSM_STATE_INIT_LAST_ADDR,
+				FSM_STATE_PIPE_RELOAD:		pe_latency_ab_msb <= pe_latency_start;
+				FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)
+					pe_latency_ab_msb <= pe_latency_ab_msb_done ?
+														pe_latency_ab_msb : pe_latency_ab_msb_next;
+			endcase
+
+
+		//
+		// Adder
+		//
+	reg				pe_add_ce;
+	reg	[31: 0]	pe_add_a0;
+	reg	[31: 0]	pe_add_a1;
+	reg	[31: 0]	pe_add_a2;
+	reg	[31: 0]	pe_add_b0;
+
+	reg				pe_add_c_in;
+	wire	[31: 0]	pe_add_s;
+	wire				pe_add_c_out;
+
+	reg				pe_sub_ce;
+	reg	[31: 0]	pe_sub_a0;
+	reg	[31: 0]	pe_sub_b0;
+
+	reg				pe_sub_b_in;
+	wire	[31: 0]	pe_sub_d;
+	wire				pe_sub_b_out;
+	
+	always @(posedge clk)
+		pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
+
+	always @(posedge clk)
+		pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);
+
+	always @(posedge clk)
+		//
+		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
+			pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
+
+	always @(posedge clk)
+		//
+		if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
+			pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
+	
+	
+	modexpa7_pe_add pe_add_inst
+	(
+		.clk		(clk),
+		.ce		(pe_add_ce),
+		.a			(pe_add_a2),
+		.b			(pe_add_b0),
+		.c_in		(pe_add_c_in),
+		.s			(pe_add_s),
+		.c_out	(pe_add_c_out)
+	);
+
+	modexpa7_pe_sub pe_sub_inst
+	(
+		.clk		(clk),
+		.ce		(pe_sub_ce),
+		.a			(pe_sub_a0),
+		.b			(pe_sub_b0),
+		.b_in		(pe_sub_b_in),
+		.d			(pe_sub_d),
+		.b_out	(pe_sub_b_out)
+	);
+	
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
+			pe_add_a0 <= mul_ab_p[0];
+			pe_add_a1 <= pe_add_a0;
+			pe_add_a2 <= pe_add_a1;
+		end
+
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			pe_sub_a0 <= pe_add_s;
+
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			pe_add_b0 <= mul_qn_p[0];
+	
+	always @(posedge clk)
+		//
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
+	
+	
+	always @(posedge clk)
+		//
+		case (fsm_next_state)
+			FSM_STATE_INIT_ZERO_ADDR:	n_bram_addr_reg <= bram_addr_zero;
+			FSM_STATE_INIT_NEXT_ADDR:	n_bram_addr_reg <= n_bram_addr_next;
+			FSM_STATE_PIPE_RELOAD: begin
+				if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
+				if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
+			end
+		endcase
+		
+		
+		//
+		// Ready Flag Logic
+		//
+	reg rdy_reg = 1'b1;
+	assign rdy = rdy_reg;
+
+   always @(posedge clk or negedge rst_n)
+		// rdy is set by the asynchronous reset, dropped in IDLE when ena_trig is high, and set again in STOP.
+		if (rst_n == 1'b0)	rdy_reg	<= 1'b1;
+		else begin
+			if (fsm_state == FSM_STATE_IDLE)		rdy_reg <= ~ena_trig;
+			if (fsm_state == FSM_STATE_STOP)	rdy_reg <= 1'b1;
+		end
+	
+
+		//
+		//
+		//
+	always @(posedge clk)
+		// Capture PE 0's mul_ab product for row 0 as the pending 'a' operand of the mul_q stage.
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			mul_q_a_int <= mul_ab_p[0];
+
+	always @(posedge clk)
+		// Capture PE 0's mul_q product for row 0 as the pending 'a' operand of the mul_qn stage.
+		if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			mul_qn_a_int <= mul_q_p[0];
+
+	always @(posedge clk)
+		// The pending operand is handed to the PEs only in RELOAD, so it is stable for the whole following pass.
+		if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			mul_q_a <= mul_q_a_int;	// TODO: Add masking! Maybe not needed after all?..
+
+	always @(posedge clk)
+		// Same hand-over for mul_qn, but the operand is forced to zero once mult_cnt_qn has reached mult_cnt_half.
+		if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
+	
+		//
+		// Debug
+		//
+	//always @(posedge clk) begin
+		//
+		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			//$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
+		//
+		//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+			//$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
+		//
+		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			//$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
+		//
+		//if (fsm_state == FSM_STATE_PIPE_RELOAD)
+			//$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
+		//
+	//end
+		
+		
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_rd;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
+	reg										s_bram_en;
+	
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_rd;
+	reg	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr;
+	wire	[OPERAND_ADDR_WIDTH-1:0]	sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
+	reg										sn_bram_en;
+	
+	assign s_bram_addr_rd = s_bram_addr;
+	assign sn_bram_addr_rd = s_bram_addr;
+	
+	wire	[31: 0]	s_bram_din;
+	wire	[31: 0]	s_bram_dout;
+	
+	wire	[31: 0]	sn_bram_din;
+	wire	[31: 0]	sn_bram_dout;
+	
+	assign s_bram_din = pe_add_s;
+	assign sn_bram_din = pe_sub_d;
+	
+	always @(posedge clk)
+		// Write enable for the S buffer: adder active and mult_cnt_qn already past mult_cnt_half.
+		s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
+
+	always @(posedge clk)
+		// Write enable for the SN buffer: subtractor active and mult_cnt_s already past mult_cnt_half.
+		sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
+	
+	always @(posedge clk) begin
+		// S write address: cleared when mult_cnt_qn equals mult_cnt_half, then bumped after each write up to bram_addr_last. If both conditions hold, the later assignment wins.
+		if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
+		if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
+	end
+
+	always @(posedge clk) begin
+		// SN write address: same scheme, keyed on mult_cnt_s and sn_bram_en.
+		if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
+		if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
+	end
+	
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_s (.clk(clk),
+		.a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
+		.b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
+
+	bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+	bram_sn (.clk(clk),
+		.a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
+		.b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
+		
+		
+	reg	r_bram_en;
+	
+	always @(posedge clk)
+		//
+		case (fsm_state)
+			FSM_STATE_SAVE_ZERO_ADDR,
+			FSM_STATE_SAVE_NEXT_ADDR:	r_bram_en <= 1'b1;
+			default:							r_bram_en <= 1'b0;
+			
+		endcase
+		
+		
+		
+	reg	r_bram_wr_reg;
+	
+	assign r_bram_wr = r_bram_wr_reg;
+	
+	always @(posedge clk)
+		//
+		r_bram_wr_reg <= r_bram_en;
+		
+		
+	wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;
+		
+		
+	reg	[31: 0]	r_bram_in_reg;
+	
+	assign r_bram_in = r_bram_in_reg;
+
+		always @(posedge clk)
+			//
+			if (r_bram_en)
+				r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
+	
+	always @(posedge clk)
+		//
+		if (r_bram_en)
+			r_bram_addr_reg <= s_bram_addr_dly;
+	
+	
+		//
+		// FSM Transition Logic
+		//
+	always @(posedge clk or negedge rst_n)
+		// State register: asynchronous active-low reset to IDLE, otherwise takes the combinational next state.
+		if (rst_n == 1'b0)	fsm_state <= FSM_STATE_IDLE;
+		else						fsm_state <= fsm_next_state;
+	
+	always @* begin
+		// Default assignment: any state value not listed in the case below moves to STOP.
+		fsm_next_state = FSM_STATE_STOP;
+		// Sequence: IDLE -> INIT_* (operand pre-load) -> CRUNCH/RELOAD loop -> SAVE_* (result write-out) -> STOP -> IDLE.
+		case (fsm_state)
+		
+			FSM_STATE_IDLE:				if (ena_trig)				fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
+												else							fsm_next_state = FSM_STATE_IDLE;
+												
+			FSM_STATE_INIT_ZERO_ADDR:									fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+			
+			FSM_STATE_INIT_NEXT_ADDR:	if (b_bram_addr_done)	fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
+												else							fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+												
+			FSM_STATE_INIT_LAST_ADDR:									fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+			
+			FSM_STATE_PIPE_CRUNCH:		if (syst_cnt_done)		fsm_next_state = pe_latency_ab_msb_done ?
+																					FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
+												else							fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+
+			FSM_STATE_PIPE_RELOAD:		if (mult_cnt_s_done)		fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
+												else							fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+												
+			FSM_STATE_SAVE_ZERO_ADDR:									fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+			
+			FSM_STATE_SAVE_NEXT_ADDR:	if (s_bram_addr_done)	fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
+												else							fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+			
+			FSM_STATE_SAVE_LAST_ADDR:									fsm_next_state = FSM_STATE_STOP;
+			
+			FSM_STATE_STOP:												fsm_next_state = FSM_STATE_IDLE;
+			
+		endcase
+	end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================
diff --git a/src/rtl/util/bram_1rw_1ro_readfirst.v b/src/rtl/util/bram_1rw_1ro_readfirst.v
new file mode 100644
index 0000000..56cb24e
--- /dev/null
+++ b/src/rtl/util/bram_1rw_1ro_readfirst.v
@@ -0,0 +1,88 @@
+//======================================================================
+//
+// Copyright (c) 2015, 2017 NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_1ro_readfirst
+  #(parameter MEM_WIDTH            = 32,    // width of one memory word, in bits
+    parameter MEM_ADDR_BITS        = 8)     // address width; the array holds 2**MEM_ADDR_BITS words
+   (
+    input wire                     clk,
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr,  // port A address (shared by read and write)
+    input wire                     a_wr,    // port A write enable
+    input wire [MEM_WIDTH-1:0]     a_in,    // port A write data
+    output wire [MEM_WIDTH-1:0]    a_out,   // port A registered read data
+
+    input wire [MEM_ADDR_BITS-1:0] b_addr,  // port B address (read only)
+    output wire [MEM_WIDTH-1:0]    b_out    // port B registered read data
+    );
+
+
+   // Two-port synchronous memory: port A reads and writes, port B only reads.
+   // BRAM
+   // RAM_STYLE is a synthesis attribute; presumably it asks the tool to map the array to block RAM.
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0]             bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+   //
+   // Output Registers
+   // Read data of both ports is registered, so it appears one clock after the address.
+   reg [MEM_WIDTH-1:0]             bram_reg_a;
+   reg [MEM_WIDTH-1:0]             bram_reg_b;
+
+   assign a_out = bram_reg_a;
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Write Port A
+   //
+   always @(posedge clk) begin
+      // Read-first: the non-blocking read samples the word as it was before this edge's write.
+      bram_reg_a <= bram[a_addr];
+      // The write to the same address is gated by a_wr.
+      if (a_wr) bram[a_addr] <= a_in;
+      //
+   end
+
+
+   //
+   // Read-Only Port B
+   //
+   always @(posedge clk)
+     // Port B has no write path; it only performs a registered read.
+     bram_reg_b <= bram[b_addr];
+
+
+endmodule
diff --git a/src/rtl/util/bram_1rw_readfirst.v b/src/rtl/util/bram_1rw_readfirst.v
new file mode 100644
index 0000000..30ecae8
--- /dev/null
+++ b/src/rtl/util/bram_1rw_readfirst.v
@@ -0,0 +1,75 @@
+//======================================================================
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_readfirst
+  #(parameter MEM_WIDTH            = 32,    // width of one memory word, in bits
+    parameter MEM_ADDR_BITS        = 8)     // address width; the array holds 2**MEM_ADDR_BITS words
+   (
+    input wire                     clk,
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr,  // address (shared by read and write)
+    input wire                     a_wr,    // write enable
+    input wire [MEM_WIDTH-1:0]     a_in,    // write data
+    output wire [MEM_WIDTH-1:0]    a_out    // registered read data
+    );
+
+
+   // Single-port synchronous memory with one read/write port.
+   // BRAM
+   // RAM_STYLE is a synthesis attribute; presumably it asks the tool to map the array to block RAM.
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0]             bram[0:(2**MEM_ADDR_BITS)-1];
+	
+	
+   //
+   // Output Register
+   // Read data is registered, so it appears one clock after the address.
+   reg [MEM_WIDTH-1:0]             bram_reg_a;
+
+   assign a_out = bram_reg_a;
+
+
+   //
+   // Read-Write Port A
+   //
+   always @(posedge clk) begin
+      // Read-first: the non-blocking read samples the word as it was before this edge's write.
+      bram_reg_a <= bram[a_addr];
+      // The write to the same address is gated by a_wr.
+      if (a_wr) bram[a_addr] <= a_in;
+      //
+   end
+
+
+endmodule
-- 
cgit v1.2.3