aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-06-27 13:44:08 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-06-27 13:44:08 +0300
commit0b873507ad47e3046935dfc8b3f91d36bc21c7b0 (patch)
tree2d43183574d7d6695be5c0c48dcd60ceb697c31a
parent46b01cbf6e375eee7291efe7a4842a928bde4440 (diff)
Added systolic modular multiplier w/ testbench.
* works in simulator * may have to change how internal operand buffer is pre-loaded (shift register instead of wide mux?) * code needs some cleanup
-rw-r--r--src/rtl/modexpa7_systolic_multiplier.v876
-rw-r--r--src/rtl/util/bram_1rw_1ro_readfirst.v88
-rw-r--r--src/rtl/util/bram_1rw_readfirst.v75
-rw-r--r--src/tb/tb_systolic_multiplier.v545
4 files changed, 1584 insertions, 0 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
new file mode 100644
index 0000000..0849b61
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -0,0 +1,876 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier #
+ (
+ //
+ // This sets the address widths of memory buffers. Internal data
+ // width is 32 bits, so for e.g. 1024-bit operands buffers must store
+ // 1024 / 32 = 32 words, and these need 5-bit address bus, because
+ // 2 ** 5 = 32.
+ //
+ parameter OPERAND_ADDR_WIDTH = 5,
+
+ // log2 of the number of processing elements in each systolic array
+ // (SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER). The systolic cycle
+ // counter is OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER bits wide.
+ parameter SYSTOLIC_ARRAY_POWER = 3
+ )
+ (
+ input clk, // all registers update on the rising edge
+ input rst_n, // active-low asynchronous reset (applied to the FSM state and the ready flag)
+
+ input ena, // a rising edge starts an operation while the FSM is idle
+ output rdy, // cleared when an operation starts, set again in the STOP state
+
+ output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, // read address for operand a
+ output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, // read address for operand b
+ output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, // read address for n
+ output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, // read address for n_coeff
+ output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, // write address for the result
+
+ input [ 32-1:0] a_bram_out, // word read at a_bram_addr
+ input [ 32-1:0] b_bram_out, // word read at b_bram_addr
+ input [ 32-1:0] n_bram_out, // word read at n_bram_addr
+ input [ 32-1:0] n_coeff_bram_out, // word read at n_coeff_bram_addr
+
+ output [ 32-1:0] r_bram_in, // result word to be written
+ output r_bram_wr, // write strobe for the result memory
+
+ input [OPERAND_ADDR_WIDTH-1:0] n_num_words // index of the last operand word (loading stops when the address equals it)
+ );
+
+
+ //
+ // Constants
+ // (derived from the two module parameters, except for the PE latency)
+ localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER; // width of syst_cnt
+ localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER; // processing elements instantiated per array
+ localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH; // size of the first dimension of the word buffers
+
+ localparam SYSTOLIC_PE_LATENCY = 4; // depth of syst_cnt_dly and of the one-hot latency shift registers; presumably the pe_mul pipeline depth (pe_mul is not visible here)
+
+
+ //
+ // FSM Declaration
+ // (state order below follows the order in which an operation visits them)
+ localparam [ 3: 0] FSM_STATE_IDLE = 4'd0; // waiting for the start trigger
+ localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1; // operand read addresses at zero, word buffers cleared
+ localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2; // addresses advance, words are captured; repeats until the b address reaches the last word
+ localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3; // last word captured, carry and accumulator arrays cleared
+ localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4; // systolic cycles run until syst_cnt and the msb latency register are both done
+ localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5; // one clock between passes; leaves to SAVE_ZERO_ADDR once mult_cnt_s is done
+ localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6; // internal buffer read address at zero
+ localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7; // read address advances; repeats until it reaches the last word
+ localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8; // single state before STOP
+ localparam [ 3: 0] FSM_STATE_STOP = 4'd9; // ready flag is set, then back to IDLE
+
+ reg [ 3: 0] fsm_state = FSM_STATE_IDLE; // current state (registered)
+ reg [ 3: 0] fsm_next_state; // next state (combinational, assigned in the transition logic)
+
+
+ //
+ // Start Trigger
+ // ena_trig is high only while ena is high and its one-clock-delayed copy is still low
+ reg ena_dly = 1'b0;
+ always @(posedge clk) ena_dly <= ena;
+ wire ena_trig = ena & ~ena_dly;
+
+
+ //
+ // Parameters Latch
+ // Holds a copy of n_num_words taken when an operation starts.
+ reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
+
+ always @(posedge clk)
+ // capture on the clock edge that moves the FSM into INIT_ZERO_ADDR
+ if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR)
+ n_num_words_latch <= n_num_words;
+
+
+ //
+ // Addresses
+ // First and last word addresses used by the address counters.
+ localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}}; // all-zero address
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch; // last word address, taken from the latched parameter
+
+
+ //
+ // BRAM Addresses
+ //
+ // Address registers behind the operand/result address ports, plus the
+ // internal s_bram_addr used to read back the result buffers.
+ //
+ reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg;
+ reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg;
+ reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg;
+ reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg;
+
+ // ports are driven straight from the registers
+ assign a_bram_addr = a_bram_addr_reg;
+ assign b_bram_addr = b_bram_addr_reg;
+ assign n_bram_addr = n_bram_addr_reg;
+ assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
+ assign r_bram_addr = r_bram_addr_reg;
+
+ wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr = s_bram_addr_reg;
+
+ // incremented addresses (wrap at OPERAND_ADDR_WIDTH bits)
+ wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next = a_bram_addr_reg + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next = b_bram_addr_reg + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next = n_bram_addr_reg + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next = n_coeff_bram_addr_reg + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next = s_bram_addr_reg + 1'b1;
+
+ // "last word reached" flags
+ wire b_bram_addr_done = (b_bram_addr_reg == bram_addr_last);
+ wire s_bram_addr_done = (s_bram_addr_reg == bram_addr_last);
+
+ // one-clock-delayed copies of the addresses
+ reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly;
+ reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly;
+
+ always @(posedge clk) begin
+ b_bram_addr_dly <= b_bram_addr_reg;
+ n_coeff_bram_addr_dly <= n_coeff_bram_addr_reg;
+ n_bram_addr_dly <= n_bram_addr_reg;
+ s_bram_addr_dly <= s_bram_addr_reg;
+ end
+
+ always @(posedge clk)
+ // b read address: zeroed when loading starts, then incremented while loading continues
+ case (fsm_next_state)
+ FSM_STATE_INIT_ZERO_ADDR: b_bram_addr_reg <= bram_addr_zero;
+ FSM_STATE_INIT_NEXT_ADDR: b_bram_addr_reg <= b_bram_addr_next;
+ endcase
+
+ always @(posedge clk) // read address of the internal result buffers, stepped during the SAVE states
+ case (fsm_next_state)
+ FSM_STATE_SAVE_ZERO_ADDR: s_bram_addr_reg <= bram_addr_zero;
+ FSM_STATE_SAVE_NEXT_ADDR: s_bram_addr_reg <= s_bram_addr_next;
+ endcase
+
+ always @(posedge clk)
+ // a read address: zeroed on entry to INIT_LAST_ADDR, advanced on each entry to PIPE_RELOAD, held once it equals bram_addr_last
+ case (fsm_next_state)
+ FSM_STATE_INIT_LAST_ADDR: a_bram_addr_reg <= bram_addr_zero;
+ FSM_STATE_PIPE_RELOAD: a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
+ endcase
+
+ always @(posedge clk)
+ // n_coeff read address: same sequence as the b read address
+ case (fsm_next_state)
+ FSM_STATE_INIT_ZERO_ADDR: n_coeff_bram_addr_reg <= bram_addr_zero;
+ FSM_STATE_INIT_NEXT_ADDR: n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
+ endcase
+
+
+
+
+ //
+ // Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
+ // One-hot shift registers: a single 1 is loaded at bit 0 and shifted up until it reaches bit SYSTOLIC_PE_LATENCY.
+ wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1}; // only bit 0 set
+
+ reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb;
+ reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb;
+
+ wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next = // rotate left by one position
+ {pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
+
+ wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next = // rotate left by one position
+ {pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};
+
+ wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]; // the 1 has reached the top bit
+ wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]; // the 1 has reached the top bit
+
+ always @(posedge clk)
+ // lsb register is only updated on clock edges after which the FSM is in PIPE_CRUNCH
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ // restart when entering PIPE_CRUNCH, shift every clock while staying there, hold once done
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR,
+ FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start;
+ FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
+ pe_latency_ab_lsb : pe_latency_ab_lsb_next;
+ endcase
+
+ //
+ // Buffers
+ // Local copies of b, n_coeff and n, indexed as [systolic cycle][processing element].
+ integer i, j; // loop indices shared by the clocked for-loops of this module
+
+ reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ always @(posedge clk)
+ // b_buf: cleared in INIT_ZERO_ADDR, then filled word by word; upper delayed-address bits pick the cycle, lower bits the PE
+ case (fsm_state)
+ FSM_STATE_INIT_ZERO_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ b_buf[i][j] <= 32'd0;
+
+ FSM_STATE_INIT_NEXT_ADDR,
+ FSM_STATE_INIT_LAST_ADDR:
+ b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
+ endcase
+
+ always @(posedge clk)
+ // n_coeff_buf: same scheme as b_buf, using the delayed n_coeff address
+ case (fsm_state)
+ FSM_STATE_INIT_ZERO_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ n_coeff_buf[i][j] <= 32'd0;
+
+ FSM_STATE_INIT_NEXT_ADDR,
+ FSM_STATE_INIT_LAST_ADDR:
+ n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
+ endcase
+
+ always @(posedge clk)
+ // n_buf: same scheme as b_buf, using the delayed n address
+ case (fsm_state)
+ FSM_STATE_INIT_ZERO_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ n_buf[i][j] <= 32'd0;
+
+ FSM_STATE_INIT_NEXT_ADDR,
+ FSM_STATE_INIT_LAST_ADDR:
+ n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
+ endcase
+
+
+
+
+
+
+ //
+ // Cycle Counters
+ // mult_cnt_* count outer passes (one per PIPE_RELOAD); syst_cnt counts systolic cycles within a pass.
+ reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab;
+ reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_q;
+ reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn;
+ reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_s;
+
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0]; // delay line for syst_cnt
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1]; // syst_cnt delayed by SYSTOLIC_PE_LATENCY clocks
+
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_half = {1'b0, n_num_words_latch}; // latched copy, so the bound cannot move while an operation is running
+
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_last = {n_num_words_latch, 1'b1}; // 2 * (last word index) + 1, also from the latched copy
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+ wire mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
+ wire mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0;
+ wire mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
+ wire mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0;
+
+ wire syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;
+
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab_next = mult_cnt_ab + 1'b1;
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_q_next = mult_cnt_q + 1'b1;
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn_next = mult_cnt_qn + 1'b1;
+ wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_s_next = mult_cnt_s + 1'b1;
+
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
+
+
+ always @(posedge clk)
+ // syst_cnt: zeroed when PIPE_CRUNCH is entered, incremented while staying there, held at syst_cnt_last
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR,
+ FSM_STATE_PIPE_RELOAD: syst_cnt <= syst_cnt_zero;
+ FSM_STATE_PIPE_CRUNCH: syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
+ endcase
+
+ always @(posedge clk)
+ // mult_cnt_ab: zeroed before the first pass, incremented on every RELOAD -> CRUNCH transition, held at mult_cnt_last
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR: mult_cnt_ab <= mult_cnt_zero;
+ FSM_STATE_PIPE_RELOAD: mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
+ endcase
+
+ always @(posedge clk)
+ // mult_cnt_q: like mult_cnt_ab, but only starts counting once mult_cnt_ab is non-zero
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR: mult_cnt_q <= mult_cnt_zero;
+ FSM_STATE_PIPE_RELOAD: if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
+ endcase
+
+ always @(posedge clk)
+ // mult_cnt_qn: only starts counting once mult_cnt_q is non-zero
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR: mult_cnt_qn <= mult_cnt_zero;
+ FSM_STATE_PIPE_RELOAD: if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
+ endcase
+
+ always @(posedge clk)
+ // mult_cnt_s: zeroed before the first pass; starts counting once mult_cnt_qn is non-zero
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ // when done, the counter holds its own value (same pattern as the other mult_cnt_* counters)
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR: mult_cnt_s <= mult_cnt_zero;
+ FSM_STATE_PIPE_RELOAD: if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_s : mult_cnt_s_next;
+ endcase
+
+
+ always @(posedge clk) begin // free-running delay line: stage 0 samples syst_cnt, each later stage samples the previous one
+ syst_cnt_dly[0] <= syst_cnt;
+ for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
+ syst_cnt_dly[i] <= syst_cnt_dly[i-1];
+ end
+
+ //
+ // Systolic Array
+ // Three arrays of processing elements (ab, q, qn); signals below are indexed per PE or per [cycle][PE].
+ wire [31: 0] mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0]; // p outputs of the ab array
+ wire [31: 0] mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0]; // c_out outputs of the ab array
+
+ wire [31: 0] mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0]; // p outputs of the q array
+ wire [31: 0] mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0]; // c_out outputs of the q array
+
+ wire [31: 0] mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0]; // p outputs of the qn array
+ wire [31: 0] mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0]; // c_out outputs of the qn array
+
+ wire [31: 0] mul_ab_a = (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0; // a input of the ab array: a word, or zero once mult_cnt_ab exceeds mult_cnt_half
+ reg [31: 0] mul_q_a_int; // captured mul_ab_p[0], staged for mul_q_a
+ reg [31: 0] mul_q_a; // a input of the q array
+ reg [31: 0] mul_qn_a_int; // captured mul_q_p[0], staged for mul_qn_a
+ reg [31: 0] mul_qn_a; // a input of the qn array
+
+ reg [31: 0] t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; // t inputs of the ab array
+ reg [31: 0] c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; // c_in inputs of the ab array
+
+ reg [31: 0] t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; // t inputs of the q array
+ reg [31: 0] c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; // c_in inputs of the q array
+
+ reg [31: 0] t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; // t inputs of the qn array
+ reg [31: 0] c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; // c_in inputs of the qn array
+
+ genvar syst; // index of the processing element being generated
+ generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
+ begin : gen_mul // one ab, one q and one qn processing element per index; all read row syst_cnt of their buffers
+
+ /*modexpa7_*/pe_mul mul_ab_inst // ab array: shared a word, per-PE b word
+ (
+ .clk (clk),
+ .a (mul_ab_a),
+ .b (b_buf[syst_cnt][syst]),
+ .t (t_ab[syst_cnt][syst]),
+ .c_in (c_ab_in[syst_cnt][syst]),
+
+ .p (mul_ab_p[syst]),
+ .c_out (mul_ab_c_out[syst])
+ );
+
+ /*modexpa7_*/pe_mul mul_q_inst // q array: shared mul_q_a, per-PE n_coeff word
+ (
+ .clk (clk),
+ .a (mul_q_a),
+ .b (n_coeff_buf[syst_cnt][syst]),
+ .t (t_q[syst_cnt][syst]),
+ .c_in (c_q_in[syst_cnt][syst]),
+
+ .p (mul_q_p[syst]),
+ .c_out (mul_q_c_out[syst])
+ );
+
+
+ /*modexpa7_*/pe_mul mul_qn_inst // qn array: shared mul_qn_a, per-PE n word
+ (
+ .clk (clk),
+ .a (mul_qn_a),
+ .b (n_buf[syst_cnt][syst]),
+ .t (t_qn[syst_cnt][syst]),
+ .c_in (c_qn_in[syst_cnt][syst]),
+
+ .p (mul_qn_p[syst]),
+ .c_out (mul_qn_c_out[syst])
+ );
+
+ end
+ endgenerate
+
+ //
+ // c_ab
+ //
+ always @(posedge clk)
+ // cleared in INIT_LAST_ADDR; in PIPE_CRUNCH, once the latency register is done, row syst_cnt_latency takes the PE carry outputs
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_ab_in[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
+ endcase
+
+ //
+ // c_q
+ //
+ always @(posedge clk)
+ // same as c_ab, but carries are only stored once mult_cnt_ab is non-zero
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_q_in[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
+ endcase
+
+ //
+ // c_qn
+ //
+ always @(posedge clk)
+ // same as c_ab, but carries are only stored once mult_cnt_q is non-zero
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_qn_in[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
+ endcase
+
+ //
+ // t_ab
+ //
+ always @(posedge clk)
+ // cleared in INIT_LAST_ADDR; in PIPE_CRUNCH the p outputs are stored one PE position lower: p[j] -> [row][j-1], p[0] -> last PE of the previous row (dropped for row 0)
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_ab[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done) begin
+ if (syst_cnt_latency > syst_cnt_zero)
+ t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
+ end
+
+ endcase
+
+
+ //
+ // t_q
+ //
+ always @(posedge clk)
+ // same storage scheme as t_ab, enabled only once mult_cnt_ab is non-zero
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_q[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
+ if (syst_cnt_latency > syst_cnt_zero)
+ t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
+ end
+
+ endcase
+
+
+ //
+ // t_qn
+ //
+ always @(posedge clk)
+ // same storage scheme as t_ab, enabled only once mult_cnt_q is non-zero
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_qn[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
+ if (syst_cnt_latency > syst_cnt_zero)
+ t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
+ end
+
+ endcase
+
+ //
+ // Latency 2
+ // Second latency register: only shifts while syst_cnt is at its last value; its done bit lets the FSM leave PIPE_CRUNCH.
+ always @(posedge clk)
+ // updated only on clock edges after which the FSM is in PIPE_CRUNCH
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ // restart when entering PIPE_CRUNCH; shift (or hold when done) once syst_cnt_done is set
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR,
+ FSM_STATE_PIPE_RELOAD: pe_latency_ab_msb <= pe_latency_start;
+ FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done)
+ pe_latency_ab_msb <= pe_latency_ab_msb_done ?
+ pe_latency_ab_msb : pe_latency_ab_msb_next;
+ endcase
+
+
+ //
+ // Adder
+ // Control and operand registers for the pe_add / pe_sub instances.
+ reg pe_add_ce; // clock enable of the adder
+ reg [31: 0] pe_add_a0; // first stage of the a-operand delay chain
+ reg [31: 0] pe_add_a1; // second stage
+ reg [31: 0] pe_add_a2; // third stage, drives the adder a input
+ reg [31: 0] pe_add_b0; // adder b input
+
+ reg pe_add_c_in; // carry into the adder
+ wire [31: 0] pe_add_s; // adder sum output
+ wire pe_add_c_out; // adder carry output
+
+ reg pe_sub_ce; // clock enable of the subtractor
+ reg [31: 0] pe_sub_a0; // subtractor a input
+ reg [31: 0] pe_sub_b0; // subtractor b input
+
+ reg pe_sub_b_in; // borrow into the subtractor
+ wire [31: 0] pe_sub_d; // subtractor difference output
+ wire pe_sub_b_out; // subtractor borrow output
+
+ always @(posedge clk) // adder enabled for the row-0 result of a pass, once mult_cnt_q is non-zero and until mult_cnt_s is done
+ pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
+
+ always @(posedge clk) // subtractor enabled under the same timing, once mult_cnt_qn is non-zero
+ pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);
+
+ always @(posedge clk)
+ // carry chain: zero while mult_cnt_qn is still zero, otherwise the adder's previous carry out
+ if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
+ pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
+
+ always @(posedge clk)
+ // borrow chain: zero while mult_cnt_s is still zero, otherwise the subtractor's previous borrow out
+ if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
+ pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
+
+ modexpa7_pe_add pe_add_inst // adder: a = pe_add_a2, b = pe_add_b0 (module defined outside this file)
+ (
+ .clk (clk),
+ .ce (pe_add_ce),
+ .a (pe_add_a2),
+ .b (pe_add_b0),
+ .c_in (pe_add_c_in),
+ .s (pe_add_s),
+ .c_out (pe_add_c_out)
+ );
+
+ modexpa7_pe_sub pe_sub_inst // subtractor: a = pe_sub_a0, b = pe_sub_b0 (module defined outside this file)
+ (
+ .clk (clk),
+ .ce (pe_sub_ce),
+ .a (pe_sub_a0),
+ .b (pe_sub_b0),
+ .b_in (pe_sub_b_in),
+ .d (pe_sub_d),
+ .b_out (pe_sub_b_out)
+ );
+
+ always @(posedge clk)
+ // three-stage delay chain for mul_ab_p[0]; advances once per pass, when the row-0 result is available
+ if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
+ pe_add_a0 <= mul_ab_p[0];
+ pe_add_a1 <= pe_add_a0;
+ pe_add_a2 <= pe_add_a1;
+ end
+
+ always @(posedge clk)
+ // subtractor a operand is the adder's sum
+ if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ pe_sub_a0 <= pe_add_s;
+
+ always @(posedge clk)
+ // adder b operand is the row-0 product of the qn array
+ if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ pe_add_b0 <= mul_qn_p[0];
+
+ always @(posedge clk)
+ // subtractor b operand is zero until mult_cnt_s exceeds mult_cnt_half, then the n word read from BRAM
+ if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
+
+
+ always @(posedge clk)
+ // n read address: stepped like the b address while loading; rewound to zero and stepped again (once per PIPE_RELOAD) for the subtraction
+ case (fsm_next_state)
+ FSM_STATE_INIT_ZERO_ADDR: n_bram_addr_reg <= bram_addr_zero;
+ FSM_STATE_INIT_NEXT_ADDR: n_bram_addr_reg <= n_bram_addr_next;
+ FSM_STATE_PIPE_RELOAD: begin
+ if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
+ if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
+ end
+ endcase
+
+
+ //
+ // Ready Flag Logic
+ //
+ reg rdy_reg = 1'b1;
+ assign rdy = rdy_reg;
+
+ always @(posedge clk or negedge rst_n)
+ // asynchronous active-low reset returns the flag to "ready"
+ if (!rst_n)
+ rdy_reg <= 1'b1;
+ else
+ case (fsm_state)
+ FSM_STATE_IDLE: rdy_reg <= ~ena_trig; // drops when the start pulse is seen
+ FSM_STATE_STOP: rdy_reg <= 1'b1; // operation finished
+ endcase
+
+
+ //
+ // Multiplier Operand Registers
+ // The row-0 product of one array is captured during a pass and handed to the next array in PIPE_RELOAD.
+ always @(posedge clk)
+ // capture the row-0 product of the ab array
+ if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ mul_q_a_int <= mul_ab_p[0];
+
+ always @(posedge clk)
+ // capture the row-0 product of the q array
+ if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ mul_qn_a_int <= mul_q_p[0];
+
+ always @(posedge clk)
+ // a operand of the q array for the next pass
+ if (fsm_state == FSM_STATE_PIPE_RELOAD)
+ mul_q_a <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?..
+
+ always @(posedge clk)
+ // a operand of the qn array for the next pass; forced to zero once mult_cnt_qn reaches mult_cnt_half
+ if (fsm_state == FSM_STATE_PIPE_RELOAD)
+ mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
+
+ //
+ // Debug
+ //
+ //always @(posedge clk) begin
+ //
+ //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ //$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
+ //
+ //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ //$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
+ //
+ //if (fsm_state == FSM_STATE_PIPE_RELOAD)
+ //$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
+ //
+ //if (fsm_state == FSM_STATE_PIPE_RELOAD)
+ //$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
+ //
+ //end
+
+
+ wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd; // read address of the s buffer
+ reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr; // write address of the s buffer
+ wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
+ reg s_bram_en; // write enable of the s buffer
+
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd; // read address of the sn buffer
+ reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr; // write address of the sn buffer
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
+ reg sn_bram_en; // write enable of the sn buffer
+
+ assign s_bram_addr_rd = s_bram_addr; // both buffers are read at the same address
+ assign sn_bram_addr_rd = s_bram_addr;
+
+ wire [31: 0] s_bram_din;
+ wire [31: 0] s_bram_dout;
+
+ wire [31: 0] sn_bram_din;
+ wire [31: 0] sn_bram_dout;
+
+ assign s_bram_din = pe_add_s; // s buffer stores adder sums
+ assign sn_bram_din = pe_sub_d; // sn buffer stores subtractor differences
+
+ always @(posedge clk)
+ // sums are written only after mult_cnt_qn has passed mult_cnt_half
+ s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
+
+ always @(posedge clk)
+ // differences are written only after mult_cnt_s has passed mult_cnt_half
+ sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
+
+ always @(posedge clk) begin
+ // write pointer: rewound when mult_cnt_qn equals mult_cnt_half, advanced after each write, held at bram_addr_last
+ if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
+ if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
+ end
+
+ always @(posedge clk) begin
+ // write pointer: rewound when mult_cnt_s equals mult_cnt_half, advanced after each write, held at bram_addr_last
+ if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
+ if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
+ end
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_s (.clk(clk),
+ .a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
+ .b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_sn (.clk(clk),
+ .a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
+ .b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
+
+
+ reg r_bram_en; // high one clock after the FSM is in SAVE_ZERO_ADDR or SAVE_NEXT_ADDR
+
+ always @(posedge clk)
+ // registered decode of the two SAVE states that present a read address
+ case (fsm_state)
+ FSM_STATE_SAVE_ZERO_ADDR,
+ FSM_STATE_SAVE_NEXT_ADDR: r_bram_en <= 1'b1;
+ default: r_bram_en <= 1'b0;
+
+ endcase
+
+
+
+ reg r_bram_wr_reg; // result write strobe
+
+ assign r_bram_wr = r_bram_wr_reg;
+
+ always @(posedge clk)
+ // strobe is r_bram_en delayed by one clock, matching the registered data and address below
+ r_bram_wr_reg <= r_bram_en;
+
+
+ wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out; // take the sum when the subtraction borrowed and the addition did not carry; otherwise the difference
+
+
+ reg [31: 0] r_bram_in_reg; // result data word
+
+ assign r_bram_in = r_bram_in_reg;
+
+ always @(posedge clk)
+ // pick the word read from the s or the sn buffer
+ if (r_bram_en)
+ r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
+
+ always @(posedge clk)
+ // write address is the delayed buffer read address
+ if (r_bram_en)
+ r_bram_addr_reg <= s_bram_addr_dly;
+
+
+ //
+ // FSM Transition Logic
+ // State register with asynchronous active-low reset, followed by the combinational next-state function.
+ always @(posedge clk or negedge rst_n)
+ // reset forces IDLE; otherwise take the computed next state
+ if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
+ else fsm_state <= fsm_next_state;
+
+ always @* begin
+ // default: any state value not listed in the case below leads to STOP
+ fsm_next_state = FSM_STATE_STOP;
+ //
+ case (fsm_state)
+
+ FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
+ else fsm_next_state = FSM_STATE_IDLE;
+
+ FSM_STATE_INIT_ZERO_ADDR: fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+
+ FSM_STATE_INIT_NEXT_ADDR: if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
+ else fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+
+ FSM_STATE_INIT_LAST_ADDR: fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+
+ FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) fsm_next_state = pe_latency_ab_msb_done ?
+ FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
+ else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+
+ FSM_STATE_PIPE_RELOAD: if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
+ else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+
+ FSM_STATE_SAVE_ZERO_ADDR: fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+
+ FSM_STATE_SAVE_NEXT_ADDR: if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
+ else fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+
+ FSM_STATE_SAVE_LAST_ADDR: fsm_next_state = FSM_STATE_STOP;
+
+ FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
+
+ endcase
+ end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================
diff --git a/src/rtl/util/bram_1rw_1ro_readfirst.v b/src/rtl/util/bram_1rw_1ro_readfirst.v
new file mode 100644
index 0000000..56cb24e
--- /dev/null
+++ b/src/rtl/util/bram_1rw_1ro_readfirst.v
@@ -0,0 +1,88 @@
+//======================================================================
+//
+// Copyright (c) 2015, 2017 NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_1ro_readfirst
+ #(parameter MEM_WIDTH = 32, // width of one memory word in bits
+ parameter MEM_ADDR_BITS = 8) // address width; the memory holds 2 ** MEM_ADDR_BITS words
+ (
+ input wire clk, // both ports are clocked on the rising edge
+
+ input wire [MEM_ADDR_BITS-1:0] a_addr, // port A address (read and write)
+ input wire a_wr, // port A write enable
+ input wire [MEM_WIDTH-1:0] a_in, // port A write data
+ output wire [MEM_WIDTH-1:0] a_out, // port A registered read data
+
+ input wire [MEM_ADDR_BITS-1:0] b_addr, // port B address (read only)
+ output wire [MEM_WIDTH-1:0] b_out // port B registered read data
+ );
+
+
+ //
+ // BRAM
+ // Storage array; the attribute below asks the synthesis tool for block RAM.
+ (* RAM_STYLE="BLOCK" *)
+ reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+ //
+ // Output Registers
+ // Read data appears one clock after the address is presented.
+ reg [MEM_WIDTH-1:0] bram_reg_a;
+ reg [MEM_WIDTH-1:0] bram_reg_b;
+
+ assign a_out = bram_reg_a;
+ assign b_out = bram_reg_b;
+
+
+ //
+ // Read-Write Port A
+ //
+ always @(posedge clk) begin
+ // read first: the output register samples the word stored before this edge
+ bram_reg_a <= bram[a_addr];
+ // the write, if enabled, takes effect at the same edge
+ if (a_wr) bram[a_addr] <= a_in;
+ //
+ end
+
+
+ //
+ // Read-Only Port B
+ //
+ always @(posedge clk)
+ // registered read; never writes
+ bram_reg_b <= bram[b_addr];
+
+
+endmodule
diff --git a/src/rtl/util/bram_1rw_readfirst.v b/src/rtl/util/bram_1rw_readfirst.v
new file mode 100644
index 0000000..30ecae8
--- /dev/null
+++ b/src/rtl/util/bram_1rw_readfirst.v
@@ -0,0 +1,75 @@
+//======================================================================
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module bram_1rw_readfirst
+ #(parameter MEM_WIDTH = 32, // width of one memory word in bits
+ parameter MEM_ADDR_BITS = 8) // address width; the memory holds 2 ** MEM_ADDR_BITS words
+ (
+ input wire clk, // the port is clocked on the rising edge
+
+ input wire [MEM_ADDR_BITS-1:0] a_addr, // address (read and write)
+ input wire a_wr, // write enable
+ input wire [MEM_WIDTH-1:0] a_in, // write data
+ output wire [MEM_WIDTH-1:0] a_out // registered read data
+ );
+
+
+ //
+ // BRAM
+ // Storage array; the attribute below asks the synthesis tool for block RAM.
+ (* RAM_STYLE="BLOCK" *)
+ reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+ //
+ // Output Register
+ // Read data appears one clock after the address is presented.
+ reg [MEM_WIDTH-1:0] bram_reg_a;
+
+ assign a_out = bram_reg_a;
+
+
+ //
+ // Read-Write Port A
+ //
+ always @(posedge clk) begin
+ // read first: the output register samples the word stored before this edge
+ bram_reg_a <= bram[a_addr];
+ // the write, if enabled, takes effect at the same edge
+ if (a_wr) bram[a_addr] <= a_in;
+ //
+ end
+
+
+endmodule
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
new file mode 100644
index 0000000..3cbb8d1
--- /dev/null
+++ b/src/tb/tb_systolic_multiplier.v
@@ -0,0 +1,545 @@
+//======================================================================
+//
+// tb_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Testbench for systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+module tb_systolic_multiplier;
+
+
+    //
+    // Test Vectors
+    //
+    // The included file is expected to provide the M_*, N_*, N_COEFF_*,
+    // FACTOR_* and COEFF_* constants that the script below passes to the
+    // test tasks (assumption: they are not declared anywhere in this file).
+    //
+    // The include directive is handled by the preprocessor and is not a
+    // statement, so it must not be followed by a semicolon: a trailing ';'
+    // leaves an empty module item behind, which plain Verilog-2001 does not
+    // allow and strict tools may reject.
+    //
+    `include "../modexp_fpga_model_vectors.v"
+
+
+    //
+    // Parameters
+    //
+    // Number of 32-bit words in one operand; used as loop bounds by the
+    // memory reader/writer tasks.
+    //
+    localparam NUM_WORDS_384 = 384 / 32;
+    localparam NUM_WORDS_512 = 512 / 32;
+
+
+    //
+    // Model Settings
+    //
+    // Number of multiplications each test task runs and checks.
+    //
+    localparam NUM_ROUNDS = 10;
+
+
+    //
+    // Clock (100 MHz): toggles every 5 ns, i.e. 10 ns period
+    //
+    reg clk = 1'b0;
+    always #5 clk = ~clk;
+
+
+    //
+    // Stimulus driven into the UUT by the script and the test tasks
+    //
+    reg         rst_n;
+    reg         ena;
+    reg  [3:0]  n_num_words;
+
+
+    //
+    // Status flag coming back from the UUT
+    //
+    wire        rdy;
+
+
+    //
+    // Word counter shared by the memory reader/writer tasks
+    //
+    integer     w;
+
+
+    //
+    // Core side of the memories: addresses and write strobe come from the
+    // UUT, operand data is returned to it, result data is produced by it
+    //
+    wire [3:0]  core_a_addr, core_b_addr, core_n_addr, core_n_coeff_addr, core_r_addr;
+    wire [31:0] core_a_data, core_b_data, core_n_data, core_n_coeff_data, core_r_data;
+    wire        core_r_wren;
+
+
+    //
+    // Testbench side of the memories: one shared address and write strobe
+    // for the four operand memories, a separate address for the result
+    //
+    reg  [3:0]  tb_abn_addr, tb_r_addr;
+    reg  [31:0] tb_a_data, tb_b_data, tb_n_data, tb_n_coeff_data;
+    wire [31:0] tb_r_data;
+    reg         tb_abn_wren;
+
+
+    //
+    // BRAMs
+    //
+    // Four operand memories (A, B, N, N_COEFF) are written by the testbench
+    // through port A and read by the core through port B. The result memory
+    // is wired the other way round: the core writes it through port A and
+    // the testbench reads it through port B. Port A read data is unused
+    // everywhere.
+    //
+    bram_1rw_1ro_readfirst #
+    (
+        .MEM_WIDTH      (32),
+        .MEM_ADDR_BITS  (4)
+    )
+    bram_a
+    (
+        .clk    (clk),
+        .a_addr (tb_abn_addr),
+        .a_wr   (tb_abn_wren),
+        .a_in   (tb_a_data),
+        .a_out  (),
+        .b_addr (core_a_addr),
+        .b_out  (core_a_data)
+    );
+
+    bram_1rw_1ro_readfirst #
+    (
+        .MEM_WIDTH      (32),
+        .MEM_ADDR_BITS  (4)
+    )
+    bram_b
+    (
+        .clk    (clk),
+        .a_addr (tb_abn_addr),
+        .a_wr   (tb_abn_wren),
+        .a_in   (tb_b_data),
+        .a_out  (),
+        .b_addr (core_b_addr),
+        .b_out  (core_b_data)
+    );
+
+    bram_1rw_1ro_readfirst #
+    (
+        .MEM_WIDTH      (32),
+        .MEM_ADDR_BITS  (4)
+    )
+    bram_n
+    (
+        .clk    (clk),
+        .a_addr (tb_abn_addr),
+        .a_wr   (tb_abn_wren),
+        .a_in   (tb_n_data),
+        .a_out  (),
+        .b_addr (core_n_addr),
+        .b_out  (core_n_data)
+    );
+
+    bram_1rw_1ro_readfirst #
+    (
+        .MEM_WIDTH      (32),
+        .MEM_ADDR_BITS  (4)
+    )
+    bram_n_coeff
+    (
+        .clk    (clk),
+        .a_addr (tb_abn_addr),
+        .a_wr   (tb_abn_wren),
+        .a_in   (tb_n_coeff_data),
+        .a_out  (),
+        .b_addr (core_n_coeff_addr),
+        .b_out  (core_n_coeff_data)
+    );
+
+    bram_1rw_1ro_readfirst #
+    (
+        .MEM_WIDTH      (32),
+        .MEM_ADDR_BITS  (4)
+    )
+    bram_r
+    (
+        .clk    (clk),
+        .a_addr (core_r_addr),
+        .a_wr   (core_r_wren),
+        .a_in   (core_r_data),
+        .a_out  (),
+        .b_addr (tb_r_addr),
+        .b_out  (tb_r_data)
+    );
+
+
+    //
+    // UUT
+    //
+    // Multiplier under test. Connections are grouped by function: clock and
+    // reset, handshake, operand width, then one group per memory.
+    //
+    modexpa7_systolic_multiplier #
+    (
+        .OPERAND_ADDR_WIDTH     (4),    // 32 * (2**4) = 512-bit operands
+        .SYSTOLIC_ARRAY_POWER   (2)     // 2 ** 2 = 4-tap array
+    )
+    uut
+    (
+        // clock and reset
+        .clk                (clk),
+        .rst_n              (rst_n),
+
+        // handshake
+        .ena                (ena),
+        .rdy                (rdy),
+
+        // operand width setting
+        .n_num_words        (n_num_words),
+
+        // A operand memory
+        .a_bram_addr        (core_a_addr),
+        .a_bram_out         (core_a_data),
+
+        // B operand memory
+        .b_bram_addr        (core_b_addr),
+        .b_bram_out         (core_b_data),
+
+        // N memory
+        .n_bram_addr        (core_n_addr),
+        .n_bram_out         (core_n_data),
+
+        // N_COEFF memory
+        .n_coeff_bram_addr  (core_n_coeff_addr),
+        .n_coeff_bram_out   (core_n_coeff_data),
+
+        // result memory
+        .r_bram_addr        (core_r_addr),
+        .r_bram_in          (core_r_data),
+        .r_bram_wr          (core_r_wren)
+    );
+
+
+    //
+    // Script
+    //
+    // Drives rst_n low for 200 ns, raises it, waits another 100 ns, runs the
+    // 384-bit and then the 512-bit test task, and finally ends the simulation.
+    //
+    initial begin
+
+        rst_n       = 1'b0;
+        ena         = 1'b0;
+        tb_abn_wren = 1'b0;     // keep the operand memories' write strobe
+                                // defined (not X) until the first fill
+
+        #200;
+        rst_n = 1'b1;
+        #100;
+
+        test_systolic_multiplier_384(M_384, N_384, N_COEFF_384, FACTOR_384, COEFF_384);
+        test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512);
+
+        // the clock generator never stops on its own, so without an explicit
+        // $finish a "run all" simulation would never terminate
+        $finish;
+
+    end
+
+
+    //
+    // Test Task (384-bit operands)
+    //
+    // Runs NUM_ROUNDS multiplications on the UUT and compares every result
+    // with a reference value computed here using wide Verilog arithmetic.
+    //
+    //   m, factor, coeff - combined as ((m * factor % n) * coeff) % n to form
+    //                      the starting value of both operands
+    //   n                - modulus; written to the N memory
+    //   n_coeff          - written unchanged to the N_COEFF memory
+    //
+    // After each round the reference product becomes the next B operand,
+    // while A keeps its starting value.
+    //
+    task test_systolic_multiplier_384;
+
+        input [383:0] m;
+        input [383:0] n;
+        input [383:0] n_coeff;
+        input [383:0] factor;
+        input [383:0] coeff;
+
+        reg [767:0] wide;           // double-width scratch for products
+        reg [383:0] seed;           // starting operand value
+
+        reg [383:0] a;
+        reg [383:0] b;
+        reg [383:0] r;
+
+        reg [383:0] expected;       // reference result of current round
+
+        integer round;
+        integer num_passed;
+        integer num_failed;
+
+        begin
+
+            // Products are assigned to the double-width scratch register
+            // first so that they are evaluated at full width; folding the
+            // multiplication into the modulo expression would truncate it.
+            wide = m * factor;
+            seed = wide % n;
+
+            wide = seed * coeff;
+            seed = wide % n;
+
+            a = seed;
+            b = seed;
+
+            // presumably the index of the last operand word
+            // (NUM_WORDS_384 - 1); confirm against the core
+            n_num_words = 4'd11;
+
+            num_passed = 0;
+            num_failed = 0;
+
+            round = 0;
+            while (round < NUM_ROUNDS) begin
+
+                // reference value: ((a * b % n) * coeff) % n
+                wide     = a * b;
+                expected = wide % n;
+
+                wide     = expected * coeff;
+                expected = wide % n;
+
+                // load operands into the memories
+                write_memories_384(a, b, n, n_coeff);
+
+                // pulse the enable input for 10 ns (one clock period)
+                ena = 1;
+                #10;
+                ena = 0;
+
+                // poll the ready flag every 10 ns
+                while (!rdy) #10;
+
+                // fetch what the core stored in the result memory
+                read_memory_384(r);
+
+                $display("test_systolic_multiplier_384(): round #%0d of %0d", round+1, NUM_ROUNDS);
+                $display(" calculated: %x", r);
+                $display(" expected: %x", expected);
+
+                // case inequality, so X/Z bits in the result count as errors
+                if (r !== expected) begin
+                    $display(" ERROR");
+                    num_failed = num_failed + 1;
+                end else begin
+                    $display(" OK");
+                    num_passed = num_passed + 1;
+                end
+
+                // chain the rounds: the reference product is the next B
+                b = expected;
+
+                round = round + 1;
+
+            end
+
+            // summary
+            if (num_passed != NUM_ROUNDS)
+                $display("FAILURE: %0d test(s) not passed.", num_failed);
+            else
+                $display("SUCCESS: All tests passed.");
+
+        end
+
+    endtask
+
+
+    //
+    // Test Task (512-bit operands)
+    //
+    // Runs NUM_ROUNDS multiplications on the UUT and compares every result
+    // with a reference value computed here using wide Verilog arithmetic.
+    //
+    //   m, factor, coeff - combined as ((m * factor % n) * coeff) % n to form
+    //                      the starting value of both operands
+    //   n                - modulus; written to the N memory
+    //   n_coeff          - written unchanged to the N_COEFF memory
+    //
+    // After each round the reference product becomes the next B operand,
+    // while A keeps its starting value.
+    //
+    task test_systolic_multiplier_512;
+
+        input [ 511:0] m;
+        input [ 511:0] n;
+        input [ 511:0] n_coeff;
+        input [ 511:0] factor;
+        input [ 511:0] coeff;
+
+        reg [1023:0] wide;          // double-width scratch for products
+        reg [ 511:0] seed;          // starting operand value
+
+        reg [ 511:0] a;
+        reg [ 511:0] b;
+        reg [ 511:0] r;
+
+        reg [ 511:0] expected;      // reference result of current round
+
+        integer round;
+        integer num_passed;
+        integer num_failed;
+
+        begin
+
+            // Products are assigned to the double-width scratch register
+            // first so that they are evaluated at full width; folding the
+            // multiplication into the modulo expression would truncate it.
+            wide = m * factor;
+            seed = wide % n;
+
+            wide = seed * coeff;
+            seed = wide % n;
+
+            a = seed;
+            b = seed;
+
+            // presumably the index of the last operand word
+            // (NUM_WORDS_512 - 1); confirm against the core
+            n_num_words = 4'd15;
+
+            num_passed = 0;
+            num_failed = 0;
+
+            round = 0;
+            while (round < NUM_ROUNDS) begin
+
+                // reference value: ((a * b % n) * coeff) % n
+                wide     = a * b;
+                expected = wide % n;
+
+                wide     = expected * coeff;
+                expected = wide % n;
+
+                // load operands into the memories
+                write_memories_512(a, b, n, n_coeff);
+
+                // pulse the enable input for 10 ns (one clock period)
+                ena = 1;
+                #10;
+                ena = 0;
+
+                // poll the ready flag every 10 ns
+                while (!rdy) #10;
+
+                // fetch what the core stored in the result memory
+                read_memory_512(r);
+
+                $display("test_systolic_multiplier_512(): round #%0d of %0d", round+1, NUM_ROUNDS);
+                $display(" calculated: %x", r);
+                $display(" expected: %x", expected);
+
+                // case inequality, so X/Z bits in the result count as errors
+                if (r !== expected) begin
+                    $display(" ERROR");
+                    num_failed = num_failed + 1;
+                end else begin
+                    $display(" OK");
+                    num_passed = num_passed + 1;
+                end
+
+                // chain the rounds: the reference product is the next B
+                b = expected;
+
+                round = round + 1;
+
+            end
+
+            // summary
+            if (num_passed != NUM_ROUNDS)
+                $display("FAILURE: %0d test(s) not passed.", num_failed);
+            else
+                $display("SUCCESS: All tests passed.");
+
+        end
+
+    endtask
+
+
+    //
+    // BRAM Writer (384-bit operands)
+    //
+    // Stores a, b, n and n_coeff in their memories through the shared
+    // testbench write port. One 32-bit word of each value is written per
+    // 10 ns step, least significant word at address 0. Afterwards address
+    // and data are set to X and the write strobe is dropped.
+    //
+    task write_memories_384;
+
+        input [383:0] a;
+        input [383:0] b;
+        input [383:0] n;
+        input [383:0] n_coeff;
+
+        begin
+
+            // assert write strobe for the whole transfer
+            tb_abn_wren = 1;
+
+            for (w=0; w<NUM_WORDS_384; w=w+1) begin
+
+                // word index doubles as memory address
+                tb_abn_addr = w[3:0];
+
+                // pick word number w out of every operand
+                tb_a_data       = a      [32*w +: 32];
+                tb_b_data       = b      [32*w +: 32];
+                tb_n_data       = n      [32*w +: 32];
+                tb_n_coeff_data = n_coeff[32*w +: 32];
+
+                // let one clock period pass so the word gets stored
+                #10;
+
+            end
+
+            // leave bus in unknown state to make stale use visible
+            tb_abn_addr     = {4{1'bX}};
+
+            tb_a_data       = {32{1'bX}};
+            tb_b_data       = {32{1'bX}};
+            tb_n_data       = {32{1'bX}};
+            tb_n_coeff_data = {32{1'bX}};
+
+            // transfer done
+            tb_abn_wren = 0;
+
+        end
+
+    endtask
+
+
+    //
+    // BRAM Writer (512-bit operands)
+    //
+    // Stores a, b, n and n_coeff in their memories through the shared
+    // testbench write port. One 32-bit word of each value is written per
+    // 10 ns step, least significant word at address 0. Afterwards address
+    // and data are set to X and the write strobe is dropped.
+    //
+    task write_memories_512;
+
+        input [511:0] a;
+        input [511:0] b;
+        input [511:0] n;
+        input [511:0] n_coeff;
+
+        begin
+
+            // assert write strobe for the whole transfer
+            tb_abn_wren = 1;
+
+            for (w=0; w<NUM_WORDS_512; w=w+1) begin
+
+                // word index doubles as memory address
+                tb_abn_addr = w[3:0];
+
+                // pick word number w out of every operand
+                tb_a_data       = a      [32*w +: 32];
+                tb_b_data       = b      [32*w +: 32];
+                tb_n_data       = n      [32*w +: 32];
+                tb_n_coeff_data = n_coeff[32*w +: 32];
+
+                // let one clock period pass so the word gets stored
+                #10;
+
+            end
+
+            // leave bus in unknown state to make stale use visible
+            tb_abn_addr     = {4{1'bX}};
+
+            tb_a_data       = {32{1'bX}};
+            tb_b_data       = {32{1'bX}};
+            tb_n_data       = {32{1'bX}};
+            tb_n_coeff_data = {32{1'bX}};
+
+            // transfer done
+            tb_abn_wren = 0;
+
+        end
+
+    endtask
+
+
+    //
+    // BRAM Reader (384-bit result)
+    //
+    // Reads NUM_WORDS_384 words from the result memory, one per 10 ns step,
+    // and returns them in r with the word from address 0 in the least
+    // significant position. The read address is set to X afterwards.
+    //
+    task read_memory_384;
+
+        output [383:0] r;
+        reg    [383:0] collected;
+
+        begin
+
+            for (w=0; w<NUM_WORDS_384; w=w+1) begin
+
+                // present address, then give the memory one clock period
+                // to deliver the word
+                tb_r_addr = w[3:0];
+                #10;
+
+                // drop the word into its final position
+                collected[32*w +: 32] = tb_r_data;
+
+            end
+
+            // leave address in unknown state
+            tb_r_addr = {4{1'bX}};
+
+            // hand the assembled value back to the caller
+            r = collected;
+
+        end
+
+    endtask
+
+
+    //
+    // BRAM Reader (512-bit result)
+    //
+    // Reads NUM_WORDS_512 words from the result memory, one per 10 ns step,
+    // and returns them in r with the word from address 0 in the least
+    // significant position. The read address is set to X afterwards.
+    //
+    task read_memory_512;
+
+        output [511:0] r;
+        reg    [511:0] collected;
+
+        begin
+
+            for (w=0; w<NUM_WORDS_512; w=w+1) begin
+
+                // present address, then give the memory one clock period
+                // to deliver the word
+                tb_r_addr = w[3:0];
+                #10;
+
+                // drop the word into its final position
+                collected[32*w +: 32] = tb_r_data;
+
+            end
+
+            // leave address in unknown state
+            tb_r_addr = {4{1'bX}};
+
+            // hand the assembled value back to the caller
+            r = collected;
+
+        end
+
+    endtask
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================