From 0b873507ad47e3046935dfc8b3f91d36bc21c7b0 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Tue, 27 Jun 2017 13:44:08 +0300 Subject: Added systolic modular multiplier w/ testbench. * works in simulator * may have to change how internal operand buffer is pre-loaded (shift register instead of wide mux?) * code needs some cleanup --- src/rtl/modexpa7_systolic_multiplier.v | 876 +++++++++++++++++++++++++++++++++ src/rtl/util/bram_1rw_1ro_readfirst.v | 88 ++++ src/rtl/util/bram_1rw_readfirst.v | 75 +++ src/tb/tb_systolic_multiplier.v | 545 ++++++++++++++++++++ 4 files changed, 1584 insertions(+) create mode 100644 src/rtl/modexpa7_systolic_multiplier.v create mode 100644 src/rtl/util/bram_1rw_1ro_readfirst.v create mode 100644 src/rtl/util/bram_1rw_readfirst.v create mode 100644 src/tb/tb_systolic_multiplier.v diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v new file mode 100644 index 0000000..0849b61 --- /dev/null +++ b/src/rtl/modexpa7_systolic_multiplier.v @@ -0,0 +1,876 @@ +//====================================================================== +// +// modexpa7_systolic_multiplier.v +// ----------------------------------------------------------------------------- +// Systolic Montgomery multiplier. +// +// Authors: Pavel Shatov +// +// Copyright (c) 2017, NORDUnet A/S All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// - Neither the name of the NORDUnet nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +//====================================================================== + +module modexpa7_systolic_multiplier # + ( + // + // This sets the address widths of memory buffers. Internal data + // width is 32 bits, so for e.g. 1024-bit operands buffers must store + // 1024 / 32 = 32 words, and these need 5-bit address bus, because + // 2 ** 5 = 32. + // + parameter OPERAND_ADDR_WIDTH = 5, + + // + // This sets the width of the systolic cycle counter. TODO: Explain. 
+ // + parameter SYSTOLIC_ARRAY_POWER = 3 + ) + ( + input clk, + input rst_n, + + input ena, + output rdy, + + output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, + + input [ 32-1:0] a_bram_out, + input [ 32-1:0] b_bram_out, + input [ 32-1:0] n_bram_out, + input [ 32-1:0] n_coeff_bram_out, + + output [ 32-1:0] r_bram_in, + output r_bram_wr, + + input [OPERAND_ADDR_WIDTH-1:0] n_num_words + ); + + + // + // Constants + // + localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER; + localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER; + localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH; + + localparam SYSTOLIC_PE_LATENCY = 4; + + + // + // FSM Declaration + // + localparam [ 3: 0] FSM_STATE_IDLE = 4'd0; + localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1; + localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2; + localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3; + localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4; + localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5; + localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6; + localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7; + localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8; + localparam [ 3: 0] FSM_STATE_STOP = 4'd9; + + reg [ 3: 0] fsm_state = FSM_STATE_IDLE; + reg [ 3: 0] fsm_next_state; + + + // + // Enable Delay (Trigger) + // + reg ena_dly = 1'b0; + wire ena_trig = ena && !ena_dly; + always @(posedge clk) ena_dly <= ena; + + + // + // Parameters Latch + // + reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch; + + always @(posedge clk) + // + if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) + n_num_words_latch <= n_num_words; + + + // + // Addresses + // + localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}}; + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch; 
+ + + // + // BRAM Addresses + // + reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg; + reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg; + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg; + reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg; + reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg; + reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg; + + wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr = s_bram_addr_reg; + + reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly; + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly; + reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly; + reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly; + + wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next = b_bram_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next = a_bram_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next = n_bram_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next = s_bram_addr + 1'b1; + + wire b_bram_addr_done = + (b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0; + + wire s_bram_addr_done = + (s_bram_addr == bram_addr_last) ? 
1'b1 : 1'b0; + + assign b_bram_addr = b_bram_addr_reg; + assign a_bram_addr = a_bram_addr_reg; + assign n_coeff_bram_addr = n_coeff_bram_addr_reg; + assign n_bram_addr = n_bram_addr_reg; + assign r_bram_addr = r_bram_addr_reg; + + always @(posedge clk) b_bram_addr_dly <= b_bram_addr; + always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr; + always @(posedge clk) n_bram_addr_dly <= n_bram_addr; + always @(posedge clk) s_bram_addr_dly <= s_bram_addr; + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_INIT_ZERO_ADDR: b_bram_addr_reg <= bram_addr_zero; + FSM_STATE_INIT_NEXT_ADDR: b_bram_addr_reg <= b_bram_addr_next; + endcase + + always @(posedge clk) + case (fsm_next_state) + FSM_STATE_SAVE_ZERO_ADDR: s_bram_addr_reg <= bram_addr_zero; + FSM_STATE_SAVE_NEXT_ADDR: s_bram_addr_reg <= s_bram_addr_next; + endcase + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_INIT_LAST_ADDR: a_bram_addr_reg <= bram_addr_zero; + FSM_STATE_PIPE_RELOAD: a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr; + endcase + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_INIT_ZERO_ADDR: n_coeff_bram_addr_reg <= bram_addr_zero; + FSM_STATE_INIT_NEXT_ADDR: n_coeff_bram_addr_reg <= n_coeff_bram_addr_next; + endcase + + + + + // + // Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles... 
+ // + wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1}; + + reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb; + reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb; + + wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next = + {pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]}; + + wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next = + {pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]}; + + wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]; + wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]; + + always @(posedge clk) + // + if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) + // + case (fsm_state) + FSM_STATE_INIT_LAST_ADDR, + FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start; + FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ? + pe_latency_ab_lsb : pe_latency_ab_lsb_next; + endcase + + // + // Buffers + // + integer i, j; + + reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_INIT_ZERO_ADDR: + for (i=0; i mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next; + endcase + + always @(posedge clk) + // + if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) + // + case (fsm_state) + FSM_STATE_INIT_LAST_ADDR: mult_cnt_qn <= mult_cnt_zero; + FSM_STATE_PIPE_RELOAD: if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next; + endcase + + always @(posedge clk) + // + if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) + // + case (fsm_state) + FSM_STATE_INIT_LAST_ADDR: mult_cnt_s <= mult_cnt_zero; + FSM_STATE_PIPE_RELOAD: if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? 
mult_cnt_qn : mult_cnt_s_next; + endcase + + + always @(posedge clk) begin + syst_cnt_dly[0] <= syst_cnt; + for (i=1; i mult_cnt_zero)) + for (j=0; j mult_cnt_zero)) + for (j=0; j syst_cnt_zero) + t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0]; + for (j=1; j mult_cnt_zero)) begin + if (syst_cnt_latency > syst_cnt_zero) + t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0]; + for (j=1; j mult_cnt_zero)) begin + if (syst_cnt_latency > syst_cnt_zero) + t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0]; + for (j=1; j mult_cnt_zero) && !mult_cnt_s_done; + + always @(posedge clk) + pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero); + + always @(posedge clk) + // + if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done) + pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out; + + always @(posedge clk) + // + if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero)) + pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 
1'b0 : pe_sub_b_out; + + + modexpa7_pe_add pe_add_inst + ( + .clk (clk), + .ce (pe_add_ce), + .a (pe_add_a2), + .b (pe_add_b0), + .c_in (pe_add_c_in), + .s (pe_add_s), + .c_out (pe_add_c_out) + ); + + modexpa7_pe_sub pe_sub_inst + ( + .clk (clk), + .ce (pe_sub_ce), + .a (pe_sub_a0), + .b (pe_sub_b0), + .b_in (pe_sub_b_in), + .d (pe_sub_d), + .b_out (pe_sub_b_out) + ); + + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin + pe_add_a0 <= mul_ab_p[0]; + pe_add_a1 <= pe_add_a0; + pe_add_a2 <= pe_add_a1; + end + + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + pe_sub_a0 <= pe_add_s; + + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + pe_add_b0 <= mul_qn_p[0]; + + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 
32'd0 : n_bram_out; + + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_INIT_ZERO_ADDR: n_bram_addr_reg <= bram_addr_zero; + FSM_STATE_INIT_NEXT_ADDR: n_bram_addr_reg <= n_bram_addr_next; + FSM_STATE_PIPE_RELOAD: begin + if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero; + if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next; + end + endcase + + + // + // Ready Flag Logic + // + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) rdy_reg <= 1'b1; + else begin + if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; + if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; + end + + + // + // + // + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + mul_q_a_int <= mul_ab_p[0]; + + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + mul_qn_a_int <= mul_q_p[0]; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_PIPE_RELOAD) + mul_q_a <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?.. + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_PIPE_RELOAD) + mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? 
mul_qn_a_int : 32'd0; + + // + // Debug + // + //always @(posedge clk) begin + // + //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + //$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]); + // + //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) + //$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]); + // + //if (fsm_state == FSM_STATE_PIPE_RELOAD) + //$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s); + // + //if (fsm_state == FSM_STATE_PIPE_RELOAD) + //$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d); + // + //end + + + wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd; + reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr; + wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1; + reg s_bram_en; + + wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd; + reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr; + wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1; + reg sn_bram_en; + + assign s_bram_addr_rd = s_bram_addr; + assign sn_bram_addr_rd = s_bram_addr; + + wire [31: 0] s_bram_din; + wire [31: 0] s_bram_dout; + + wire [31: 0] sn_bram_din; + wire [31: 0] sn_bram_dout; + + assign s_bram_din = pe_add_s; + assign sn_bram_din = pe_sub_d; + + always @(posedge clk) + // + s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half); + + always @(posedge clk) + // + sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half); + + always @(posedge clk) begin + // + if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero; + if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next; + end + + always @(posedge clk) begin + // + if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero; + if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next; + end + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) 
+ bram_s (.clk(clk), + .a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(), + .b_addr(s_bram_addr_rd), .b_out(s_bram_dout)); + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_sn (.clk(clk), + .a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(), + .b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout)); + + + reg r_bram_en; + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_SAVE_ZERO_ADDR, + FSM_STATE_SAVE_NEXT_ADDR: r_bram_en <= 1'b1; + default: r_bram_en <= 1'b0; + + endcase + + + + reg r_bram_wr_reg; + + assign r_bram_wr = r_bram_wr_reg; + + always @(posedge clk) + // + r_bram_wr_reg <= r_bram_en; + + + wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out; + + + reg [31: 0] r_bram_in_reg; + + assign r_bram_in = r_bram_in_reg; + + always @(posedge clk) + // + if (r_bram_en) + r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout; + + always @(posedge clk) + // + if (r_bram_en) + r_bram_addr_reg <= s_bram_addr_dly; + + + // + // FSM Transition Logic + // + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; + else fsm_state <= fsm_next_state; + + always @* begin + // + fsm_next_state = FSM_STATE_STOP; + // + case (fsm_state) + + FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR; + else fsm_next_state = FSM_STATE_IDLE; + + FSM_STATE_INIT_ZERO_ADDR: fsm_next_state = FSM_STATE_INIT_NEXT_ADDR; + + FSM_STATE_INIT_NEXT_ADDR: if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR; + else fsm_next_state = FSM_STATE_INIT_NEXT_ADDR; + + FSM_STATE_INIT_LAST_ADDR: fsm_next_state = FSM_STATE_PIPE_CRUNCH; + + FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) fsm_next_state = pe_latency_ab_msb_done ? 
+ FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH; + else fsm_next_state = FSM_STATE_PIPE_CRUNCH; + + FSM_STATE_PIPE_RELOAD: if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR; + else fsm_next_state = FSM_STATE_PIPE_CRUNCH; + + FSM_STATE_SAVE_ZERO_ADDR: fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR; + + FSM_STATE_SAVE_NEXT_ADDR: if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR; + else fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR; + + FSM_STATE_SAVE_LAST_ADDR: fsm_next_state = FSM_STATE_STOP; + + FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; + + endcase + end + + +endmodule + +//====================================================================== +// End of file +//====================================================================== diff --git a/src/rtl/util/bram_1rw_1ro_readfirst.v b/src/rtl/util/bram_1rw_1ro_readfirst.v new file mode 100644 index 0000000..56cb24e --- /dev/null +++ b/src/rtl/util/bram_1rw_1ro_readfirst.v @@ -0,0 +1,88 @@ +//====================================================================== +// +// Copyright (c) 2015, 2017 NORDUnet A/S All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// - Neither the name of the NORDUnet nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+// Two-port synchronous RAM: port A reads and optionally writes, port B only reads; both outputs are registered.
+module bram_1rw_1ro_readfirst
+  #(parameter MEM_WIDTH = 32,      // width of one memory word, in bits
+    parameter MEM_ADDR_BITS = 8)   // address width; the array holds 2**MEM_ADDR_BITS words
+   (
+    input wire clk,                        // single clock shared by both ports
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr, // port A address, used for both the read and the write
+    input wire a_wr,                       // port A write enable, sampled on the rising edge of clk
+    input wire [MEM_WIDTH-1:0] a_in,       // port A write data
+    output wire [MEM_WIDTH-1:0] a_out,     // port A read data, registered (one clock after a_addr)
+
+    input wire [MEM_ADDR_BITS-1:0] b_addr, // port B address (read only)
+    output wire [MEM_WIDTH-1:0] b_out      // port B read data, registered (one clock after b_addr)
+    );
+
+
+   // BRAM storage array: 2**MEM_ADDR_BITS words of MEM_WIDTH bits each.
+   // No reset and no initial contents are defined in this module.
+   // NOTE(review): RAM_STYLE looks like a vendor synthesis attribute requesting block RAM -- confirm for the toolchain in use.
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+   //
+   // Output Registers: each holds the word its port read on the latest rising clock edge.
+   //
+   reg [MEM_WIDTH-1:0] bram_reg_a;
+   reg [MEM_WIDTH-1:0] bram_reg_b;
+
+   assign a_out = bram_reg_a;
+   assign b_out = bram_reg_b;
+
+
+   //
+   // Read-Write Port A. Both assignments below are non-blocking, so on a write
+   // cycle a_out receives the word stored before the write ("read-first").
+   always @(posedge clk) begin
+      // registered read of the currently stored word
+      bram_reg_a <= bram[a_addr];
+      // the write is applied after the read above has sampled the array
+      if (a_wr) bram[a_addr] <= a_in;
+      //
+   end
+
+
+   //
+   // Read-Only Port B: registered read of bram[b_addr] on every rising clock edge.
+   // NOTE(review): if port A writes the address port B reads in the same cycle, simulation yields the old word; hardware collision behaviour depends on the target RAM -- confirm.
+   always @(posedge clk)
+      //
+      bram_reg_b <= bram[b_addr];
+
+
+endmodule
diff --git a/src/rtl/util/bram_1rw_readfirst.v b/src/rtl/util/bram_1rw_readfirst.v
new file mode 100644
index 0000000..30ecae8
--- /dev/null
+++ b/src/rtl/util/bram_1rw_readfirst.v
@@ -0,0 +1,75 @@
+//====================================================================== +// +// Copyright (c) 2017, NORDUnet A/S All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// - Neither the name of the NORDUnet nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+// Single-port synchronous RAM with a registered output; a write cycle still returns the previously stored word.
+module bram_1rw_readfirst
+  #(parameter MEM_WIDTH = 32,      // width of one memory word, in bits
+    parameter MEM_ADDR_BITS = 8)   // address width; the array holds 2**MEM_ADDR_BITS words
+   (
+    input wire clk,                        // clock; all activity happens on its rising edge
+
+    input wire [MEM_ADDR_BITS-1:0] a_addr, // address, used for both the read and the write
+    input wire a_wr,                       // write enable, sampled on the rising edge of clk
+    input wire [MEM_WIDTH-1:0] a_in,       // write data
+    output wire [MEM_WIDTH-1:0] a_out      // read data, registered (one clock after a_addr)
+    );
+
+
+   // BRAM storage array: 2**MEM_ADDR_BITS words of MEM_WIDTH bits each.
+   // No reset and no initial contents are defined in this module.
+   // NOTE(review): RAM_STYLE looks like a vendor synthesis attribute requesting block RAM -- confirm for the toolchain in use.
+   (* RAM_STYLE="BLOCK" *)
+   reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+   //
+   // Output Register: holds the word read on the latest rising clock edge.
+   //
+   reg [MEM_WIDTH-1:0] bram_reg_a;
+
+   assign a_out = bram_reg_a;
+
+
+   //
+   // Read-Write Port A. Both assignments below are non-blocking, so on a write
+   // cycle a_out receives the word stored before the write ("read-first").
+   always @(posedge clk) begin
+      // registered read of the currently stored word
+      bram_reg_a <= bram[a_addr];
+      // the write is applied after the read above has sampled the array
+      if (a_wr) bram[a_addr] <= a_in;
+      //
+   end
+
+
+endmodule
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
new file mode 100644
index 0000000..3cbb8d1
--- /dev/null
+++ b/src/tb/tb_systolic_multiplier.v
@@ -0,0 +1,545 @@
+//======================================================================
+//
+// tb_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Testbench for systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +//====================================================================== + +`timescale 1ns / 1ps + +module tb_systolic_multiplier; + + + // + // Test Vectors + // + `include "../modexp_fpga_model_vectors.v"; + + + // + // Parameters + // + localparam NUM_WORDS_384 = 384 / 32; + localparam NUM_WORDS_512 = 512 / 32; + + + // + // Model Settings + // + localparam NUM_ROUNDS = 10; + + + // + // Clock (100 MHz) + // + reg clk = 1'b0; + always #5 clk = ~clk; + + + // + // Inputs + // + reg rst_n; + reg ena; + + reg [ 3: 0] n_num_words; + + + // + // Outputs + // + wire rdy; + + + // + // Integers + // + integer w; + + + // + // BRAM Interfaces + // + wire [ 3: 0] core_a_addr; + wire [ 3: 0] core_b_addr; + wire [ 3: 0] core_n_addr; + wire [ 3: 0] core_n_coeff_addr; + wire [ 3: 0] core_r_addr; + + wire [31: 0] core_a_data; + wire [31: 0] core_b_data; + wire [31: 0] core_n_data; + wire [31: 0] core_n_coeff_data; + wire [31: 0] core_r_data; + + wire core_r_wren; + + reg [ 3: 0] tb_abn_addr; + reg [ 3: 0] tb_r_addr; + + reg [31:0] tb_a_data; + reg [31:0] tb_b_data; + reg [31:0] tb_n_data; + reg [31:0] tb_n_coeff_data; + wire [31:0] tb_r_data; + + reg tb_abn_wren; + + + // + // BRAMs 
+ // + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4)) + bram_a (.clk(clk), + .a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_a_data), .a_out(), + .b_addr(core_a_addr), .b_out(core_a_data)); + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4)) + bram_b (.clk(clk), + .a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_b_data), .a_out(), + .b_addr(core_b_addr), .b_out(core_b_data)); + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4)) + bram_n (.clk(clk), + .a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_n_data), .a_out(), + .b_addr(core_n_addr), .b_out(core_n_data)); + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4)) + bram_n_coeff (.clk(clk), + .a_addr(tb_abn_addr), .a_wr(tb_abn_wren), .a_in(tb_n_coeff_data), .a_out(), + .b_addr(core_n_coeff_addr), .b_out(core_n_coeff_data)); + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(4)) + bram_r (.clk(clk), + .a_addr(core_r_addr), .a_wr(core_r_wren), .a_in(core_r_data), .a_out(), + .b_addr(tb_r_addr), .b_out(tb_r_data)); + + + // + // UUT + // + modexpa7_systolic_multiplier # + ( + .OPERAND_ADDR_WIDTH (4), // 32 * (2**4) = 512-bit operands + .SYSTOLIC_ARRAY_POWER (2) // 2 ** 2 = 4-tap array + ) + uut + ( + .clk (clk), + .rst_n (rst_n), + + .ena (ena), + .rdy (rdy), + + .a_bram_addr (core_a_addr), + .b_bram_addr (core_b_addr), + .n_bram_addr (core_n_addr), + .n_coeff_bram_addr (core_n_coeff_addr), + .r_bram_addr (core_r_addr), + + .a_bram_out (core_a_data), + .b_bram_out (core_b_data), + .n_bram_out (core_n_data), + .n_coeff_bram_out (core_n_coeff_data), + + .r_bram_in (core_r_data), + .r_bram_wr (core_r_wren), + + .n_num_words (n_num_words) + ); + + + // + // Script + // + initial begin + + rst_n = 1'b0; + ena = 1'b0; + + #200; + rst_n = 1'b1; + #100; + + test_systolic_multiplier_384(M_384, N_384, N_COEFF_384, FACTOR_384, COEFF_384); + test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512); + + end + + + // + // Test Tasks + // + task 
test_systolic_multiplier_384; + + input [383:0] m; + input [383:0] n; + input [383:0] n_coeff; + input [383:0] factor; + input [383:0] coeff; + + reg [767:0] m_factor_full; + reg [383:0] m_factor_modulo; + + reg [383:0] a; + reg [383:0] b; + reg [383:0] r; + + reg [767:0] ab_full; + reg [383:0] ab_modulo; + + integer round; + integer num_passed; + integer num_failed; + + begin + + m_factor_full = m * factor; // m * factor + m_factor_modulo = m_factor_full % n; // m * factor % n + + m_factor_full = m_factor_modulo * coeff; // m * factor * coeff + m_factor_modulo = m_factor_full % n; // m * factor * coeff % n + + a = m_factor_modulo; // start with a = m_factor... + b = m_factor_modulo; // ... and b = m_factor + + n_num_words = 4'd11; // set number of words + + num_passed = 0; // nothing tested so far + num_failed = 0; // + + for (round=0; round