//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Overview (derived from the visible code only):
//
//  * A rising edge on `ena` moves the FSM out of IDLE and clears `rdy`;
//    `rdy` is raised again when the FSM reaches STOP.
//  * INIT_* states step the b / n_coeff / n BRAM addresses from zero up to
//    the latched `n_num_words` value.
//  * PIPE_CRUNCH and PIPE_RELOAD alternate until `mult_cnt_s_done`.
//  * SAVE_* states step `s_bram_addr` from zero up to the latched
//    `n_num_words` value while either `s_bram_dout` or `sn_bram_dout` is
//    copied to the `r` output port.
//
// NOTE(review): this copy of the file is damaged. In several places the
// text between a '<' and a later '>' appears to have been stripped (for
// example `for (i=0; i mult_cnt_zero)`), which removed loop bodies and the
// declarations of, among others, mult_cnt_*, syst_cnt*, mul_*_p, t_*,
// pe_add_* and pe_sub_*. Those fragments are reproduced unchanged below
// and are marked individually; restore them from a pristine copy before
// attempting to simulate or synthesize.
//
module modexpa7_systolic_multiplier #
    (
        //
        // This sets the address widths of memory buffers. Internal data
        // width is 32 bits, so for e.g. 1024-bit operands buffers must store
        // 1024 / 32 = 32 words, and these need 5-bit address bus, because
        // 2 ** 5 = 32.
        //
        parameter OPERAND_ADDR_WIDTH = 5,

        //
        // This sets the systolic array length as a power of two
        // (SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER). The width of
        // the systolic cycle counter is derived from it as
        // OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER.
        // NOTE(review): presumably this must not exceed OPERAND_ADDR_WIDTH,
        // otherwise the derived width becomes negative -- confirm.
        //
        parameter SYSTOLIC_ARRAY_POWER = 3
    )
    (
        input clk,      // every register in this module is clocked on the rising edge
        input rst_n,    // active-low asynchronous reset of fsm_state and rdy_reg

        input ena,      // a 0 -> 1 transition (ena_trig) starts an operation from IDLE
        output rdy,     // high after reset, low while an operation is in progress

        output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,

        input [ 32-1:0] a_bram_out,
        input [ 32-1:0] b_bram_out,
        input [ 32-1:0] n_bram_out,
        input [ 32-1:0] n_coeff_bram_out,

        output [ 32-1:0] r_bram_in,
        output r_bram_wr,   // r_bram_en delayed by one clock

        // Latched into n_num_words_latch and compared against BRAM addresses
        // as the last address to visit (see bram_addr_last).
        input [OPERAND_ADDR_WIDTH-1:0] n_num_words
    );

    //
    // Constants
    //
    localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
    localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER;
    localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH;

    // Sizes the pe_latency_* shift registers below (SYSTOLIC_PE_LATENCY + 1 bits).
    localparam SYSTOLIC_PE_LATENCY = 4;

    //
    // FSM Declaration
    //
    localparam [ 3: 0] FSM_STATE_IDLE = 4'd0;
    localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1;
    localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2;
    localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3;
    localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4;
    localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5;
    localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6;
    localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7;
    localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8;
    localparam [ 3: 0] FSM_STATE_STOP = 4'd9;

    reg [ 3: 0] fsm_state = FSM_STATE_IDLE;
    reg [ 3: 0] fsm_next_state;

    //
    // Enable Delay (Trigger)
    //
    // ena_trig is high for the one clock in which ena is high and its
    // one-clock-delayed copy is still low, i.e. on a rising edge of ena.
    //
    reg ena_dly = 1'b0;
    wire ena_trig = ena && !ena_dly;
    always @(posedge clk) ena_dly <= ena;

    //
    // Parameters Latch
    //
    // n_num_words is sampled only while the FSM is about to enter
    // INIT_ZERO_ADDR, so later changes on the port do not affect an
    // operation that is already running.
    //
    reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;

    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR)
            n_num_words_latch <= n_num_words;

    //
    // Addresses
    //
    localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
    wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;

    //
    // BRAM Addresses
    //
    reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg;
    reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg;
    reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg;
    reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg;
    reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg;
    reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg;

    // s_bram_addr is internal (not a port); it is the read address of the
    // two internal result memories instantiated further down.
    wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr = s_bram_addr_reg;

    // One-clock-delayed copies of the addresses.
    reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly;
    reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly;
    reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly;
    reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly;

    // Incremented addresses; the sum is truncated to OPERAND_ADDR_WIDTH bits.
    wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next = b_bram_addr + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next = a_bram_addr + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next = n_bram_addr + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next = s_bram_addr + 1'b1;

    // "Done" flags: the corresponding address has reached the last address.
    wire b_bram_addr_done = (b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
    wire s_bram_addr_done = (s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;

    assign b_bram_addr = b_bram_addr_reg;
    assign a_bram_addr = a_bram_addr_reg;
    assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
    assign n_bram_addr = n_bram_addr_reg;
    assign r_bram_addr = r_bram_addr_reg;

    always @(posedge clk) b_bram_addr_dly <= b_bram_addr;
    always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr;
    always @(posedge clk) n_bram_addr_dly <= n_bram_addr;
    always @(posedge clk) s_bram_addr_dly <= s_bram_addr;

    // b address: cleared on entry to INIT_ZERO_ADDR, incremented on every
    // entry to INIT_NEXT_ADDR, held otherwise.
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_INIT_ZERO_ADDR: b_bram_addr_reg <= bram_addr_zero;
            FSM_STATE_INIT_NEXT_ADDR: b_bram_addr_reg <= b_bram_addr_next;
        endcase

    // s address: same pattern as above, driven by the SAVE_* states.
    always @(posedge clk)
        case (fsm_next_state)
            FSM_STATE_SAVE_ZERO_ADDR: s_bram_addr_reg <= bram_addr_zero;
            FSM_STATE_SAVE_NEXT_ADDR: s_bram_addr_reg <= s_bram_addr_next;
        endcase

    // a address: cleared on entry to INIT_LAST_ADDR, then incremented on
    // every entry to PIPE_RELOAD until it reaches the last address, where
    // it saturates instead of wrapping.
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_INIT_LAST_ADDR: a_bram_addr_reg <= bram_addr_zero;
            FSM_STATE_PIPE_RELOAD: a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
        endcase

    // n_coeff address: follows the same INIT_* sequence as the b address.
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_INIT_ZERO_ADDR: n_coeff_bram_addr_reg <= bram_addr_zero;
            FSM_STATE_INIT_NEXT_ADDR: n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
        endcase

    //
    // Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
    //
    // pe_latency_ab_lsb is loaded with a single set bit in position 0 and
    // rotated left by one position per clock; its `done` flag is the top
    // bit (index SYSTOLIC_PE_LATENCY). Once that bit is set the register
    // holds its value.
    //
    wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};

    reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb;
    reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb;

    wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next = {pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
    wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next = {pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};

    wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
    wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];

    // NOTE(review): no assignment to pe_latency_ab_msb is visible in this
    // copy of the file, yet the FSM leaves PIPE_CRUNCH only when
    // pe_latency_ab_msb_done is set. Confirm that it is driven (the driver
    // may be inside one of the truncated regions).

    // Reloaded when PIPE_CRUNCH is entered from INIT_LAST_ADDR or
    // PIPE_RELOAD; advanced (or held once done) while staying in PIPE_CRUNCH.
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
            //
            case (fsm_state)
                FSM_STATE_INIT_LAST_ADDR,
                FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start;
                FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ? pe_latency_ab_lsb : pe_latency_ab_lsb_next;
            endcase

    //
    // Buffers
    //
    // Three two-dimensional arrays of 32-bit words, indexed
    // [SYSTOLIC_NUM_CYCLES][SYSTOLIC_ARRAY_LENGTH].
    //
    integer i, j;

    reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
    reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
    reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];

    // NOTE(review): TRUNCATED SOURCE. Everything between `for (i=0; i` and
    // `mult_cnt_zero)` is missing from this copy. The lost text evidently
    // contained the buffer fill logic, further declarations and the start
    // of the mult_cnt_q counter process whose tail survives below. The
    // statement is not valid Verilog as it stands; restore from the
    // original file.
    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_INIT_ZERO_ADDR:
                for (i=0; i mult_cnt_zero)
                    mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
        endcase

    // mult_cnt_qn: cleared when PIPE_CRUNCH is entered from INIT_LAST_ADDR;
    // on each PIPE_RELOAD -> PIPE_CRUNCH transition it advances (or holds
    // once done), but only after mult_cnt_q has left zero.
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
            //
            case (fsm_state)
                FSM_STATE_INIT_LAST_ADDR: mult_cnt_qn <= mult_cnt_zero;
                FSM_STATE_PIPE_RELOAD: if (mult_cnt_q > mult_cnt_zero)
                    mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
            endcase

    // mult_cnt_s: same scheme, gated by mult_cnt_qn having left zero.
    //
    // NOTE(review): the "done" arm below holds mult_cnt_qn, whereas the
    // sibling counters hold their own value (mult_cnt_q / mult_cnt_qn).
    // This looks like a copy-paste slip for mult_cnt_s -- confirm. As
    // written the arm appears unreachable: this process only acts when
    // fsm_next_state is PIPE_CRUNCH, and the FSM selects PIPE_CRUNCH from
    // PIPE_RELOAD only when mult_cnt_s_done is low.
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
            //
            case (fsm_state)
                FSM_STATE_INIT_LAST_ADDR: mult_cnt_s <= mult_cnt_zero;
                FSM_STATE_PIPE_RELOAD: if (mult_cnt_qn > mult_cnt_zero)
                    mult_cnt_s <= mult_cnt_s_done ? mult_cnt_qn : mult_cnt_s_next;
            endcase

    // NOTE(review): TRUNCATED SOURCE. The process below is a run of
    // fragments; every `for (... ; i` / `for (... ; j` header has lost its
    // condition, its body and an unknown amount of following text. The
    // surviving pieces show writes of mul_ab_p[0], mul_q_p[0] and
    // mul_qn_p[0] into the last column of t_ab, t_q and t_qn at row
    // syst_cnt_latency - 1, and the tail of what is presumably the
    // pe_add_ce assignment. Tokens are kept in their original order;
    // the indentation does not imply any nesting. Restore from the
    // original file.
    always @(posedge clk) begin
        syst_cnt_dly[0] <= syst_cnt;
        for (i=1; i mult_cnt_zero))
        for (j=0; j mult_cnt_zero))
        for (j=0; j syst_cnt_zero)
        t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
        for (j=1; j mult_cnt_zero)) begin
        if (syst_cnt_latency > syst_cnt_zero)
        t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
        for (j=1; j mult_cnt_zero)) begin
        if (syst_cnt_latency > syst_cnt_zero)
        t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
        for (j=1; j mult_cnt_zero) && !mult_cnt_s_done;

    // Subtractor clock enable: registered, asserted when the latency shift
    // register is done, syst_cnt_latency is zero and mult_cnt_qn has left
    // zero.
    always @(posedge clk)
        pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);

    // Adder carry chain: the carry-in is forced to zero while mult_cnt_qn
    // is zero, otherwise it takes the adder's registered carry-out.
    always @(posedge clk)
        //
        if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
            pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;

    // Subtractor borrow chain: the borrow-in is forced to zero while
    // mult_cnt_s is zero, otherwise it takes the subtractor's borrow-out.
    always @(posedge clk)
        //
        if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
            pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;

    // 32-bit adder with carry in/out; module defined outside this file.
    modexpa7_adder32 pe_add_inst
    (
        .clk (clk),
        .ce (pe_add_ce),
        .a (pe_add_a2),
        .b (pe_add_b0),
        .c_in (pe_add_c_in),
        .s (pe_add_s),
        .c_out (pe_add_c_out)
    );

    // 32-bit subtractor with borrow in/out; module defined outside this file.
    modexpa7_subtractor32 pe_sub_inst
    (
        .clk (clk),
        .ce (pe_sub_ce),
        .a (pe_sub_a0),
        .b (pe_sub_b0),
        .b_in (pe_sub_b_in),
        .d (pe_sub_d),
        .b_out (pe_sub_b_out)
    );

    // All operand registers below share one capture condition: the FSM is
    // in PIPE_CRUNCH, the latency shift register is done and
    // syst_cnt_latency is zero.

    // Three-stage delay line for mul_ab_p[0]; the adder's `a` input is fed
    // from the last stage (pe_add_a2).
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
            pe_add_a0 <= mul_ab_p[0];
            pe_add_a1 <= pe_add_a0;
            pe_add_a2 <= pe_add_a1;
        end

    // The subtractor's minuend is the adder's sum output.
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            pe_sub_a0 <= pe_add_s;

    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            pe_add_b0 <= mul_qn_p[0];

    // The subtrahend is zero until mult_cnt_s exceeds mult_cnt_half; after
    // that it is the word currently read from the n memory.
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;

    // n address: walks zero..last during the INIT_* states; during the
    // PIPE phase it is rewound to zero when mult_cnt_s equals mult_cnt_half
    // and incremented on every later entry to PIPE_RELOAD. The two `if`
    // conditions are mutually exclusive.
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_INIT_ZERO_ADDR: n_bram_addr_reg <= bram_addr_zero;
            FSM_STATE_INIT_NEXT_ADDR: n_bram_addr_reg <= n_bram_addr_next;
            FSM_STATE_PIPE_RELOAD: begin
                if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
                if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
            end
        endcase

    //
    // Ready Flag Logic
    //
    // rdy is set by reset, cleared in IDLE on the clock where ena_trig is
    // seen, and set again when the FSM is in STOP.
    //
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0) rdy_reg <= 1'b1;
        else begin
            if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
            if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
        end

    //
    // Multiplier operand staging: mul_ab_p[0] and mul_q_p[0] are captured
    // into intermediate registers during PIPE_CRUNCH and handed to
    // mul_q_a / mul_qn_a while the FSM is in PIPE_RELOAD.
    //
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            mul_q_a_int <= mul_ab_p[0];

    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            mul_qn_a_int <= mul_q_p[0];

    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_PIPE_RELOAD)
            mul_q_a <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?..

    // mul_qn_a is masked to zero once mult_cnt_qn reaches mult_cnt_half.
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_PIPE_RELOAD)
            mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;

    //
    // Debug
    //
    //always @(posedge clk) begin
        //
        //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            //$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
        //
        //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
            //$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
        //
        //if (fsm_state == FSM_STATE_PIPE_RELOAD)
            //$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
        //
        //if (fsm_state == FSM_STATE_PIPE_RELOAD)
            //$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
        //
    //end

    //
    // Internal result memories.
    //
    // bram_s stores adder outputs (pe_add_s) and bram_sn stores subtractor
    // outputs (pe_sub_d). Both are read at the same address, s_bram_addr,
    // during the SAVE_* states.
    //
    wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd;
    reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr;
    wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
    reg s_bram_en;

    wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd;
    reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr;
    wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
    reg sn_bram_en;

    assign s_bram_addr_rd = s_bram_addr;
    assign sn_bram_addr_rd = s_bram_addr;

    wire [31: 0] s_bram_din;
    wire [31: 0] s_bram_dout;

    wire [31: 0] sn_bram_din;
    wire [31: 0] sn_bram_dout;

    assign s_bram_din = pe_add_s;
    assign sn_bram_din = pe_sub_d;

    // Write enables: the adder/subtractor clock enable delayed by one
    // clock, qualified by the respective counter being above mult_cnt_half.
    always @(posedge clk)
        //
        s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);

    always @(posedge clk)
        //
        sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);

    // Write addresses: rewound to zero when the respective counter equals
    // mult_cnt_half, then incremented after each enabled write until the
    // last address is reached. If both conditions hold in the same clock
    // the increment wins, because it is the later assignment.
    always @(posedge clk) begin
        //
        if (pe_add_ce && (mult_cnt_qn == mult_cnt_half))
            s_bram_addr_wr <= bram_addr_zero;

        if (s_bram_en && (s_bram_addr_wr < bram_addr_last))
            s_bram_addr_wr <= s_bram_addr_wr_next;
    end

    always @(posedge clk) begin
        //
        if (pe_sub_ce && (mult_cnt_s == mult_cnt_half))
            sn_bram_addr_wr <= bram_addr_zero;

        if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last))
            sn_bram_addr_wr <= sn_bram_addr_wr_next;
    end

    // Port A is used for writing only (its read output is left
    // unconnected); port B is used for reading. Module defined outside
    // this file.
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_s (.clk(clk), .a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(), .b_addr(s_bram_addr_rd), .b_out(s_bram_dout));

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_sn (.clk(clk), .a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(), .b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));

    //
    // Result output.
    //
    // r_bram_en is high one clock after the FSM is in SAVE_ZERO_ADDR or
    // SAVE_NEXT_ADDR; the external write strobe is r_bram_en delayed by
    // one more clock, so it lines up with the registered data and address.
    //
    reg r_bram_en;

    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_SAVE_ZERO_ADDR,
            FSM_STATE_SAVE_NEXT_ADDR: r_bram_en <= 1'b1;
            default: r_bram_en <= 1'b0;
        endcase

    reg r_bram_wr_reg;
    assign r_bram_wr = r_bram_wr_reg;

    always @(posedge clk)
        //
        r_bram_wr_reg <= r_bram_en;

    // The adder result (s) is chosen over the subtractor result (sn) when
    // the subtractor reports a borrow-out and the adder reports no
    // carry-out; otherwise the subtractor result is used.
    wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;

    reg [31: 0] r_bram_in_reg;
    assign r_bram_in = r_bram_in_reg;

    always @(posedge clk)
        //
        if (r_bram_en)
            r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;

    // The write address is the delayed read address.
    always @(posedge clk)
        //
        if (r_bram_en)
            r_bram_addr_reg <= s_bram_addr_dly;

    //
    // FSM Transition Logic
    //
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
        else fsm_state <= fsm_next_state;

    // Next-state decode. The default assignment keeps this block purely
    // combinational and sends any state encoding not listed in the case
    // (4'd10..4'd15) to STOP.
    always @* begin
        //
        fsm_next_state = FSM_STATE_STOP;
        //
        case (fsm_state)

            // Wait for a rising edge on ena.
            FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
                            else fsm_next_state = FSM_STATE_IDLE;

            // Walk the operand addresses from zero to the last address.
            FSM_STATE_INIT_ZERO_ADDR: fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
            FSM_STATE_INIT_NEXT_ADDR: if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
                                      else fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
            FSM_STATE_INIT_LAST_ADDR: fsm_next_state = FSM_STATE_PIPE_CRUNCH;

            // Stay in CRUNCH until both syst_cnt_done and
            // pe_latency_ab_msb_done are set, then RELOAD; repeat until
            // mult_cnt_s_done.
            FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) fsm_next_state = pe_latency_ab_msb_done ? FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
                                   else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
            FSM_STATE_PIPE_RELOAD: if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
                                   else fsm_next_state = FSM_STATE_PIPE_CRUNCH;

            // Walk the result addresses from zero to the last address.
            FSM_STATE_SAVE_ZERO_ADDR: fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
            FSM_STATE_SAVE_NEXT_ADDR: if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
                                      else fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
            FSM_STATE_SAVE_LAST_ADDR: fsm_next_state = FSM_STATE_STOP;

            FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;

        endcase
    end

endmodule

//======================================================================
// End of file
//======================================================================