//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Module overview (derived from the FSM below):
//
//   A rising edge on `ena` starts one operation. The FSM then runs six
//   phases in a fixed order and returns to IDLE through STOP:
//
//     1. LOAD_B        \
//     2. LOAD_N_COEFF   > operand loading (START / SHIFT / WRITE / FINAL)
//     3. LOAD_N        /
//     4. MULT_A_B         \
//     5. MULT_AB_N_COEFF   > multiplication (START / CRUNCH / RELOAD / FINAL)
//     6. MULT_Q_N         /
//
//   `rdy` is low while an operation is in progress.
//
// NOTE(review): this copy of the file is damaged. In three places the text
// between a "for (x=0; x" and a later ">" is missing (most likely stripped as
// if it were markup). The loader instantiation, the processing element array,
// the address generators, the shift registers and the declarations of several
// signals used below (pe_a, pe_b, pe_t, pe_c_in, pe_p, pe_t_mem,
// pe_c_out_mem, a_addr, q_addr, q_data_out, bram_addr_zero, j,
// shreg_done_unload, ab_addr_ext_done, q_addr_done, qn_addr_ext_done) are not
// present. The damaged fragments are kept verbatim and flagged where they
// occur; restore them from a pristine copy before using this file.
//
module modexpa7_systolic_multiplier #
    (
        //
        // This sets the address widths of memory buffers. Internal data
        // width is 32 bits, so for e.g. 2048-bit operands buffers must store
        // 2048 / 32 = 64 words, and these need 6-bit address bus, because
        // 2 ** 6 = 64.
        //
        parameter OPERAND_ADDR_WIDTH = 4,

        //
        // Used below as the width of mult_cnt and as the lower bound of the
        // ab_num_words_latch slice that forms syst_cnt_last.
        // Presumably log2 of the number of processing elements
        // (SYSTOLIC_ARRAY_LENGTH) -- TODO confirm against modexpa7_settings.v.
        //
        parameter SYSTOLIC_ARRAY_POWER = 2
    )
    (
        input clk,      // clock, all registers are updated on its rising edge
        input rst_n,    // active-low asynchronous reset (FSM state and ready flag)

        input ena,      // a low-to-high transition starts a new operation
        output rdy,     // high when idle / finished, low while busy

        // NOTE(review): the logic driving the address outputs, r_bram_in and
        // r_bram_wr is not visible in this copy of the file.
        output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
        output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,

        input [ 32-1:0] a_bram_out,
        input [ 32-1:0] b_bram_out,
        input [ 32-1:0] n_bram_out,
        input [ 32-1:0] n_coeff_bram_out,

        output [ 32-1:0] r_bram_in,
        output r_bram_wr,

        // Latched into ab_num_words_latch when an operation starts.
        // NOTE(review): the counters below stop when they *equal* the upper
        // bits of this value, so it looks like a zero-based index of the last
        // word rather than a word count -- confirm against the caller.
        input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
    );


    //
    // Include Settings
    //
    // SYSTOLIC_CNTR_WIDTH and SYSTOLIC_ARRAY_LENGTH are used below but not
    // defined in this file; presumably they come from these includes.
    //
    `include "pe/modexpa7_primitive_switch.v"
    `include "modexpa7_settings.v"


    //
    // FSM Declaration
    //
    // The upper nibble of each code selects the phase, the lower nibble
    // selects the step within the phase.
    //
    localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;

    localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
    localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
    localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
    localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;

    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;

    localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
    localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
    localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
    localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;

    localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
    localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
    localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
    localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;

    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;

    localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
    localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
    localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
    localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;

    localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;


    //
    // FSM State / Next State
    //
    // fsm_state is the registered state, fsm_next_state is computed
    // combinationally by the transition logic at the end of the module.
    //
    reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
    reg [ 7: 0] fsm_next_state;


    //
    // Enable Delay and Trigger
    //
    reg ena_dly = 1'b0;

    /* delay enable by one clock cycle (not affected by rst_n) */
    always @(posedge clk) ena_dly <= ena;

    /* trigger new operation when enable goes high (one-cycle pulse) */
    wire ena_trig = ena && !ena_dly;


    //
    // Ready Flag Logic
    //
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)

        /* reset flag */
        if (rst_n == 1'b0) rdy_reg <= 1'b1;
        else begin

            /* clear flag when operation is started */
            if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;

            /* set flag after operation is finished */
            if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;

        end


    //
    // Parameters Latch
    //
    reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;

    /* save number of words in a and b when new operation starts */
    /* (LOAD_B_START is only ever entered from IDLE, on ena_trig) */
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_LOAD_B_START)
            ab_num_words_latch <= ab_num_words;


    //
    // Systolic Cycle Counters
    //

    /* handy values */
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};

    /* terminal count: the latched word number with its low              */
    /* SYSTOLIC_ARRAY_POWER bits dropped                                 */
    /* NOTE(review): assumes SYSTOLIC_CNTR_WIDTH equals                  */
    /* OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER -- TODO confirm         */
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

    /* counters */
    /* syst_cnt_init is updated by the initialization logic below; the   */
    /* update logic for syst_cnt_load and syst_cnt_unload is not visible */
    /* in this copy of the file                                          */
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;

    /* handy increment values */
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;

    /* handy stop flags (high while the counter equals syst_cnt_last) */
    wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0;
    wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
    wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;

    /* delayed load counter (one clock behind syst_cnt_load), used to */
    /* index pe_t_mem and pe_c_out_mem further below                  */
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
    always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;


    //
    // Multiplier Iteration Counter
    //

    /* handy values (all zeroes / all ones) */
    wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
    wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};

    /* counter */
    reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;

    /* handy increment value and stop flag */
    wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
    wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;


    //
    // Initialization Counter Control Logic
    //
    // Both counters are cleared in every LOAD_*_START state.
    // mult_cnt advances on every LOAD_*_SHIFT cycle (and wraps, since the
    // increment is truncated to SYSTOLIC_ARRAY_POWER bits).
    // syst_cnt_init advances on every LOAD_*_WRITE cycle, but holds its
    // value once it has reached syst_cnt_last.
    // In any other state both counters keep their value.
    //
    always @(posedge clk) begin
        //
        case (fsm_state)
            FSM_STATE_LOAD_B_START,
            FSM_STATE_LOAD_N_COEFF_START,
            FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero;
            FSM_STATE_LOAD_B_SHIFT,
            FSM_STATE_LOAD_N_COEFF_SHIFT,
            FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next;
        endcase
        //
        case (fsm_state)
            FSM_STATE_LOAD_B_START,
            FSM_STATE_LOAD_N_COEFF_START,
            FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero;
            FSM_STATE_LOAD_B_WRITE,
            FSM_STATE_LOAD_N_COEFF_WRITE,
            FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
        endcase
        //
    end


    //
    // Operand Loader
    //

    /*
     * The loader interface is a set of per-element arrays, one entry for
     * each of the SYSTOLIC_ARRAY_LENGTH positions: a 2-bit "msb" address
     * part, a SYSTOLIC_CNTR_WIDTH-bit "lsb" address part, a write enable,
     * 32-bit write data and 32-bit read data.
     *
     * NOTE(review): the storage instances and the logic driving these
     * arrays are in the part of the file that is missing from this copy,
     * so how the parallelized loader actually works still has to be
     * explained here. Presumably the msb part selects one of the banks
     * listed below -- TODO confirm.
     *
     */

    /* loader banks */
    localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
    localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
    localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;

    /* loader input */
    reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
    reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
    reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
    reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];

    /* loader output */
    wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];

    /* generate parallelized loader */

    //
    // Loader currently stores B, N_COEFF and N, it can be coded another way
    // to initially store B, then AB, then Q. Some memory can be saved that way.
    // Maybe later...
    //

    //
    // NOTE(review): DAMAGED SOURCE. Everything between "for (i=0; i" and
    // "syst_cnt_zero)" is missing from this copy, so the generate loop below
    // runs straight into the tail of an unrelated always/case block. The
    // tokens are kept exactly as found; this fragment is not valid Verilog.
    //
    // What the surviving tail shows: depending on a comparison against
    // syst_cnt_zero, pe_p[0] is stored into the last column of pe_t_mem at
    // row syst_cnt_unload-1, or zero is stored into the last column at row
    // syst_cnt_last.
    //
    genvar i;
    generate for (i=0; i syst_cnt_zero)
                    pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0];
                else
                    pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0;
            end
        end
    endcase


    //
    // T and C_IN can be moved to a separate code block
    //
    // Drives the per-element inputs pe_a, pe_b, pe_t and pe_c_in during the
    // MULT_A_B_CRUNCH and MULT_AB_N_COEFF_CRUNCH states.
    //
    // What the surviving text shows for both states:
    //   - pe_a is either zero or the operand word (a_bram_out / q_data_out),
    //     selected by a comparison against the zero-extended a_addr / q_addr;
    //   - pe_b comes from loader_dout[j];
    //   - pe_t and pe_c_in are zero while a_addr / q_addr equals
    //     bram_addr_zero, otherwise they are read from pe_t_mem and
    //     pe_c_out_mem at row syst_cnt_load_dly;
    //   - in the else branch all four inputs are driven to X (don't care).
    //
    always @(posedge clk) begin
        //
        if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
            //
            // NOTE(review): DAMAGED SOURCE. The text between "for (j=0; j"
            // and "{1'b0, a_addr})" is missing (loop bounds, the enclosing
            // "if (...) begin" and the left-hand side of the comparison).
            // Tokens kept exactly as found.
            //
            for (j=0; j {1'b0, a_addr}) ? 32'd0 : a_bram_out;
                    pe_b[j] <= loader_dout[j];
                    pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
                    pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
                end else begin
                    pe_a[j] <= 32'hXXXXXXXX;
                    pe_b[j] <= 32'hXXXXXXXX;
                    pe_t[j] <= 32'hXXXXXXXX;
                    pe_c_in[j] <= 32'hXXXXXXXX;
                end
        //
        if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
            //
            // NOTE(review): DAMAGED SOURCE. The text between "for (j=0; j"
            // and "{1'b0, q_addr})" is missing, same as in the branch above.
            // Tokens kept exactly as found.
            //
            for (j=0; j {1'b0, q_addr}) ? 32'd0 : q_data_out;
                    pe_b[j] <= loader_dout[j];
                    pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
                    pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
                end else begin
                    pe_a[j] <= 32'hXXXXXXXX;
                    pe_b[j] <= 32'hXXXXXXXX;
                    pe_t[j] <= 32'hXXXXXXXX;
                    pe_c_in[j] <= 32'hXXXXXXXX;
                end
        //
        //
    end


    //
    // FSM Process
    //
    // State register: asynchronously forced to IDLE while rst_n is low,
    // otherwise takes fsm_next_state on every rising clock edge.
    //
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
        else fsm_state <= fsm_next_state;


    //
    // FSM Transition Logic
    //
    // Load phases:     START -> SHIFT (repeat until mult_cnt_done) -> WRITE
    //                  WRITE -> SHIFT again, or FINAL once syst_cnt_init_done
    //                  FINAL -> START of the next phase
    //
    // Multiply phases: START -> CRUNCH (repeat until shreg_done_unload) -> RELOAD
    //                  RELOAD -> CRUNCH again, or FINAL once the phase's
    //                  address-done flag is set
    //                  (ab_addr_ext_done / q_addr_done / qn_addr_ext_done)
    //                  FINAL -> START of the next phase, or STOP after Q*N
    //
    // NOTE(review): shreg_done_unload, ab_addr_ext_done, q_addr_done and
    // qn_addr_ext_done are not declared in the visible part of this file.
    //
    always @* begin
        //
        // default: any state code not listed below falls through to STOP,
        // which in turn returns to IDLE
        //
        fsm_next_state = FSM_STATE_STOP;
        //
        case (fsm_state)

            /* wait for the start trigger */
            FSM_STATE_IDLE:
                if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
                else fsm_next_state = FSM_STATE_IDLE;
            //
            FSM_STATE_LOAD_B_START:
                fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
            FSM_STATE_LOAD_B_SHIFT:
                if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
                else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
            FSM_STATE_LOAD_B_WRITE:
                if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
                else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
            FSM_STATE_LOAD_B_FINAL:
                fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
            //
            FSM_STATE_LOAD_N_COEFF_START:
                fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
            FSM_STATE_LOAD_N_COEFF_SHIFT:
                if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
                else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
            FSM_STATE_LOAD_N_COEFF_WRITE:
                if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
                else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
            FSM_STATE_LOAD_N_COEFF_FINAL:
                fsm_next_state = FSM_STATE_LOAD_N_START;
            //
            FSM_STATE_LOAD_N_START:
                fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
            FSM_STATE_LOAD_N_SHIFT:
                if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
                else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
            FSM_STATE_LOAD_N_WRITE:
                if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
                else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
            FSM_STATE_LOAD_N_FINAL:
                fsm_next_state = FSM_STATE_MULT_A_B_START;
            //
            FSM_STATE_MULT_A_B_START:
                fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
            FSM_STATE_MULT_A_B_CRUNCH:
                if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
                else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
            FSM_STATE_MULT_A_B_RELOAD:
                if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
                else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
            FSM_STATE_MULT_A_B_FINAL:
                fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
            //
            FSM_STATE_MULT_AB_N_COEFF_START:
                fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
            FSM_STATE_MULT_AB_N_COEFF_CRUNCH:
                if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
                else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
            FSM_STATE_MULT_AB_N_COEFF_RELOAD:
                if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
                else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
            FSM_STATE_MULT_AB_N_COEFF_FINAL:
                fsm_next_state = FSM_STATE_MULT_Q_N_START;
            //
            FSM_STATE_MULT_Q_N_START:
                fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
            FSM_STATE_MULT_Q_N_CRUNCH:
                if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
                else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
            FSM_STATE_MULT_Q_N_RELOAD:
                if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
                else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
            FSM_STATE_MULT_Q_N_FINAL:
                fsm_next_state = FSM_STATE_STOP;
            //
            /* one-cycle stop state: the ready flag is set here */
            FSM_STATE_STOP:
                fsm_next_state = FSM_STATE_IDLE;

        endcase
        //
    end


endmodule

//======================================================================
// End of file
//======================================================================