//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

module modexpa7_systolic_multiplier #
(
    //
    // This sets the address width of the memory buffers. The internal data
    // width is 32 bits, so e.g. for 2048-bit operands the buffers must store
    // 2048 / 32 = 64 words, which needs a 6-bit address bus, because
    // 2 ** 6 = 64.
    //
    parameter OPERAND_ADDR_WIDTH = 4,

    //
    // Width, in bits, of the loader's inner counter (load_mult_cnt). The
    // same number of low bits is dropped from the latched word count to form
    // the last value of the outer counter (load_syst_cnt).
    // NOTE(review): presumably the systolic array has
    // 2 ** SYSTOLIC_ARRAY_POWER processing elements; SYSTOLIC_ARRAY_LENGTH
    // and SYSTOLIC_CNTR_WIDTH are not declared in the visible part of this
    // module (likely they come from the included settings) -- confirm.
    //
    parameter SYSTOLIC_ARRAY_POWER = 2
)
(
    input clk,      // clock; all registers below update on its rising edge
    input rst_n,    // active-low reset, used asynchronously by the ready flag

    input ena,      // a rising edge on this input triggers a new operation
    output rdy,     // ready flag, see "Ready Flag Logic" below

    // block memory address outputs, one per buffer
    // (presumably: operands a and b, modulus n, Montgomery coefficient and
    //  result r -- confirm against the instantiating module)
    output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,

    // 32-bit read data from the input buffers
    input [ 32-1:0] a_bram_out,
    input [ 32-1:0] b_bram_out,
    input [ 32-1:0] n_bram_out,
    input [ 32-1:0] n_coeff_bram_out,

    // 32-bit write data and write strobe for the r buffer
    output [ 32-1:0] r_bram_in,
    output r_bram_wr,

    // word count for n; sampled when a new operation starts
    input [OPERAND_ADDR_WIDTH-1:0] n_num_words
);

    /*
     * Include Settings
     */
    `include "pe/modexpa7_primitive_switch.v"
    `include "modexpa7_settings.v"

    /*
     * FSM Declaration
     *
     * State encodings are grouped by their upper nibble:
     * 0x1_ for the LOAD states, 0x2_ for the MULT states.
     */
    localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;

    localparam [ 7: 0] FSM_STATE_LOAD_START = 8'h11;
    localparam [ 7: 0] FSM_STATE_LOAD_SHIFT = 8'h12;
    localparam [ 7: 0] FSM_STATE_LOAD_WRITE = 8'h13;
    localparam [ 7: 0] FSM_STATE_LOAD_FINAL = 8'h14;

    localparam [ 7: 0] FSM_STATE_MULT_START = 8'h21;
    localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h22;
    localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h23;

    localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;

    /*
     * FSM State / Next State
     */
    reg [ 7: 0] fsm_state = FSM_STATE_IDLE;     // starts out idle
    reg [ 7: 0] fsm_next_state;                 // no driver in this part of the module

    /*
     * Enable Delay and Trigger
     */
    reg ena_dly = 1'b0;

    // delay enable by one clock cycle
    always @(posedge clk) ena_dly <= ena;

    // trigger new operation when enable goes high
    // (high now, low one cycle ago, i.e. a rising edge of ena)
    wire ena_trig = ena && !ena_dly;

    /*
     * Ready Flag Logic
     *
     * rdy is high after reset, drops when a trigger arrives in the idle
     * state, and is raised again once the FSM reaches the stop state.
     */
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)

        // reset flag
        if (rst_n == 1'b0) rdy_reg <= 1'b1;
        else begin

            // clear flag when operation is started
            if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;

            // set flag after operation is finished
            if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;

        end

    /*
     * Parameters Latch
     */
    reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;

    // one bit wider than n_num_words_latch
    // NOTE(review): no assignment to p_num_words_latch appears next to this
    // declaration -- confirm it is driven further down in the module.
    reg [OPERAND_ADDR_WIDTH :0] p_num_words_latch;

    // save number of words in n when new operation starts
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
            n_num_words_latch <= n_num_words;

    /*
     * Counters
     *
     * load_mult_cnt is the inner counter (SYSTOLIC_ARRAY_POWER bits wide),
     * load_syst_cnt is the outer counter (SYSTOLIC_CNTR_WIDTH bits wide).
     */

    // handy values
    wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
    wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};

    // the inner counter ends at all ones; the outer counter ends at the
    // latched word count with its low SYSTOLIC_ARRAY_POWER bits dropped
    wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
    wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

    // counter
    reg [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt;
    reg [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt;

    // handy increment value and stop flag
    // (the increments are truncated to the counter width, so they wrap)
    wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_next = load_mult_cnt + 1'b1;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_next = load_syst_cnt + 1'b1;

    wire load_mult_cnt_done = (load_mult_cnt == load_mult_cnt_last) ? 1'b1 : 1'b0;
    wire load_syst_cnt_done = (load_syst_cnt == load_syst_cnt_last) ? 1'b1 : 1'b0;

    /*
     * Loader Count Logic
     *
     * LOAD_START clears both counters, LOAD_SHIFT advances the inner
     * counter, LOAD_WRITE advances the outer counter until it reaches its
     * last value and then holds it. In any other state both counters keep
     * their values.
     */
    always @(posedge clk) begin
        //
        case (fsm_state)
            FSM_STATE_LOAD_START: {load_syst_cnt, load_mult_cnt} <= {load_syst_cnt_zero, load_mult_cnt_zero};
            //
            FSM_STATE_LOAD_SHIFT: load_mult_cnt <= load_mult_cnt_next;
            FSM_STATE_LOAD_WRITE: load_syst_cnt <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
        endcase
        //
    end

    /*
     * Wide Operand Loader
     */

    /*
     * The loader has one 32-bit input word and one 32-bit output word per
     * array position (SYSTOLIC_ARRAY_LENGTH of each), plus a single write
     * address, read address and write enable.
     * NOTE(review): presumably the address and enable signals are shared by
     * all positions -- confirm against the generate loop below.
     * TODO: explain how the parallelized loader works.
     */

    // loader input
    reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_wr;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_rd;
    reg loader_wren;
    reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];

    // loader output
    wire [ 32-1:0] loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];

    // array_input (32 bits per array position, packed into one wide bus)
    wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] pe_a_wide;
    wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] pe_b_wide;

    // generate parallelized loader
    genvar i;
    generate for (i=0; i