//====================================================================== // // modexpa7_systolic_multiplier_array.v // ----------------------------------------------------------------------------- // Systolic Montgomery multiplier Processing Element Array // // Authors: Pavel Shatov // // Copyright (c) 2017, NORDUnet A/S All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // - Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // - Neither the name of the NORDUnet nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // //====================================================================== module modexpa7_systolic_multiplier_array # ( parameter OPERAND_ADDR_WIDTH = 4, parameter SYSTOLIC_ARRAY_POWER = 2 ) ( input clk, input rst_n, input ena, output rdy, input crt, output [OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0] loader_addr_rd, input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide, input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide, output [ OPERAND_ADDR_WIDTH - 1 : 0] a_bram_addr, output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr, output [ 32 - 1 : 0] p_bram_in, output p_bram_wr, input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words, input [ OPERAND_ADDR_WIDTH : 0] p_num_words ); /* * Include Settings */ `include "pe/modexpa7_primitive_switch.v" `include "modexpa7_settings.v" /* * FSM Declaration */ localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; localparam [ 7: 0] FSM_STATE_MULT_START = 8'h11; localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12; localparam [ 7: 0] FSM_STATE_MULT_RELOAD = 8'h13; localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h14; localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; /* * FSM State / Next State */ reg [ 7: 0] fsm_state = FSM_STATE_IDLE; reg [ 7: 0] fsm_next_state; /* * Enable Delay and Trigger */ reg ena_dly = 1'b0; // delay enable by one clock cycle always @(posedge clk) ena_dly <= ena; // trigger new operation when enable goes high wire ena_trig = ena && !ena_dly; /* * Ready Flag Logic */ reg rdy_reg = 1'b1; assign rdy = rdy_reg; always @(posedge clk or negedge rst_n) // reset flag if (rst_n == 1'b0) rdy_reg <= 1'b1; else begin // clear flag when operation is started if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; // set flag after operation is finished if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; end /* * Parameters Latch */ reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch; reg [OPERAND_ADDR_WIDTH :0] p_num_words_latch; // save number of words in n when new operation starts always @(posedge clk) // if ((fsm_state == FSM_STATE_IDLE) && ena_trig) begin n_num_words_latch <= n_num_words; p_num_words_latch <= p_num_words; end /* * Systolic Cycle Counters */ // handy values wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; // counters reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; // handy increment values wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; // handy stop flags wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; always @(posedge clk) // case (fsm_next_state) FSM_STATE_MULT_START, FSM_STATE_MULT_RELOAD: // syst_cnt_load <= syst_cnt_zero; FSM_STATE_MULT_CRUNCH: // syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; endcase always @(posedge clk) // if (fsm_state == FSM_STATE_MULT_CRUNCH) begin // if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; else if (shreg_now_unloading) syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; // end /* * Timing Shift Registers */ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; wire shreg_done_load = shreg_load[syst_cnt_last]; wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; wire shreg_done_unload = shreg_unload[syst_cnt_last]; reg shreg_now_loading; //reg shreg_now_latency; reg shreg_now_unloading; reg shreg_done_latency_dly; always @(posedge clk) shreg_done_latency_dly <= shreg_done_latency; always @(posedge clk) // case (fsm_state) // FSM_STATE_MULT_START, FSM_STATE_MULT_RELOAD: begin // shreg_now_loading <= 1'b1; //shreg_now_latency <= 1'b1; shreg_now_unloading <= 1'b0; // shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1}; shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; // end // FSM_STATE_MULT_CRUNCH: begin // shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0}; shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0}; shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; // if (shreg_done_load) shreg_now_loading <= 1'b0; //if (shreg_done_latency) shreg_now_latency <= 1'b0; if (shreg_done_latency) shreg_now_unloading <= 1'b1; else if (shreg_done_unload) shreg_now_unloading <= 1'b0; end // default: begin shreg_now_loading <= 1'b0; //shreg_now_latency <= 1'b0; shreg_now_unloading <= 1'b0; end // endcase /* * Systolic Array of Processing Elements */ reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1]; reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1]; wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1]; wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1]; wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1]; wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1]; /* * FIFOs */ reg fifo_c_rst; reg fifo_t_rst; reg fifo_c_wren; wire fifo_c_rden; reg fifo_t_wren; wire fifo_t_rden; reg [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din; wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout; wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din; wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout; wire [32 * 1 - 1 : 0] fifo_t_din_msb; reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] fifo_t_din_lsb; assign fifo_t_din = {fifo_t_din_msb, fifo_t_din_lsb}; modexpa7_simple_fifo # ( .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH), .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH) ) fifo_c ( .clk (clk), .rst (fifo_c_rst), .wr_en (fifo_c_wren), .d_in (fifo_c_din), .rd_en (fifo_c_rden), .d_out (fifo_c_dout) ); modexpa7_simple_fifo # ( .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH), .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH) ) fifo_t ( .clk (clk), .rst (fifo_t_rst), .wr_en (fifo_t_wren), .d_in (fifo_t_din), .rd_en (fifo_t_rden), .d_out (fifo_t_dout) ); genvar i; generate for (i=0; i {1'b0, a_addr}) ? 32'd0 : pe_a_wide[32 * (j + 1) - 1 -: 32]; pe_b[j] <= pe_b_wide[32 * (j + 1) - 1 -: 32]; end else begin pe_a[j] <= 32'hXXXXXXXX; pe_b[j] <= 32'hXXXXXXXX; end /* * */ always @(posedge clk) // case (fsm_next_state) FSM_STATE_MULT_RELOAD: p_wren <= 1'b1; default: p_wren <= 1'b0; endcase always @(posedge clk) // if ((fsm_state == FSM_STATE_MULT_CRUNCH) && shreg_done_latency_dly) p_data_in <= crt ? pe_a_wide[31:0] : pe_p[0]; /* * Block Memory Address Control */ always @(posedge clk) begin // case (fsm_state) FSM_STATE_MULT_START: p_addr <= bram_addr_zero; FSM_STATE_MULT_RELOAD: p_addr <= p_addr_next; endcase // case (fsm_next_state) FSM_STATE_MULT_START: a_addr <= bram_addr_zero; FSM_STATE_MULT_RELOAD: if (crt) a_addr <= !a_addr_done_crt ? a_addr_next : a_addr; else a_addr <= !a_addr_done ? a_addr_next : a_addr; endcase // end /* * Loader Address Control */ reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr; assign loader_addr_rd = loader_addr; always @(posedge clk) // case (fsm_next_state) // FSM_STATE_MULT_START, FSM_STATE_MULT_RELOAD: loader_addr <= syst_cnt_zero; // FSM_STATE_MULT_CRUNCH: // loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; // endcase /* * FSM Process */ always @(posedge clk or negedge rst_n) // if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; else fsm_state <= fsm_next_state; /* * FSM Transition Logic */ always @* begin // fsm_next_state = FSM_STATE_STOP; // case (fsm_state) // FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_MULT_START; else fsm_next_state = FSM_STATE_IDLE; // FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH; FSM_STATE_MULT_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_RELOAD; else fsm_next_state = FSM_STATE_MULT_CRUNCH; FSM_STATE_MULT_RELOAD: if (p_addr_done) fsm_next_state = FSM_STATE_MULT_FINAL; else fsm_next_state = FSM_STATE_MULT_CRUNCH; FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP; // FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; // endcase // end endmodule //====================================================================== // End of file //======================================================================