diff options
Diffstat (limited to 'src/rtl')
-rw-r--r-- | src/rtl/modexpa7_systolic_multiplier.v | 1546 | ||||
-rw-r--r-- | src/rtl/modexpa7_systolic_multiplier_old.v | 1260 |
2 files changed, 1615 insertions, 1191 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v index f53354e..a1e141e 100644 --- a/src/rtl/modexpa7_systolic_multiplier.v +++ b/src/rtl/modexpa7_systolic_multiplier.v @@ -1,1229 +1,393 @@ -//======================================================================
-//
-// modexpa7_systolic_multiplier.v
-// -----------------------------------------------------------------------------
-// Systolic Montgomery multiplier.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2017, NORDUnet A/S All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// - Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may
-// be used to endorse or promote products derived from this software
-// without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//======================================================================
-
-module modexpa7_systolic_multiplier #
- (
- //
- // This sets the address widths of memory buffers. Internal data
- // width is 32 bits, so for e.g. 2048-bit operands buffers must store
- // 2048 / 32 = 64 words, and these need 6-bit address bus, because
- // 2 ** 6 = 64.
- //
- parameter OPERAND_ADDR_WIDTH = 4,
-
- //
- // Explain.
- //
- parameter SYSTOLIC_ARRAY_POWER = 1
- )
- (
- input clk,
- input rst_n,
-
- input ena,
- output rdy,
-
+//====================================================================== +// +// modexpa7_systolic_multiplier.v +// ----------------------------------------------------------------------------- +// Systolic Montgomery multiplier. +// +// Authors: Pavel Shatov +// +// Copyright (c) 2017, NORDUnet A/S All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// - Neither the name of the NORDUnet nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +//====================================================================== + +module modexpa7_systolic_multiplier # + ( + // + // This sets the address widths of memory buffers. Internal data + // width is 32 bits, so for e.g. 2048-bit operands buffers must store + // 2048 / 32 = 64 words, and these need 6-bit address bus, because + // 2 ** 6 = 64. + // + parameter OPERAND_ADDR_WIDTH = 4, + + // + // Explain. + // + parameter SYSTOLIC_ARRAY_POWER = 2 + ) + ( + input clk, + input rst_n, + + input ena, + output rdy, + output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, -
+ input [ 32-1:0] a_bram_out, input [ 32-1:0] b_bram_out, input [ 32-1:0] n_bram_out, input [ 32-1:0] n_coeff_bram_out, -
- output [ 32-1:0] r_bram_in,
- output r_bram_wr,
-
- input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
- );
-
-
- //
- // Include Settings
- //
- `include "pe/modexpa7_primitive_switch.v"
- `include "modexpa7_settings.v"
-
-
- //
- // FSM Declaration
- //
- localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
-
- localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
- localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
- localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
- localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;
-
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;
-
- localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
- localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
- localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
- localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;
-
- localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;
-
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;
-
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;
-
- localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71;
- localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72;
- localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73;
-
- localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
-
- //
- // FSM State / Next State
- //
- reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
- reg [ 7: 0] fsm_next_state;
-
-
- //
- // Enable Delay and Trigger
- //
- reg ena_dly = 1'b0;
-
- /* delay enable by one clock cycle */
- always @(posedge clk) ena_dly <= ena;
-
- /* trigger new operation when enable goes high */
- wire ena_trig = ena && !ena_dly;
-
-
- //
- // Ready Flag Logic
- //
- reg rdy_reg = 1'b1;
- assign rdy = rdy_reg;
-
- always @(posedge clk or negedge rst_n)
-
- /* reset flag */
- if (rst_n == 1'b0) rdy_reg <= 1'b1;
- else begin
-
- /* clear flag when operation is started */
- if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
-
- /* set flag after operation is finished */
- if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
-
- end
-
-
- //
- // Parameters Latch
- //
- reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;
-
- /* save number of words in a and b when new operation starts */
- always @(posedge clk)
- //
- if (fsm_next_state == FSM_STATE_LOAD_B_START)
- ab_num_words_latch <= ab_num_words;
-
-
- //
- // Systolic Cycle Counters
- //
-
- /* handy values */
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
- /* counters */
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
-
- /* handy increment values */
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
-
- /* handy stop flags */
- wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
- /* delayed load counter */
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
- always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
-
-
- //
- // Multiplier Iteration Counter
- //
-
- /* handy values */
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
-
- /* counter */
- reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;
-
- /* handy increment value and stop flag */
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
- wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
-
-
- //
- // Initialization Counter Control Logic
- //
- always @(posedge clk) begin
- //
- case (fsm_state)
- FSM_STATE_LOAD_B_START,
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero;
+ + output [ 32-1:0] r_bram_in, + output r_bram_wr, + + input [OPERAND_ADDR_WIDTH-1:0] n_num_words + ); + + + /* + * Include Settings + */ + `include "pe/modexpa7_primitive_switch.v" + `include "modexpa7_settings.v" + + + /* + * FSM Declaration + */ + localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; + + localparam [ 7: 0] FSM_STATE_LOAD_START = 8'h11; + localparam [ 7: 0] FSM_STATE_LOAD_SHIFT = 8'h12; + localparam [ 7: 0] FSM_STATE_LOAD_WRITE = 8'h13; + localparam [ 7: 0] FSM_STATE_LOAD_FINAL = 8'h14; +
+ localparam [ 7: 0] FSM_STATE_MULT_START = 8'h21; + localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h22; + localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h23; + + localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
+ + /* + * FSM State / Next State + */ + reg [ 7: 0] fsm_state = FSM_STATE_IDLE; + reg [ 7: 0] fsm_next_state; + + + /* + * Enable Delay and Trigger + */ + reg ena_dly = 1'b0; + + // delay enable by one clock cycle + always @(posedge clk) ena_dly <= ena; + + // trigger new operation when enable goes high + wire ena_trig = ena && !ena_dly; + + + /* + * Ready Flag Logic + */ + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; + + always @(posedge clk or negedge rst_n) + + // reset flag + if (rst_n == 1'b0) rdy_reg <= 1'b1; + else begin + + // clear flag when operation is started + if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; + + // set flag after operation is finished + if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; + + end + + + /* + * Parameters Latch + */ + reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch; + + // save number of words in n when new operation starts + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_IDLE) && ena_trig) + n_num_words_latch <= n_num_words; - FSM_STATE_LOAD_B_SHIFT,
- FSM_STATE_LOAD_N_COEFF_SHIFT,
- FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next;
- endcase
- //
- case (fsm_state)
- FSM_STATE_LOAD_B_START,
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero;
-
- FSM_STATE_LOAD_B_WRITE,
- FSM_STATE_LOAD_N_COEFF_WRITE,
- FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
- endcase
- //
- end
-
-
- //
- // Operand Loader
- //
-
- /*
- * Explain how parallelized loader works here...
- *
- */
-
- /* loader banks */
- localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
- localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
- localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;
-
- /* loader input */
- reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
-
- /* loader output */
- wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];
- /* generate parallelized loader */
-
- //
- // The loader currently stores B, N_COEFF and N; it could be coded another way
- // to initially store B, then AB, then Q. Some memory could be saved that way.
- // Maybe later...
- //
-
- genvar i;
- generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
- //
- begin : gen_bram_1rw_readfirst_loader
- //
- bram_1rw_readfirst #
- (
- .MEM_WIDTH (32),
- .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2)
- )
- bram_loader
- (
- .clk (clk),
- .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}),
- .a_wr (loader_wren[i]),
- .a_in (loader_din[i]),
- .a_out (loader_dout[i])
- );
- //
- end
- //
- endgenerate
-
-
- //
- // Block Memory Addresses
- //
-
/*
- * Explain why there are two memory sizes.
- *
+ * Counters
*/
-
- /* the very first addresses */
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}};
- wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
-
- /* the very last addresses */
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch};
- wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1};
-
- /* address registers */
- reg [OPERAND_ADDR_WIDTH-1:0] a_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
- reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext;
- reg [OPERAND_ADDR_WIDTH-1:0] q_addr;
- reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext;
- reg [OPERAND_ADDR_WIDTH-1:0] s_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] sn_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
-
- /* handy increment values */
- wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
-
- /* handy stop flags */
- wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
- wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
- wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
- /* delayed B address */
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
- always @(posedge clk) b_addr_dly <= b_addr;
-
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly;
- always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
-
- reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly;
- always @(posedge clk) n_addr_dly <= n_addr;
-
- /* map registers to top-level ports */
- assign a_bram_addr = a_addr;
- assign b_bram_addr = b_addr;
- assign n_coeff_bram_addr = n_coeff_addr;
- assign n_bram_addr = n_addr;
- assign r_bram_addr = r_addr;
-
-
- //
- // Flag
- //
- reg flag_select_s;
-
-
- //
- // Memory Address Control Logic
- //
- always @(posedge clk) begin
- //
- case (fsm_next_state)
- FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero;
- FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero;
- FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero;
- FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next;
- FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next;
- FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
- endcase
- //
- case (fsm_state)
- FSM_STATE_MULT_Q_N_RELOAD:
- if (qn_addr_ext == {1'b0, bram_addr_last})
- n_addr <= bram_addr_zero;
- else if (qn_addr_ext > {1'b0, bram_addr_last})
- n_addr <= n_addr_next;
+ // handy values + wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}}; + wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; +
+ wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}}; + wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; + + // counter + reg [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt; + reg [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt; + + // handy increment value and stop flag + wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_next = load_mult_cnt + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_next = load_syst_cnt + 1'b1; +
+ wire load_mult_cnt_done = (load_mult_cnt == load_mult_cnt_last) ? 1'b1 : 1'b0; + wire load_syst_cnt_done = (load_syst_cnt == load_syst_cnt_last) ? 1'b1 : 1'b0; - endcase
- //
- case (fsm_state)
- FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
- FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
- endcase
- //
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
- FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
- endcase
- //
- end
-
-
- //
- // Internal Memories
- //
-
- /* memory inputs */
- reg [31: 0] ab_data_in;
- reg [31: 0] q_data_in;
- reg [31: 0] qn_data_in;
- wire [31: 0] s_data_in;
- wire [31: 0] sn_data_in;
- reg [31: 0] r_data_in;
-
- /* memory outputs */
- wire [31: 0] ab_data_out;
- wire [31: 0] q_data_out;
- wire [31: 0] qn_data_out;
- wire [31: 0] s_data_out;
- wire [31: 0] sn_data_out;
-
- /* write enables */
- reg ab_wren;
- reg q_wren;
- reg qn_wren;
- reg s_wren;
- reg sn_wren;
- reg r_wren;
-
- /* map */
- assign r_bram_in = r_data_in;
- assign r_bram_wr = r_wren;
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
-
-
- //
- // Wide Operand Loader
- //
- integer j;
-
- /* shift logic */
- always @(posedge clk)
- //
- case (fsm_state)
- //
- FSM_STATE_LOAD_B_SHIFT: begin
-
- /* update the rightmost part of loader buffer */
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
-
- /* shift the loader buffer to the left */
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_din[j-1] <= loader_din[j];
-
- end
- //
- FSM_STATE_LOAD_N_COEFF_SHIFT: begin
-
- /* update the rightmost part of loader buffer */
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
-
- /* shift the loader buffer to the left */
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_din[j-1] <= loader_din[j];
-
- end
- //
- FSM_STATE_LOAD_N_SHIFT: begin
- /* update the rightmost part of loader buffer */
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
+ /* + * Loader Count Logic + */ + always @(posedge clk) begin + // + case (fsm_state) + FSM_STATE_LOAD_START: {load_syst_cnt, load_mult_cnt} <= {load_syst_cnt_zero, load_mult_cnt_zero};
+ // + FSM_STATE_LOAD_SHIFT: load_mult_cnt <= load_mult_cnt_next; + FSM_STATE_LOAD_WRITE: load_syst_cnt <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt; + endcase + // + end +
- /* shift the loader buffer to the left */
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_din[j-1] <= loader_din[j];
-
- end
- //
- endcase
-
-
- /* write enable logic */
- always @(posedge clk)
- //
- case (fsm_next_state)
-
- FSM_STATE_LOAD_B_WRITE,
- FSM_STATE_LOAD_N_COEFF_WRITE,
- FSM_STATE_LOAD_N_WRITE:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_wren[j] <= 1'b1;
-
- default:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_wren[j] <= 1'b0;
-
- endcase
-
- /* loader address update logic */
- always @(posedge clk) begin
- //
- case (fsm_state)
-
- FSM_STATE_LOAD_B_START,
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_LOAD_N_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= syst_cnt_zero;
-
- FSM_STATE_LOAD_B_WRITE,
- FSM_STATE_LOAD_N_COEFF_WRITE,
- FSM_STATE_LOAD_N_WRITE:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-
- endcase
- //
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= syst_cnt_zero;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
- endcase
- //
- case (fsm_next_state)
-
- FSM_STATE_LOAD_B_START,
- FSM_STATE_MULT_A_B_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
-
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_MULT_AB_N_COEFF_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
-
- FSM_STATE_LOAD_N_START,
- FSM_STATE_MULT_Q_N_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
-
- endcase
- //
- end
-
-
- //
- // Systolic Array of Processing Elements
- //
- reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [31: 0] pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];
-
-
- //
- // These can be turned into a FIFO (maybe later?)...
- //
- //reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
- //reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-
- reg fifo_c_rst;
- reg fifo_t_rst;
-
- wire fifo_c_wren;
- wire fifo_c_rden;
-
- wire fifo_t_wren;
- wire fifo_t_rden;
-
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout;
-
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout;
-
- /**/
- modexpa7_simple_fifo #
- (
- .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
- .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
- )
- fifo_c
- (
- .clk (clk),
- .rst (fifo_c_rst),
- .wr_en (fifo_c_wren),
- .d_in (fifo_c_din),
- .rd_en (fifo_c_rden),
- .d_out (fifo_c_dout)
- );
-
- modexpa7_simple_fifo #
- (
- .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
- .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
- )
- fifo_t
- (
- .clk (clk),
- .rst (fifo_t_rst),
- .wr_en (fifo_t_wren),
- .d_in (fifo_t_din),
- .rd_en (fifo_t_rden),
- .d_out (fifo_t_dout)
- );
-
- generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
- begin : modexpa7_systolic_pe_multiplier
- modexpa7_systolic_pe systolic_pe_inst
- (
- .clk (clk),
- .a (pe_a[i]),
- .b (pe_b[i]),
- .t (pe_t[i]),
- .c_in (pe_c_in[i]),
- .p (pe_p[i]),
- .c_out (pe_c_out[i])
- );
- assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
- assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
- assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
- always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
- end
+ /* + * Wide Operand Loader + */ + + /* + * Explain how parallelized loader works here... + * + */ + + + // loader input + reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr[0:SYSTOLIC_ARRAY_LENGTH-1]; + reg loader_wren[0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1]; + + // loader output + wire [ 32-1:0] loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1]; + + // generate parallelized loader + genvar i; + generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1) + // + begin : gen_bram_1rw_readfirst_loader + // + bram_1rw_readfirst # + ( + .MEM_WIDTH (32), + .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH) + ) + bram_loader + ( + .clk (clk), + .a_addr (loader_addr[i]), + .a_wr (loader_wren[i]), + .a_in (loader_din[i]), + .a_out (loader_dout[i]) + ); + // + end + // endgenerate
-
-
-
-
- //
- // Shift Registers
- //
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
- reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
-
- wire shreg_done_load = shreg_load[syst_cnt_last];
- wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
- wire shreg_done_unload = shreg_unload[syst_cnt_last];
-
- reg shreg_now_loading;
- reg shreg_now_latency;
- reg shreg_now_unloading;
-
- reg shreg_done_latency_dly;
-
- always @(posedge clk)
- shreg_done_latency_dly <= shreg_done_latency;
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_LOAD_N_FINAL: begin
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- end
- //
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD: begin
- shreg_now_loading <= 1'b1;
- shreg_now_latency <= 1'b1;
- shreg_now_unloading <= 1'b0;
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- end
- //
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: begin
- shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
- shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
- shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
-
- if (shreg_done_load) shreg_now_loading <= 1'b0;
- if (shreg_done_latency) shreg_now_latency <= 1'b0;
- if (shreg_done_latency) shreg_now_unloading <= 1'b1;
- else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
- end
- //
- default: begin
- shreg_now_loading <= 1'b0;
- shreg_now_latency <= 1'b0;
- shreg_now_unloading <= 1'b0;
- end
- //
- endcase
-
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START: fifo_c_rst <= 1'b1;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0;
- endcase
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START: fifo_t_rst <= 1'b1;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0;
- endcase
-
-
- reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] pe_p_msb_dly;
-
- always @(posedge clk)
- //
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j];
-
- wire [31: 0] pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;
- assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};
+ /* + * Block Memory Addresses + */ + + /* + * Explain why there are two memory sizes. + */ + + // the very first addresses + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}}; + wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}}; + + // the very last addresses + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {n_num_words_latch}; + wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {n_num_words_latch, 1'b1}; + + // address registers + reg [OPERAND_ADDR_WIDTH-1:0] b_addr; + + // handy increment values + wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; + + // handy stop flags + wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; + + // delayed addresses + reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
+ + always @(posedge clk) b_addr_dly <= b_addr; + + + // map registers to top-level ports + assign b_bram_addr = b_addr; -
-
- reg shreg_now_unloading_dly;
- always @(posedge clk)
- shreg_now_unloading_dly <= shreg_now_unloading;
-
- assign fifo_c_wren = shreg_now_unloading_dly;
- assign fifo_c_rden = shreg_now_loading;
-
- assign fifo_t_wren = shreg_now_unloading_dly;
- assign fifo_t_rden = shreg_now_loading;
-
-
-
-
- always @(posedge clk) begin
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
- FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero;
- ab_addr_ext <= bram_addr_ext_zero;
- end
-
- FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
- FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next;
- ab_addr_ext <= ab_addr_ext_next;
- end
- endcase
- //
- case (fsm_state)
-
- FSM_STATE_MULT_Q_N_RELOAD: begin
- if (qn_addr_ext == {1'b0, bram_addr_last}) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
- if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
- s_addr <= s_addr_next;
- sn_addr <= sn_addr_next;
- end
-
- if (qn_addr_ext == bram_addr_ext_last) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
-
- end
-
- FSM_STATE_MULT_Q_N_FINAL,
- FSM_STATE_SAVE_START,
- FSM_STATE_SAVE_WRITE: begin
- s_addr <= !s_addr_done ? s_addr_next : s_addr;
- sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
+ /* + * Loader Data Input + */ + integer j; + + // shift logic + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_SHIFT: begin + + // update the rightmost part of loader buffer + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}}; + + // shift the loader buffer to the left + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + end
-
- endcase
-
- //
- case (fsm_next_state)
- FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next;
- endcase
- //
- case (fsm_next_state)
- FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr;
- endcase
+ //
+ endcase - //
- end
-
- always @(posedge clk) begin
- //
- if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
- ab_wren <= shreg_done_latency_dly;
- ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- ab_wren <= 1'b0;
- ab_data_in <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
- q_wren <= shreg_done_latency_dly;
- q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- q_wren <= 1'b0;
- q_data_in <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
- qn_wren <= shreg_done_latency_dly;
- qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- qn_wren <= 1'b0;
- qn_data_in <= 32'hXXXXXXXX;
- end
- //
- case (fsm_state)
- FSM_STATE_SAVE_START: r_wren <= 1'b1;
- FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done;
- default: r_wren <= 1'b0;
- endcase
- //
- end
-
-
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD:
- //
- syst_cnt_load <= syst_cnt_zero;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH:
+
+ /*
+ * Load Write Enable Logic
+ */ + always @(posedge clk) + // + case (fsm_next_state) + + FSM_STATE_LOAD_WRITE:
//
- syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_wren[j] <= 1'b1; + + default: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_wren[j] <= 1'b0; + endcase
-
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: begin
-
- if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero;
- else if (shreg_now_unloading)
- syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
-
- end
- endcase
-
- /*
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: begin
-
- //if (shreg_now_unloading)
- //for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //pe_c_out_mem[j][syst_cnt_unload] <= pe_c_out[j];
-
- if (shreg_now_unloading) begin
-
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- pe_t_mem[j-1][syst_cnt_unload] <= pe_p[j];
-
- if (syst_cnt_unload > syst_cnt_zero)
- pe_t_mem[SYSTOLIC_ARRAY_LENGTH-1][syst_cnt_unload-1'b1] <= pe_p[0];
- else
- pe_t_mem[SYSTOLIC_ARRAY_LENGTH-1][syst_cnt_last] <= 32'd0;
-
- end
- end
- endcase
- */
-
- //
- // T and C_IN can be moved to a separate code block
- //
- always @(posedge clk) begin
- //
- if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //
- if (shreg_now_loading) begin
- pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
- pe_b[j] <= loader_dout[j];
- //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
- //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
- end else begin
- pe_a[j] <= 32'hXXXXXXXX;
- pe_b[j] <= 32'hXXXXXXXX;
- //pe_t[j] <= 32'hXXXXXXXX;
- //pe_c_in[j] <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //
- if (shreg_now_loading) begin
- pe_a[j] <= ab_data_out;
- pe_b[j] <= loader_dout[j];
- //pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
- //pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
- end else begin
- pe_a[j] <= 32'hXXXXXXXX;
- pe_b[j] <= 32'hXXXXXXXX;
- //pe_t[j] <= 32'hXXXXXXXX;
- //pe_c_in[j] <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //
- if (shreg_now_loading) begin
- pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
- pe_b[j] <= loader_dout[j];
- //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
- //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
- end else begin
- pe_a[j] <= 32'hXXXXXXXX;
- pe_b[j] <= 32'hXXXXXXXX;
- //pe_t[j] <= 32'hXXXXXXXX;
- //pe_c_in[j] <= 32'hXXXXXXXX;
- end
- //
-
- //
- end
-
-
- //
- // Adder
- //
/*
- * This adder is used to calculate S = AB + QN.
- *
+ * Loader Address Update Logic
*/
- reg add1_ce; // clock enable
- reg [31: 0] add1_s; // sum output
- wire add1_c_in; // carry input
- wire [31: 0] add1_a; // A-input
- reg [31: 0] add1_b; // B-input
- reg add1_c_in_mask; // flag to not carry anything into the very first word
- reg add1_c_out; // carry output
-
- /* add masking into carry feedback chain */
- assign add1_c_in = add1_c_out & ~add1_c_in_mask;
-
- /* mask carry for the very first word of N */
- //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+ + always @(posedge clk) + // + case (fsm_state) + + FSM_STATE_LOAD_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr[j] <= load_syst_cnt_zero; + + FSM_STATE_LOAD_WRITE: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr[j] <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt; + + endcase - always @(posedge clk)
- //
- if (add1_ce)
- //
- {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
-
- assign add1_a = qn_data_in;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
- else
- add1_b <= 32'hXXXXXXXX;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
- else
- add1_c_in_mask <= 1'b0;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- add1_ce <= shreg_done_latency_dly;
- else
- add1_ce <= 1'b0;
-
- assign s_data_in = add1_s;
- assign sn_data_in = sub1_d;
-
- always @(posedge clk) begin
- //
- s_wren <= add1_ce;
- sn_wren <= sub1_ce;
- end
-
-
-
- //
- // Subtractor
- //
/*
- * This subtractor is used to calculate SN = S - N.
- *
+ * Memory Address Control Logic
*/
- reg sub1_ce; // clock enable
- reg [31: 0] sub1_d; // difference output
- wire sub1_b_in; // borrow input
- wire [31: 0] sub1_a; // A-input
- reg [31: 0] sub1_b; // B-input
- reg sub1_b_in_mask; // flag to not borrow anything from the very first word
- reg sub1_b_out; // borrow output
-
- /* add masking into borrow feedback chain */
- assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-
- always @(posedge clk)
- //
- if (sub1_ce)
- //
- {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
-
- assign sub1_a = add1_s;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
- else
- sub1_b <= 32'hXXXXXXXX;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
- else
- sub1_b_in_mask <= 1'b0;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
- else
- sub1_ce <= 1'b0;
-
-
- assign s_data_in = add1_s;
-
- always @(posedge clk)
- //
- s_wren <= add1_ce;
-
-
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
- flag_select_s <= sub1_b_out & ~add1_c_out;
-
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_SAVE_START,
- FSM_STATE_SAVE_WRITE:
- r_data_in <= flag_select_s ? s_data_out : sn_data_out;
- endcase
-
-
-
- //
- // FSM Process
- //
- always @(posedge clk or negedge rst_n)
- //
- if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
- else fsm_state <= fsm_next_state;
-
-
- //
- // FSM Transition Logic
- //
- always @* begin
- //
- fsm_next_state = FSM_STATE_STOP;
- //
- case (fsm_state)
-
- FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
- else fsm_next_state = FSM_STATE_IDLE;
- //
- FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
- //
- FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START;
- //
- FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START;
- //
- FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
- else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
- //
- FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
- else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START;
- //
- FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
- else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
- //
- FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
- FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
- else fsm_next_state = FSM_STATE_SAVE_WRITE;
- FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
- //
- FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
-
+ always @(posedge clk) begin + // + case (fsm_next_state) + FSM_STATE_LOAD_START: b_addr <= bram_addr_zero; + FSM_STATE_LOAD_SHIFT: b_addr <= b_addr_next; endcase
//
end
-
-endmodule
-
-//======================================================================
-// End of file
-//======================================================================
+ + + + /* + * FSM Process + */ + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; + else fsm_state <= fsm_next_state; + + + /* + * FSM Transition Logic + */ + always @* begin + // + fsm_next_state = FSM_STATE_STOP; + // + case (fsm_state) + // + FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_START; + else fsm_next_state = FSM_STATE_IDLE; + // + FSM_STATE_LOAD_START: fsm_next_state = FSM_STATE_LOAD_SHIFT; + FSM_STATE_LOAD_SHIFT: if (load_mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_WRITE; + else fsm_next_state = FSM_STATE_LOAD_SHIFT; + FSM_STATE_LOAD_WRITE: if (load_syst_cnt_done) fsm_next_state = FSM_STATE_LOAD_FINAL; + else fsm_next_state = FSM_STATE_LOAD_SHIFT; + FSM_STATE_LOAD_FINAL: fsm_next_state = FSM_STATE_STOP; + //
+ //FSM_STATE_MULT_START: + //FSM_STATE_MULT_CRUNCH: + //FSM_STATE_MULT_FINAL: + // + FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; + // + endcase + // + end + + +endmodule + +//====================================================================== +// End of file +//====================================================================== diff --git a/src/rtl/modexpa7_systolic_multiplier_old.v b/src/rtl/modexpa7_systolic_multiplier_old.v new file mode 100644 index 0000000..8b00370 --- /dev/null +++ b/src/rtl/modexpa7_systolic_multiplier_old.v @@ -0,0 +1,1260 @@ +//====================================================================== +// +// modexpa7_systolic_multiplier.v +// ----------------------------------------------------------------------------- +// Systolic Montgomery multiplier. +// +// Authors: Pavel Shatov +// +// Copyright (c) 2017, NORDUnet A/S All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// - Neither the name of the NORDUnet nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +//====================================================================== + +module modexpa7_systolic_multiplier # + ( + // + // This sets the address widths of memory buffers. Internal data + // width is 32 bits, so for e.g. 2048-bit operands buffers must store + // 2048 / 32 = 64 words, and these need 6-bit address bus, because + // 2 ** 6 = 64. + // + parameter OPERAND_ADDR_WIDTH = 4, + + // + // Explain. + // + parameter SYSTOLIC_ARRAY_POWER = 1 + ) + ( + input clk, + input rst_n, + + input ena, + output rdy, + + output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, + + input [ 32-1:0] a_bram_out, + input [ 32-1:0] b_bram_out, + input [ 32-1:0] n_bram_out, + input [ 32-1:0] n_coeff_bram_out, + + output [ 32-1:0] r_bram_in, + output r_bram_wr, + + input [OPERAND_ADDR_WIDTH-1:0] ab_num_words + ); + + + // + // Include Settings + // + `include "pe/modexpa7_primitive_switch.v" + `include "modexpa7_settings.v" + + + // + // FSM Declaration + // + localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; + + localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11; + localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12; + localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13; + localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14; + + localparam [ 7: 0] 
FSM_STATE_LOAD_N_COEFF_START = 8'h21; + localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22; + localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23; + localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24; + + localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31; + localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32; + localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33; + localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34; + + localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41; + localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42; + localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43; + localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44; + + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54; + + localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_ADD_S = 8'h63; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_SUB_SN = 8'h64; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h65; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h66; + + localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71; + localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72; + localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73; + + localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; + + // + // FSM State / Next State + // + reg [ 7: 0] fsm_state = FSM_STATE_IDLE; + reg [ 7: 0] fsm_next_state; + + + // + // Enable Delay and Trigger + // + reg ena_dly = 1'b0; + + /* delay enable by one clock cycle */ + always @(posedge clk) ena_dly <= ena; + + /* trigger new operation when enable goes high */ + wire ena_trig = ena && !ena_dly; + + + // + // Ready Flag Logic + // + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; + + always @(posedge clk or negedge rst_n) + + /* reset flag */ + if (rst_n == 1'b0) rdy_reg <= 1'b1; + else begin + + /* 
clear flag when operation is started */ + if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; + + /* set flag after operation is finished */ + if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; + + end + + + // + // Parameters Latch + // + reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch; + + /* save number of words in a and b when new operation starts */ + always @(posedge clk) + // + if (fsm_next_state == FSM_STATE_LOAD_B_START) + ab_num_words_latch <= ab_num_words; + + + // + // Systolic Cycle Counters + // + + /* handy values */ + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; + + /* counters */ + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init; + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; + + /* handy increment values */ + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; + + /* handy stop flags */ + wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0; + wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; + wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 
1'b1 : 1'b0; + + /* delayed load counter */ + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly; + always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load; + + + // + // Multiplier Iteration Counter + // + + /* handy values */ + wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}}; + wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}}; + + /* counter */ + reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt; + + /* handy increment value and stop flag */ + wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1; + wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0; + + + // + // Initialization Counter Control Logic + // + always @(posedge clk) begin + // + case (fsm_state) + FSM_STATE_LOAD_B_START, + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero; + + FSM_STATE_LOAD_B_SHIFT, + FSM_STATE_LOAD_N_COEFF_SHIFT, + FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next; + endcase + // + case (fsm_state) + FSM_STATE_LOAD_B_START, + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero; + + FSM_STATE_LOAD_B_WRITE, + FSM_STATE_LOAD_N_COEFF_WRITE, + FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init; + endcase + // + end + + + // + // Operand Loader + // + + /* + * Explain how parallelized loader works here... 
+ * + */ + + /* loader banks */ + localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0; + localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1; + localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2; + + /* loader input */ + reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1]; + reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1]; + + /* loader output */ + wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1]; + + /* generate parallelized loader */ + + // + // Loader currently stores B, N_COEFF and N, it can be coded another way + // to initially store B, then AB, then Q. Some memory can be saved thay way. + // Maybe later... + // + + genvar i; + generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1) + // + begin : gen_bram_1rw_readfirst_loader + // + bram_1rw_readfirst # + ( + .MEM_WIDTH (32), + .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2) + ) + bram_loader + ( + .clk (clk), + .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}), + .a_wr (loader_wren[i]), + .a_in (loader_din[i]), + .a_out (loader_dout[i]) + ); + // + end + // + endgenerate + + + // + // Block Memory Addresses + // + + /* + * Explain why there are two memory sizes. 
+ * + */ + + /* the very first addresses */ + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}}; + wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}}; + + /* the very last addresses */ + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch}; + wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1}; + + /* address registers */ + reg [OPERAND_ADDR_WIDTH-1:0] a_addr; + reg [OPERAND_ADDR_WIDTH-1:0] b_addr; + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr; + reg [OPERAND_ADDR_WIDTH-1:0] n_addr; + reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext; + reg [OPERAND_ADDR_WIDTH-1:0] q_addr; + reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext; + reg [OPERAND_ADDR_WIDTH-1:0] s_addr; + reg [OPERAND_ADDR_WIDTH-1:0] sn_addr; + reg [OPERAND_ADDR_WIDTH-1:0] r_addr; + + /* handy increment values */ + wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1; + + /* handy stop flags */ + wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0; + wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 
1'b1 : 1'b0; + wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0; + + /* delayed B address */ + reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly; + always @(posedge clk) b_addr_dly <= b_addr; + + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly; + always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr; + + reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly; + always @(posedge clk) n_addr_dly <= n_addr; + + /* map registers to top-level ports */ + assign a_bram_addr = a_addr; + assign b_bram_addr = b_addr; + assign n_coeff_bram_addr = n_coeff_addr; + assign n_bram_addr = n_addr; + assign r_bram_addr = r_addr; + + + // + // Flag + // + reg flag_select_s; + + + // + // Memory Address Control Logic + // + always @(posedge clk) begin + // + case (fsm_next_state) + FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero; + FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero; + FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero; + + FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next; + FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next; + FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next; + endcase + // + case (fsm_state) + FSM_STATE_MULT_Q_N_RELOAD: + if (qn_addr_ext == {1'b0, bram_addr_last}) + n_addr <= bram_addr_zero; + else if (qn_addr_ext > {1'b0, bram_addr_last}) + n_addr <= n_addr_next; + + endcase + // + case (fsm_state) + FSM_STATE_SAVE_START: r_addr <= bram_addr_zero; + FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next; + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero; + FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? 
a_addr_next : a_addr; + endcase + // + end + + + // + // Internal Memories + // + + /* memory inputs */ + reg [31: 0] ab_data_in; + reg [31: 0] q_data_in; + reg [31: 0] qn_data_in; + wire [31: 0] s_data_in; + wire [31: 0] sn_data_in; + reg [31: 0] r_data_in; + + /* memory outputs */ + wire [31: 0] ab_data_out; + wire [31: 0] q_data_out; + wire [31: 0] qn_data_out; + wire [31: 0] s_data_out; + wire [31: 0] sn_data_out; + + /* write enables */ + reg ab_wren; + reg q_wren; + reg qn_wren; + reg s_wren; + reg sn_wren; + reg r_wren; + + /* map */ + assign r_bram_in = r_data_in; + assign r_bram_wr = r_wren; + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out)); + + + // + // Wide Operand Loader + // + integer j; + + /* shift logic */ + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_B_SHIFT: begin + + /* update the rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? 
b_bram_out : {32{1'b0}}; + + /* shift the loader buffer to the left */ + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + + end + // + FSM_STATE_LOAD_N_COEFF_SHIFT: begin + + /* update the rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}}; + + /* shift the loader buffer to the left */ + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + + end + // + FSM_STATE_LOAD_N_SHIFT: begin + + /* update the rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}}; + + /* shift the loader buffer to the left */ + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + + end + // + endcase + + + /* write enable logic */ + always @(posedge clk) + // + case (fsm_next_state) + + FSM_STATE_LOAD_B_WRITE, + FSM_STATE_LOAD_N_COEFF_WRITE, + FSM_STATE_LOAD_N_WRITE: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_wren[j] <= 1'b1; + + default: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_wren[j] <= 1'b0; + + endcase + + /* loader address update logic */ + always @(posedge clk) begin + // + case (fsm_state) + + FSM_STATE_LOAD_B_START, + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_LOAD_N_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= syst_cnt_zero; + + FSM_STATE_LOAD_B_WRITE, + FSM_STATE_LOAD_N_COEFF_WRITE, + FSM_STATE_LOAD_N_WRITE: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= !syst_cnt_init_done ? 
syst_cnt_init_next : syst_cnt_init; + + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= syst_cnt_zero; + + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init; + endcase + // + case (fsm_next_state) + + FSM_STATE_LOAD_B_START, + FSM_STATE_MULT_A_B_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_msb[j] <= LOADER_ADDR_MSB_B; + + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_MULT_AB_N_COEFF_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF; + + FSM_STATE_LOAD_N_START, + FSM_STATE_MULT_Q_N_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_msb[j] <= LOADER_ADDR_MSB_N; + + endcase + // + end + + + // + // Systolic Array of Processing Elements + // + reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1]; + wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1]; + wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1]; + wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1]; + wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [31: 0] pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1]; + + + // + // These can be turned into a FIFO (maybe later?)... 
+ // + //reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1]; + //reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1]; + + reg fifo_c_rst; + reg fifo_t_rst; + + wire fifo_c_wren; + wire fifo_c_rden; + + wire fifo_t_wren; + wire fifo_t_rden; + + wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din; + wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout; + + wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din; + wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout; + + /**/ + modexpa7_simple_fifo # + ( + .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH), + .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH) + ) + fifo_c + ( + .clk (clk), + .rst (fifo_c_rst), + .wr_en (fifo_c_wren), + .d_in (fifo_c_din), + .rd_en (fifo_c_rden), + .d_out (fifo_c_dout) + ); + + modexpa7_simple_fifo # + ( + .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH), + .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH) + ) + fifo_t + ( + .clk (clk), + .rst (fifo_t_rst), + .wr_en (fifo_t_wren), + .d_in (fifo_t_din), + .rd_en (fifo_t_rden), + .d_out (fifo_t_dout) + ); + + generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1) + begin : modexpa7_systolic_pe_multiplier + modexpa7_systolic_pe systolic_pe_inst + ( + .clk (clk), + .a (pe_a[i]), + .b (pe_b[i]), + .t (pe_t[i]), + .c_in (pe_c_in[i]), + .p (pe_p[i]), + .c_out (pe_c_out[i]) + ); + assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32]; + assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32]; + assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i]; + always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i]; + end + endgenerate + + + + + + // + // Shift Registers + // + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; + reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; + + wire shreg_done_load = shreg_load[syst_cnt_last]; + wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; + wire shreg_done_unload = shreg_unload[syst_cnt_last]; + + reg shreg_now_loading; + reg shreg_now_latency; + reg shreg_now_unloading; + 
+ reg shreg_done_latency_dly; + + always @(posedge clk) + shreg_done_latency_dly <= shreg_done_latency; + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_LOAD_N_FINAL: begin + shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; + shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0}; + shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; + end + // + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: begin + shreg_now_loading <= 1'b1; + shreg_now_latency <= 1'b1; + shreg_now_unloading <= 1'b0; + shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; + shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1}; + shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; + end + // + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: begin + shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0}; + shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0}; + shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; + + if (shreg_done_load) shreg_now_loading <= 1'b0; + if (shreg_done_latency) shreg_now_latency <= 1'b0; + if (shreg_done_latency) shreg_now_unloading <= 1'b1; + else if (shreg_done_unload) shreg_now_unloading <= 1'b0; + + end + // + default: begin + shreg_now_loading <= 1'b0; + shreg_now_latency <= 1'b0; + shreg_now_unloading <= 1'b0; + end + // + endcase + + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START: fifo_c_rst <= 1'b1; + + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0; + endcase + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START: fifo_t_rst <= 1'b1; + + 
FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0; + endcase + + + reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] pe_p_msb_dly; + + always @(posedge clk) + // + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j]; + + wire [31: 0] pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0; + assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly}; + + + + reg shreg_now_unloading_dly; + always @(posedge clk) + shreg_now_unloading_dly <= shreg_now_unloading; + + assign fifo_c_wren = shreg_now_unloading_dly; + assign fifo_c_rden = shreg_now_loading; + + assign fifo_t_wren = shreg_now_unloading_dly; + assign fifo_t_rden = shreg_now_loading; + + + + + always @(posedge clk) begin + // + case (fsm_state) + FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero; + FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero; + FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero; + ab_addr_ext <= bram_addr_ext_zero; + end + + FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next; + FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next; + ab_addr_ext <= ab_addr_ext_next; + end + endcase + // + case (fsm_state) + + FSM_STATE_MULT_Q_N_RELOAD: begin + // + if (qn_addr_ext == {1'b0, bram_addr_last}) begin + s_addr <= bram_addr_zero; + sn_addr <= bram_addr_zero; + end + // + if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin + s_addr <= s_addr_next; + sn_addr <= sn_addr_next; + end + // + if (qn_addr_ext == bram_addr_ext_last) begin + s_addr <= bram_addr_zero; + sn_addr <= bram_addr_zero; + end + // + end + // + /* + case (fsm_state) + + FSM_STATE_MULT_Q_N_RELOAD: begin + if (qn_addr_ext == {1'b0, bram_addr_last}) begin + s_addr <= bram_addr_zero; + sn_addr <= bram_addr_zero; + end + + if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < 
bram_addr_ext_last)) begin + s_addr <= s_addr_next; + sn_addr <= sn_addr_next; + end + + if (qn_addr_ext == bram_addr_ext_last) begin + s_addr <= bram_addr_zero; + sn_addr <= bram_addr_zero; + end + + end + + FSM_STATE_MULT_Q_N_FINAL, + FSM_STATE_SAVE_START, + FSM_STATE_SAVE_WRITE: begin + s_addr <= !s_addr_done ? s_addr_next : s_addr; + sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr; + end + */ + endcase + + // + case (fsm_next_state) + FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next; + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero; + FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr; + endcase + + // + end + + always @(posedge clk) begin + // + if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin + ab_wren <= shreg_done_latency_dly; + ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + ab_wren <= 1'b0; + ab_data_in <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin + q_wren <= shreg_done_latency_dly; + q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + q_wren <= 1'b0; + q_data_in <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin + qn_wren <= shreg_done_latency_dly; + qn_data_in <= shreg_done_latency_dly ? 
pe_p[0] : 32'hXXXXXXXX; + end else begin + qn_wren <= 1'b0; + qn_data_in <= 32'hXXXXXXXX; + end + // + case (fsm_state) + FSM_STATE_SAVE_START: r_wren <= 1'b1; + FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done; + default: r_wren <= 1'b0; + endcase + // + end + + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: + // + syst_cnt_load <= syst_cnt_zero; + + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: + // + syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; + + endcase + + + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: begin + + if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; + else if (shreg_now_unloading) + syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; + + end + endcase + + + // + // T and C_IN can be moved to a separate code block + // + always @(posedge clk) begin + // + if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + // + if (shreg_now_loading) begin + pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out; + pe_b[j] <= loader_dout[j]; + //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; + //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + //pe_t[j] <= 32'hXXXXXXXX; + //pe_c_in[j] <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + // + if (shreg_now_loading) begin + pe_a[j] <= ab_data_out; + pe_b[j] <= loader_dout[j]; + //pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 
32'd0 : pe_t_mem[j][syst_cnt_load_dly]; + //pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + //pe_t[j] <= 32'hXXXXXXXX; + //pe_c_in[j] <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + // + if (shreg_now_loading) begin + pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out; + pe_b[j] <= loader_dout[j]; + //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; + //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + //pe_t[j] <= 32'hXXXXXXXX; + //pe_c_in[j] <= 32'hXXXXXXXX; + end + // + + // + end + + + // + // Adder + // + + reg add1_ce; // clock enable + wire [31: 0] add1_s; // sum output + wire add1_c_in; // carry input + reg [31: 0] add1_a; // A-input + reg [31: 0] add1_b; // B-input + reg add1_c_in_mask; // flag to not carry anything into the very first word + wire add1_c_out; // carry output + + // add masking into carry feedback chain + assign add1_c_in = add1_c_out & ~add1_c_in_mask; + + // mask carry for the very first word of N + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) + add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0; + + modexpa7_adder32 add1_inst + ( + .clk (clk), + .ce (add1_ce), + .a (add1_a), + .b (add1_b), + .c_in (add1_c_in), + .s (add1_s), + .c_out (add1_c_out) + ); + + always @(posedge clk) + // + add1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_ADD_S) ? 1'b1 : 1'b0; + + always @(posedge clk) + // + if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin + add1_a <= pe_p[0]; + add1_b <= ab_data_out; + end + + + // + // Subtractor + // + /* + * This subtractor is used to calculate SN = S - N. 
+ * + */ + + reg sub1_ce; // clock enable + wire [31: 0] sub1_d; // difference output + wire sub1_b_in; // borrow input + reg [31: 0] sub1_a; // A-input + reg [31: 0] sub1_b; // B-input + reg sub1_b_in_mask; // flag to not borrow anything from the very first word + wire sub1_b_out; // borrow output + + // add masking into borrow feedback chain + assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; + + // mask carry for the very first word of N TODO! + //always @(posedge clk) + // + //if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) + //add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0; + + modexpa7_subtractor32 sub1_inst + ( + .clk (clk), + .ce (sub1_ce), + .a (sub1_a), + .b (sub1_b), + .b_in (sub1_b_in), + .d (sub1_d), + .b_out (sub1_b_out) + ); + + always @(posedge clk) + // + sub1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr}) ? 1'b1 : 1'b0; + + always @* + sub1_a = add1_s; + + always @(posedge clk) + // + //if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin + //add1_a <= pe_p[0]; + //add1_b <= ab_data_out; + //end + + + /* + reg sub1_ce; // clock enable + reg [31: 0] sub1_d; // difference output + wire sub1_b_in; // borrow input + wire [31: 0] sub1_a; // A-input + reg [31: 0] sub1_b; // B-input + reg sub1_b_in_mask; // flag to not borrow anything from the very first word*/ +// wire sub1_b_out; // borrow output + /* + + // add masking into borrow feedback chain + assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; + + always @(posedge clk) + // + if (sub1_ce) + // + {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in}; + + assign sub1_a = add1_s; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + sub1_b <= add1_ce ? 
n_bram_out : 32'hXXXXXXXX;
+        else
+            sub1_b <= 32'hXXXXXXXX;
+
+    always @(posedge clk)
+        //
+        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+            sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
+        else
+            sub1_b_in_mask <= 1'b0;
+
+    always @(posedge clk)
+        //
+        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+            sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
+        else
+            sub1_ce <= 1'b0;
+    */
+
+
+    assign s_data_in = add1_s;
+    assign sn_data_in = sub1_d;
+
+    always @(posedge clk) begin
+        // registered write strobes: s_wren follows the MULT_Q_N_ADD_S state and sn_wren the MULT_Q_N_SUB_SN state, each only while qn_addr_ext > {1'b0, q_addr}
+        s_wren  <= ((fsm_state == FSM_STATE_MULT_Q_N_ADD_S) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0;
+        sn_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0;
+        //
+    end
+
+
+    always @(posedge clk)
+        // latched in MULT_Q_N_FINAL: 1 when sub1_b_out is set and add1_c_out is clear (presumably "S < N and no overflow", so S is already reduced -- confirm)
+        if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
+            flag_select_s <= sub1_b_out & ~add1_c_out;
+
+
+    always @(posedge clk)
+        // in SAVE_START and SAVE_WRITE r_data_in takes s_data_out when flag_select_s is set, sn_data_out otherwise; in every other state it keeps its value
+        case (fsm_state)
+            FSM_STATE_SAVE_START,
+            FSM_STATE_SAVE_WRITE:
+                r_data_in <= flag_select_s ?
s_data_out : sn_data_out;
+        endcase
+
+
+
+    //
+    // FSM Process: state register
+    //
+    always @(posedge clk or negedge rst_n)
+        // asynchronous, active-low reset forces IDLE; otherwise the state follows fsm_next_state on every rising clock edge
+        if (rst_n == 1'b0)  fsm_state <= FSM_STATE_IDLE;
+        else                fsm_state <= fsm_next_state;
+
+
+    //
+    // FSM Transition Logic: combinational next-state function of fsm_state and the *_done / trigger flags
+    //
+    always @* begin
+        // default, overridden by the case below; it also applies to any fsm_state value that has no case item
+        fsm_next_state = FSM_STATE_STOP;
+        //
+        case (fsm_state)
+
+            FSM_STATE_IDLE:                 if (ena_trig)           fsm_next_state = FSM_STATE_LOAD_B_START;
+                                            else                    fsm_next_state = FSM_STATE_IDLE;
+            // LOAD_B: SHIFT repeats until mult_cnt_done, then WRITE; WRITE returns to SHIFT until syst_cnt_init_done
+            FSM_STATE_LOAD_B_START:                                 fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+            FSM_STATE_LOAD_B_SHIFT:         if (mult_cnt_done)      fsm_next_state = FSM_STATE_LOAD_B_WRITE;
+                                            else                    fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+            FSM_STATE_LOAD_B_WRITE:         if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
+                                            else                    fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+            FSM_STATE_LOAD_B_FINAL:                                 fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
+            // LOAD_N_COEFF: same START / SHIFT / WRITE / FINAL pattern as LOAD_B
+            FSM_STATE_LOAD_N_COEFF_START:                           fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+            FSM_STATE_LOAD_N_COEFF_SHIFT:   if (mult_cnt_done)      fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
+                                            else                    fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+            FSM_STATE_LOAD_N_COEFF_WRITE:   if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
+                                            else                    fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+            FSM_STATE_LOAD_N_COEFF_FINAL:                           fsm_next_state = FSM_STATE_LOAD_N_START;
+            // LOAD_N: same pattern once more
+            FSM_STATE_LOAD_N_START:                                 fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+            FSM_STATE_LOAD_N_SHIFT:         if (mult_cnt_done)      fsm_next_state = FSM_STATE_LOAD_N_WRITE;
+                                            else                    fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+            FSM_STATE_LOAD_N_WRITE:         if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
+                                            else                    fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+            FSM_STATE_LOAD_N_FINAL:                                 fsm_next_state = FSM_STATE_MULT_A_B_START;
+            // MULT_A_B: CRUNCH waits for shreg_done_unload; RELOAD loops back to CRUNCH until ab_addr_ext_done
+            FSM_STATE_MULT_A_B_START:                               fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+            FSM_STATE_MULT_A_B_CRUNCH:      if (shreg_done_unload)  fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
+                                            else                    fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+            FSM_STATE_MULT_A_B_RELOAD:      if
(ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
+                                                else                    fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+            FSM_STATE_MULT_A_B_FINAL:                                   fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
+            // MULT_AB_N_COEFF: same CRUNCH / RELOAD loop, ended by q_addr_done
+            FSM_STATE_MULT_AB_N_COEFF_START:                            fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+            FSM_STATE_MULT_AB_N_COEFF_CRUNCH:   if (shreg_done_unload)  fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
+                                                else                    fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+            FSM_STATE_MULT_AB_N_COEFF_RELOAD:   if (q_addr_done)        fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
+                                                else                    fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+            FSM_STATE_MULT_AB_N_COEFF_FINAL:                            fsm_next_state = FSM_STATE_MULT_Q_N_START;
+            // MULT_Q_N: each CRUNCH pass is followed by the single-cycle ADD_S and SUB_SN states before RELOAD; the loop ends on qn_addr_ext_done
+            FSM_STATE_MULT_Q_N_START:                                   fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+            FSM_STATE_MULT_Q_N_CRUNCH:          if (shreg_done_unload)  fsm_next_state = FSM_STATE_MULT_Q_N_ADD_S;
+                                                else                    fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+            FSM_STATE_MULT_Q_N_ADD_S:                                   fsm_next_state = FSM_STATE_MULT_Q_N_SUB_SN;
+            FSM_STATE_MULT_Q_N_SUB_SN:                                  fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
+            FSM_STATE_MULT_Q_N_RELOAD:          if (qn_addr_ext_done)   fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
+                                                else                    fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+            FSM_STATE_MULT_Q_N_FINAL:                                   fsm_next_state = FSM_STATE_SAVE_START;
+            // SAVE: remain in SAVE_WRITE until r_addr_done
+            FSM_STATE_SAVE_START:                                       fsm_next_state = FSM_STATE_SAVE_WRITE;
+            FSM_STATE_SAVE_WRITE:               if (r_addr_done)        fsm_next_state = FSM_STATE_SAVE_FINAL;
+                                                else                    fsm_next_state = FSM_STATE_SAVE_WRITE;
+            FSM_STATE_SAVE_FINAL:                                       fsm_next_state = FSM_STATE_STOP;
+            // STOP lasts one cycle, then the FSM returns to IDLE
+            FSM_STATE_STOP:                                             fsm_next_state = FSM_STATE_IDLE;
+
+        endcase
+        //
+    end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//====================================================================== |