aboutsummaryrefslogtreecommitdiff
path: root/src/rtl/modexpa7_systolic_multiplier.v
diff options
context:
space:
mode:
Diffstat (limited to 'src/rtl/modexpa7_systolic_multiplier.v')
-rw-r--r--src/rtl/modexpa7_systolic_multiplier.v876
1 files changed, 876 insertions, 0 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
new file mode 100644
index 0000000..0849b61
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -0,0 +1,876 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier #
+ (
+ //
+ // This sets the address widths of memory buffers. Internal data
+ // width is 32 bits, so for e.g. 1024-bit operands buffers must store
+ // 1024 / 32 = 32 words, and these need 5-bit address bus, because
+ // 2 ** 5 = 32.
+ //
+ parameter OPERAND_ADDR_WIDTH = 5,
+
+ //
+ // This sets the size of the systolic array: each of the three
+ // multiplier chains is generated with 2 ** SYSTOLIC_ARRAY_POWER
+ // processing elements. The remaining
+ // OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER address bits form the
+ // systolic cycle counter, which selects the group of operand words
+ // currently fed through the array.
+ //
+ parameter SYSTOLIC_ARRAY_POWER = 3
+ )
+ (
+ input clk, // all registers update on the rising edge
+ input rst_n, // active-low asynchronous reset (FSM state and ready flag only)
+
+ input ena, // a rising edge on this input starts an operation while the FSM is idle
+ output rdy, // cleared when an operation starts, set again in the STOP state
+
+ // word addresses driven towards the external operand/result memories
+ output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,
+
+ // 32-bit read data returned by the operand memories
+ input [ 32-1:0] a_bram_out,
+ input [ 32-1:0] b_bram_out,
+ input [ 32-1:0] n_bram_out,
+ input [ 32-1:0] n_coeff_bram_out,
+
+ // result memory write port (data and write strobe)
+ output [ 32-1:0] r_bram_in,
+ output r_bram_wr,
+
+ // Latched when an operation starts and then used as the address of the
+ // last operand word, so it presumably carries "word count minus one"
+ // (TODO: confirm against the caller).
+ input [OPERAND_ADDR_WIDTH-1:0] n_num_words
+ );
+
+
+ //
+ // Constants
+ //
+ // SYSTOLIC_CNTR_WIDTH - width of the systolic cycle counter (upper operand address bits)
+ // SYSTOLIC_ARRAY_LENGTH - number of processing elements per multiplier chain
+ // SYSTOLIC_NUM_CYCLES - number of word groups the local buffers can hold
+ //
+ localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
+ localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER;
+ localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH;
+
+ // Depth of the syst_cnt delay line and length of the latency shift
+ // registers; presumably equal to the pipeline depth of pe_mul (TODO: confirm).
+ localparam SYSTOLIC_PE_LATENCY = 4;
+
+
+ //
+ // FSM Declaration
+ //
+ // INIT_* states sweep the operand addresses to fill the local buffers,
+ // PIPE_CRUNCH / PIPE_RELOAD alternate while the array is running, and
+ // SAVE_* states sweep the addresses again to write the result out.
+ //
+ localparam [ 3: 0] FSM_STATE_IDLE = 4'd0;
+ localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1;
+ localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2;
+ localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3;
+ localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4;
+ localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5;
+ localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6;
+ localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7;
+ localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8;
+ localparam [ 3: 0] FSM_STATE_STOP = 4'd9;
+
+ // Current state (registered) and combinationally computed next state.
+ reg [ 3: 0] fsm_state = FSM_STATE_IDLE;
+ reg [ 3: 0] fsm_next_state;
+
+
+   //
+   // Enable Delay (Trigger)
+   //
+   // ena_trig is high for exactly one clock after ena rises: it compares
+   // the current value of ena with the value sampled one clock earlier.
+   //
+   reg  ena_dly = 1'b0;
+   wire ena_trig;
+
+   assign ena_trig = ena & ~ena_dly;
+
+   always @(posedge clk) begin
+      ena_dly <= ena;
+   end
+
+
+   //
+   // Parameters Latch
+   //
+   // The operand length is sampled on the clock edge that takes the FSM
+   // into INIT_ZERO_ADDR, i.e. once at the start of every operation.
+   //
+   reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
+
+   always @(posedge clk) begin
+      if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) begin
+         n_num_words_latch <= n_num_words;
+      end
+   end
+
+
+   //
+   // Addresses
+   //
+   // First and last word address of an operand; the last address comes
+   // from the latched length parameter.
+   //
+   localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
+   wire       [OPERAND_ADDR_WIDTH-1:0] bram_addr_last;
+
+   assign bram_addr_last = n_num_words_latch;
+
+
+   //
+   // BRAM Addresses
+   //
+   // Address registers (one per memory, plus the internal "s" address).
+   //
+   reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg;
+   reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg;
+   reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg;
+   reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg;
+   reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg;
+   reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg;
+
+   // Copies of the addresses delayed by one clock.
+   reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly;
+   reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly;
+   reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly;
+   reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly;
+
+   // Internal address without an output port.
+   wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr;
+
+   // Incremented addresses (wrap around at the register width).
+   wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next;
+   wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next;
+   wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next;
+   wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next;
+   wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next;
+
+   // End-of-sweep flags: the address has reached the last operand word.
+   wire b_bram_addr_done;
+   wire s_bram_addr_done;
+
+   assign s_bram_addr       = s_bram_addr_reg;
+
+   assign b_bram_addr       = b_bram_addr_reg;
+   assign a_bram_addr       = a_bram_addr_reg;
+   assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
+   assign n_bram_addr       = n_bram_addr_reg;
+   assign r_bram_addr       = r_bram_addr_reg;
+
+   assign b_bram_addr_next       = b_bram_addr       + 1'b1;
+   assign a_bram_addr_next       = a_bram_addr       + 1'b1;
+   assign n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
+   assign n_bram_addr_next       = n_bram_addr       + 1'b1;
+   assign s_bram_addr_next       = s_bram_addr       + 1'b1;
+
+   assign b_bram_addr_done = (b_bram_addr == bram_addr_last);
+   assign s_bram_addr_done = (s_bram_addr == bram_addr_last);
+
+   // One-clock delay line for the addresses.
+   always @(posedge clk) begin
+      b_bram_addr_dly       <= b_bram_addr;
+      n_coeff_bram_addr_dly <= n_coeff_bram_addr;
+      n_bram_addr_dly       <= n_bram_addr;
+      s_bram_addr_dly       <= s_bram_addr;
+   end
+
+   //
+   // B and N_COEFF addresses advance in lockstep during the INIT sweep:
+   // cleared when entering INIT_ZERO_ADDR, incremented for every clock
+   // whose next state is INIT_NEXT_ADDR, held otherwise.
+   //
+   always @(posedge clk) begin
+      if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) begin
+         b_bram_addr_reg       <= bram_addr_zero;
+         n_coeff_bram_addr_reg <= bram_addr_zero;
+      end else if (fsm_next_state == FSM_STATE_INIT_NEXT_ADDR) begin
+         b_bram_addr_reg       <= b_bram_addr_next;
+         n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
+      end
+   end
+
+   //
+   // S address: same pattern, but driven by the SAVE sweep.
+   //
+   always @(posedge clk) begin
+      if (fsm_next_state == FSM_STATE_SAVE_ZERO_ADDR) begin
+         s_bram_addr_reg <= bram_addr_zero;
+      end else if (fsm_next_state == FSM_STATE_SAVE_NEXT_ADDR) begin
+         s_bram_addr_reg <= s_bram_addr_next;
+      end
+   end
+
+   //
+   // A address: cleared at the end of the INIT sweep, then advanced by one
+   // word on every entry into PIPE_RELOAD until it saturates at the last
+   // operand word.
+   //
+   always @(posedge clk) begin
+      if (fsm_next_state == FSM_STATE_INIT_LAST_ADDR) begin
+         a_bram_addr_reg <= bram_addr_zero;
+      end else if (fsm_next_state == FSM_STATE_PIPE_RELOAD) begin
+         a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
+      end
+   end
+
+
+
+
+ //
+ // Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
+ //
+ // Two (SYSTOLIC_PE_LATENCY+1)-bit shift registers are used as one-hot
+ // delay timers. Each is loaded with only bit 0 set and shifted left by
+ // one position per step; the corresponding *_done flag is the top bit.
+ // Only the "lsb" timer is updated in this section, the "msb" timer is
+ // updated further down in the "Latency 2" section.
+ //
+ wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
+
+ reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb;
+ reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb;
+
+ // rotate left by one bit
+ wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next =
+ {pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
+
+ // rotate left by one bit
+ wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next =
+ {pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};
+
+ wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
+ wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];
+
+ // The timer is only touched on clocks whose next state is PIPE_CRUNCH:
+ // it is restarted when PIPE_CRUNCH is entered from INIT_LAST_ADDR or
+ // PIPE_RELOAD, and while staying in PIPE_CRUNCH it shifts until the
+ // top bit is set and then holds its value.
+ always @(posedge clk)
+ //
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR,
+ FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start;
+ FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
+ pe_latency_ab_lsb : pe_latency_ab_lsb_next;
+ endcase
+
+ //
+ // Buffers
+ //
+ // Local copies of the B, N_COEFF and N operands, organised as
+ // [systolic cycle][position inside the array]. The first index is taken
+ // from the upper address bits, the second from the lower
+ // SYSTOLIC_ARRAY_POWER address bits.
+ //
+ // Loop indices shared by all procedural for-loops in this module.
+ integer i, j;
+
+ reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ // B buffer: cleared completely in INIT_ZERO_ADDR, then filled one word
+ // per clock. The one-clock-delayed address is used as the write index,
+ // presumably to match a one-cycle read latency of the external memory
+ // (TODO: confirm).
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_ZERO_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ b_buf[i][j] <= 32'd0;
+
+ FSM_STATE_INIT_NEXT_ADDR,
+ FSM_STATE_INIT_LAST_ADDR:
+ b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
+ endcase
+
+ // N_COEFF buffer: same clear/fill scheme as the B buffer.
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_ZERO_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ n_coeff_buf[i][j] <= 32'd0;
+
+ FSM_STATE_INIT_NEXT_ADDR,
+ FSM_STATE_INIT_LAST_ADDR:
+ n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
+ endcase
+
+ // N buffer: same clear/fill scheme as the B buffer.
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_ZERO_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ n_buf[i][j] <= 32'd0;
+
+ FSM_STATE_INIT_NEXT_ADDR,
+ FSM_STATE_INIT_LAST_ADDR:
+ n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
+ endcase
+
+
+
+
+
+
+   //
+   // Cycle Counters
+   //
+   // mult_cnt_ab / _q / _qn / _s count the outer iterations of the four
+   // staggered processing stages. They are cleared when PIPE_CRUNCH is
+   // entered from INIT_LAST_ADDR and advance on every PIPE_RELOAD ->
+   // PIPE_CRUNCH transition; each stage only starts counting once the
+   // previous stage's counter is non-zero, and every counter saturates at
+   // mult_cnt_last.
+   //
+   // syst_cnt is the inner counter that selects the group of words fed
+   // through the array; syst_cnt_latency is the same counter delayed by
+   // SYSTOLIC_PE_LATENCY clocks.
+   //
+   reg  [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab;
+   reg  [ OPERAND_ADDR_WIDTH :0] mult_cnt_q;
+   reg  [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn;
+   reg  [ OPERAND_ADDR_WIDTH :0] mult_cnt_s;
+
+   reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt;
+   reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];
+
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+
+   //
+   // The limits are derived from the latched operand length (like
+   // syst_cnt_last and bram_addr_last), not from the live n_num_words
+   // input, so that a change on the input while an operation is running
+   // cannot desynchronise the counters. The latch is loaded when the
+   // operation starts, well before any of these limits is evaluated.
+   //
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_half = {1'b0, n_num_words_latch};
+
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_last = {n_num_words_latch, 1'b1};
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+   wire mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
+   wire mult_cnt_q_done  = (mult_cnt_q  == mult_cnt_last) ? 1'b1 : 1'b0;
+   wire mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
+   wire mult_cnt_s_done  = (mult_cnt_s  == mult_cnt_last) ? 1'b1 : 1'b0;
+
+   wire syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;
+
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab_next = mult_cnt_ab + 1'b1;
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_q_next  = mult_cnt_q  + 1'b1;
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn_next = mult_cnt_qn + 1'b1;
+   wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_s_next  = mult_cnt_s  + 1'b1;
+
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
+
+
+   // Inner counter: restarted on entry into PIPE_CRUNCH, then counts up
+   // and holds once the last word group has been reached.
+   always @(posedge clk)
+      //
+      if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+         //
+         case (fsm_state)
+            FSM_STATE_INIT_LAST_ADDR,
+            FSM_STATE_PIPE_RELOAD:    syst_cnt <= syst_cnt_zero;
+            FSM_STATE_PIPE_CRUNCH:    syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
+         endcase
+
+   // AB stage counter: advances on every reload.
+   always @(posedge clk)
+      //
+      if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+         //
+         case (fsm_state)
+            FSM_STATE_INIT_LAST_ADDR: mult_cnt_ab <= mult_cnt_zero;
+            FSM_STATE_PIPE_RELOAD:    mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
+         endcase
+
+   // Q stage counter: starts one reload after the AB stage.
+   always @(posedge clk)
+      //
+      if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+         //
+         case (fsm_state)
+            FSM_STATE_INIT_LAST_ADDR: mult_cnt_q <= mult_cnt_zero;
+            FSM_STATE_PIPE_RELOAD:    if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
+         endcase
+
+   // QN stage counter: starts one reload after the Q stage.
+   always @(posedge clk)
+      //
+      if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+         //
+         case (fsm_state)
+            FSM_STATE_INIT_LAST_ADDR: mult_cnt_qn <= mult_cnt_zero;
+            FSM_STATE_PIPE_RELOAD:    if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
+         endcase
+
+   // S stage counter: starts one reload after the QN stage. When done it
+   // holds its own value (the original held mult_cnt_qn here, which was a
+   // copy-paste slip from the QN counter).
+   always @(posedge clk)
+      //
+      if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+         //
+         case (fsm_state)
+            FSM_STATE_INIT_LAST_ADDR: mult_cnt_s <= mult_cnt_zero;
+            FSM_STATE_PIPE_RELOAD:    if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_s : mult_cnt_s_next;
+         endcase
+
+
+   // Delay line that aligns the inner counter with the outputs of the
+   // processing elements.
+   always @(posedge clk) begin
+      syst_cnt_dly[0] <= syst_cnt;
+      for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
+         syst_cnt_dly[i] <= syst_cnt_dly[i-1];
+   end
+
+ //
+ // Systolic Array
+ //
+ // Three parallel chains of SYSTOLIC_ARRAY_LENGTH processing elements:
+ // "ab" multiplies by words of the B buffer, "q" by words of the N_COEFF
+ // buffer and "qn" by words of the N buffer. Each element receives the
+ // chain's common A-side word, a per-element B-side word, an accumulator
+ // word (t) and a carry word (c_in), and produces a product word (p) and
+ // a carry word (c_out).
+ //
+ wire [31: 0] mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+ wire [31: 0] mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ wire [31: 0] mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+ wire [31: 0] mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ wire [31: 0] mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
+ wire [31: 0] mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ // A-side inputs. The "ab" chain takes its word straight from the A
+ // memory while mult_cnt_ab <= mult_cnt_half and zero afterwards; the
+ // "q" and "qn" chains take registered words captured from the previous
+ // chain's first product word.
+ wire [31: 0] mul_ab_a = (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
+ reg [31: 0] mul_q_a_int;
+ reg [31: 0] mul_q_a;
+ reg [31: 0] mul_qn_a_int;
+ reg [31: 0] mul_qn_a;
+
+ // Accumulator (t) and carry (c) storage, indexed like the operand
+ // buffers: [systolic cycle][position inside the array].
+ reg [31: 0] t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ reg [31: 0] t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ reg [31: 0] t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ reg [31: 0] c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ // NOTE(review): the multiplier elements are instantiated as "pe_mul"
+ // with the "modexpa7_" prefix commented out, whereas the adder and
+ // subtractor use prefixed module names - confirm which module name the
+ // build actually provides.
+ genvar syst;
+ generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
+ begin : gen_mul
+
+ /*modexpa7_*/pe_mul mul_ab_inst
+ (
+ .clk (clk),
+ .a (mul_ab_a),
+ .b (b_buf[syst_cnt][syst]),
+ .t (t_ab[syst_cnt][syst]),
+ .c_in (c_ab_in[syst_cnt][syst]),
+
+ .p (mul_ab_p[syst]),
+ .c_out (mul_ab_c_out[syst])
+ );
+
+ /*modexpa7_*/pe_mul mul_q_inst
+ (
+ .clk (clk),
+ .a (mul_q_a),
+ .b (n_coeff_buf[syst_cnt][syst]),
+ .t (t_q[syst_cnt][syst]),
+ .c_in (c_q_in[syst_cnt][syst]),
+
+ .p (mul_q_p[syst]),
+ .c_out (mul_q_c_out[syst])
+ );
+
+
+ /*modexpa7_*/pe_mul mul_qn_inst
+ (
+ .clk (clk),
+ .a (mul_qn_a),
+ .b (n_buf[syst_cnt][syst]),
+ .t (t_qn[syst_cnt][syst]),
+ .c_in (c_qn_in[syst_cnt][syst]),
+
+ .p (mul_qn_p[syst]),
+ .c_out (mul_qn_c_out[syst])
+ );
+
+ end
+ endgenerate
+
+ //
+ // c_ab
+ //
+ // Carry storage of the "ab" chain: cleared in INIT_LAST_ADDR; during
+ // PIPE_CRUNCH, once the latency timer has expired, the carry outputs of
+ // all elements are written into the row selected by the delayed
+ // systolic counter, from where they are fed back as c_in.
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_ab_in[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
+ endcase
+
+ //
+ // c_q
+ //
+ // Same as c_ab, but only updated once the "ab" stage has completed at
+ // least one iteration (mult_cnt_ab > 0).
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_q_in[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
+ endcase
+
+ //
+ // c_qn
+ //
+ // Same as c_ab, but only updated once the "q" stage has completed at
+ // least one iteration (mult_cnt_q > 0).
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_qn_in[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
+ endcase
+
+ //
+ // t_ab
+ //
+ // Accumulator storage of the "ab" chain: cleared in INIT_LAST_ADDR.
+ // During PIPE_CRUNCH the product words are written back shifted down by
+ // one word position: p[j] goes to slot j-1 of the current row, and p[0]
+ // goes to the last slot of the previous row. For row zero there is no
+ // previous row, so p[0] is not stored here.
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_ab[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done) begin
+ if (syst_cnt_latency > syst_cnt_zero)
+ t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
+ end
+
+ endcase
+
+
+ //
+ // t_q
+ //
+ // Same shifted write-back as t_ab, enabled only once mult_cnt_ab > 0.
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_q[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
+ if (syst_cnt_latency > syst_cnt_zero)
+ t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
+ end
+
+ endcase
+
+
+ //
+ // t_qn
+ //
+ // Same shifted write-back as t_ab, enabled only once mult_cnt_q > 0.
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+
+ FSM_STATE_INIT_LAST_ADDR:
+ for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_qn[i][j] <= 32'd0;
+
+ FSM_STATE_PIPE_CRUNCH:
+ if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
+ if (syst_cnt_latency > syst_cnt_zero)
+ t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
+ end
+
+ endcase
+
+ //
+ // Latency 2
+ //
+ // Second latency timer. It is restarted together with the first one on
+ // entry into PIPE_CRUNCH, but only advances while the systolic counter
+ // sits at its last value (syst_cnt_done). The FSM leaves PIPE_CRUNCH
+ // once this timer's done flag is set.
+ //
+ always @(posedge clk)
+ //
+ if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ //
+ case (fsm_state)
+ FSM_STATE_INIT_LAST_ADDR,
+ FSM_STATE_PIPE_RELOAD: pe_latency_ab_msb <= pe_latency_start;
+ FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done)
+ pe_latency_ab_msb <= pe_latency_ab_msb_done ?
+ pe_latency_ab_msb : pe_latency_ab_msb_next;
+ endcase
+
+
+   //
+   // Adder and Subtractor
+   //
+   // One 32-bit adder and one 32-bit subtractor process a single word per
+   // outer iteration. Their operands are captured when the first product
+   // word of the array (row zero) becomes valid.
+   //
+   reg         pe_add_ce;
+   reg [31: 0] pe_add_a0;
+   reg [31: 0] pe_add_a1;
+   reg [31: 0] pe_add_a2;
+   reg [31: 0] pe_add_b0;
+
+   reg         pe_add_c_in;
+   wire [31: 0] pe_add_s;
+   wire        pe_add_c_out;
+
+   reg         pe_sub_ce;
+   reg [31: 0] pe_sub_a0;
+   reg [31: 0] pe_sub_b0;
+
+   reg         pe_sub_b_in;
+   wire [31: 0] pe_sub_d;
+   wire        pe_sub_b_out;
+
+   // Row zero of the array has valid outputs.
+   wire pe_word0_valid = pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero);
+
+   // Adder runs once the Q stage is active and until the S stage is done.
+   wire pe_add_strobe = pe_word0_valid && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
+
+   // Subtractor runs once the QN stage is active.
+   wire pe_sub_strobe = pe_word0_valid && (mult_cnt_qn > mult_cnt_zero);
+
+   // Operand capture additionally requires the FSM to be in PIPE_CRUNCH.
+   wire pe_word0_capture = (fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_word0_valid;
+
+   // Clock enables are the registered strobes.
+   always @(posedge clk) begin
+      pe_add_ce <= pe_add_strobe;
+      pe_sub_ce <= pe_sub_strobe;
+   end
+
+   // Carry / borrow chaining between consecutive words: the very first
+   // word of each stage starts with a cleared carry / borrow.
+   always @(posedge clk) begin
+      if (pe_add_strobe)
+         pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
+      if (pe_sub_strobe)
+         pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
+   end
+
+
+   modexpa7_pe_add pe_add_inst
+   (
+      .clk   (clk),
+      .ce    (pe_add_ce),
+      .a     (pe_add_a2),
+      .b     (pe_add_b0),
+      .c_in  (pe_add_c_in),
+      .s     (pe_add_s),
+      .c_out (pe_add_c_out)
+   );
+
+   modexpa7_pe_sub pe_sub_inst
+   (
+      .clk   (clk),
+      .ce    (pe_sub_ce),
+      .a     (pe_sub_a0),
+      .b     (pe_sub_b0),
+      .b_in  (pe_sub_b_in),
+      .d     (pe_sub_d),
+      .b_out (pe_sub_b_out)
+   );
+
+   // Operand capture. The AB product word passes through a three-deep
+   // register chain before it reaches the adder, the QN product word is
+   // taken directly. The subtractor receives the adder output and either
+   // zero or a word of N, depending on how far the S stage has advanced.
+   always @(posedge clk) begin
+      if (pe_word0_capture) begin
+         pe_add_a0 <= mul_ab_p[0];
+         pe_add_a1 <= pe_add_a0;
+         pe_add_a2 <= pe_add_a1;
+
+         pe_add_b0 <= mul_qn_p[0];
+
+         pe_sub_a0 <= pe_add_s;
+         pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
+      end
+   end
+
+
+   //
+   // N address: swept during INIT like the other operand addresses. On
+   // entry into PIPE_RELOAD it is rewound to zero when the S stage counter
+   // equals mult_cnt_half and advanced by one word when the counter is
+   // beyond that point; otherwise it holds.
+   //
+   always @(posedge clk) begin
+      if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) begin
+         n_bram_addr_reg <= bram_addr_zero;
+      end else if (fsm_next_state == FSM_STATE_INIT_NEXT_ADDR) begin
+         n_bram_addr_reg <= n_bram_addr_next;
+      end else if (fsm_next_state == FSM_STATE_PIPE_RELOAD) begin
+         if (mult_cnt_s == mult_cnt_half)
+            n_bram_addr_reg <= bram_addr_zero;
+         else if (mult_cnt_s > mult_cnt_half)
+            n_bram_addr_reg <= n_bram_addr_next;
+      end
+   end
+
+
+   //
+   // Ready Flag Logic
+   //
+   // The flag is set by reset, dropped in the idle state on the clock
+   // where the enable trigger fires, and raised again in the STOP state.
+   //
+   reg rdy_reg = 1'b1;
+
+   assign rdy = rdy_reg;
+
+   always @(posedge clk or negedge rst_n) begin
+      if (!rst_n) begin
+         rdy_reg <= 1'b1;
+      end else begin
+         case (fsm_state)
+            FSM_STATE_IDLE: rdy_reg <= ~ena_trig;
+            FSM_STATE_STOP: rdy_reg <= 1'b1;
+         endcase
+      end
+   end
+
+
+   //
+   // A-side words of the "q" and "qn" chains
+   //
+   // The first product word of the previous chain is captured while the
+   // array is crunching and handed over to the next chain during reload.
+   //
+   wire mul_word0_capture =
+      (fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero);
+
+   always @(posedge clk) begin
+      if (mul_word0_capture) begin
+         mul_q_a_int  <= mul_ab_p[0];
+         mul_qn_a_int <= mul_q_p[0];
+      end
+   end
+
+   always @(posedge clk) begin
+      if (fsm_state == FSM_STATE_PIPE_RELOAD) begin
+         mul_q_a  <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?..
+         // the "qn" chain is fed with zero once its counter reaches mult_cnt_half
+         mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
+      end
+   end
+
+ //
+ // Debug
+ //
+ //always @(posedge clk) begin
+ //
+ //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ //$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
+ //
+ //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
+ //$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
+ //
+ //if (fsm_state == FSM_STATE_PIPE_RELOAD)
+ //$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
+ //
+ //if (fsm_state == FSM_STATE_PIPE_RELOAD)
+ //$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
+ //
+ //end
+
+
+ //
+ // Intermediate result memories: "s" stores words produced by the adder,
+ // "sn" stores words produced by the subtractor. Both are read back at
+ // the same address (s_bram_addr) during the SAVE sweep.
+ //
+ wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd;
+ reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr;
+ wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
+ reg s_bram_en;
+
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd;
+ reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr;
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
+ reg sn_bram_en;
+
+ assign s_bram_addr_rd = s_bram_addr;
+ assign sn_bram_addr_rd = s_bram_addr;
+
+ wire [31: 0] s_bram_din;
+ wire [31: 0] s_bram_dout;
+
+ wire [31: 0] sn_bram_din;
+ wire [31: 0] sn_bram_dout;
+
+ assign s_bram_din = pe_add_s;
+ assign sn_bram_din = pe_sub_d;
+
+ // Write enables: one clock after the adder / subtractor clock enable,
+ // and only for words past the mult_cnt_half point.
+ always @(posedge clk)
+ //
+ s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
+
+ always @(posedge clk)
+ //
+ sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
+
+ // Write addresses: rewound to zero at the mult_cnt_half point, then
+ // incremented after every write until the last operand word is reached.
+ // If both conditions hold in the same clock, the second (increment)
+ // assignment takes effect.
+ always @(posedge clk) begin
+ //
+ if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
+ if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
+ end
+
+ always @(posedge clk) begin
+ //
+ if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
+ if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
+ end
+
+ // Port A is used for writing only (its read output is left
+ // unconnected), port B for reading.
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_s (.clk(clk),
+ .a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
+ .b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_sn (.clk(clk),
+ .a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
+ .b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
+
+
+   //
+   // Result write-back
+   //
+   // r_bram_en is high one clock after the FSM was in one of the first two
+   // SAVE states; the external write strobe follows one clock later.
+   //
+   reg r_bram_en;
+
+   always @(posedge clk) begin
+      if ((fsm_state == FSM_STATE_SAVE_ZERO_ADDR) || (fsm_state == FSM_STATE_SAVE_NEXT_ADDR))
+         r_bram_en <= 1'b1;
+      else
+         r_bram_en <= 1'b0;
+   end
+
+
+   reg r_bram_wr_reg;
+
+   assign r_bram_wr = r_bram_wr_reg;
+
+   always @(posedge clk) begin
+      r_bram_wr_reg <= r_bram_en;
+   end
+
+
+   // The adder result is selected when the subtraction borrowed and the
+   // addition did not carry; otherwise the subtractor result is used.
+   wire r_select_s_over_sn;
+
+   assign r_select_s_over_sn = pe_sub_b_out & ~pe_add_c_out;
+
+
+   reg [31: 0] r_bram_in_reg;
+
+   assign r_bram_in = r_bram_in_reg;
+
+   // Output data and address are registered together while the enable is
+   // active; the address is the one-clock-delayed read address.
+   always @(posedge clk) begin
+      if (r_bram_en) begin
+         r_bram_in_reg   <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
+         r_bram_addr_reg <= s_bram_addr_dly;
+      end
+   end
+
+
+   //
+   // FSM Transition Logic
+   //
+   // State register with active-low asynchronous reset.
+   //
+   always @(posedge clk or negedge rst_n) begin
+      if (!rst_n) fsm_state <= FSM_STATE_IDLE;
+      else        fsm_state <= fsm_next_state;
+   end
+
+   //
+   // Next-state function. Any state value without an explicit branch
+   // falls through to STOP.
+   //
+   always @* begin
+      case (fsm_state)
+
+         // wait for the enable trigger
+         FSM_STATE_IDLE: begin
+            if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
+            else          fsm_next_state = FSM_STATE_IDLE;
+         end
+
+         // operand load sweep
+         FSM_STATE_INIT_ZERO_ADDR: begin
+            fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+         end
+
+         FSM_STATE_INIT_NEXT_ADDR: begin
+            if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
+            else                  fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
+         end
+
+         FSM_STATE_INIT_LAST_ADDR: begin
+            fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+         end
+
+         // stay in CRUNCH until the last word group was issued and the
+         // second latency timer has expired
+         FSM_STATE_PIPE_CRUNCH: begin
+            if (syst_cnt_done)
+               fsm_next_state = pe_latency_ab_msb_done ? FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
+            else
+               fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+         end
+
+         // either start the next outer iteration or move on to saving
+         FSM_STATE_PIPE_RELOAD: begin
+            if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
+            else                 fsm_next_state = FSM_STATE_PIPE_CRUNCH;
+         end
+
+         // result save sweep
+         FSM_STATE_SAVE_ZERO_ADDR: begin
+            fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+         end
+
+         FSM_STATE_SAVE_NEXT_ADDR: begin
+            if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
+            else                  fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
+         end
+
+         FSM_STATE_SAVE_LAST_ADDR: begin
+            fsm_next_state = FSM_STATE_STOP;
+         end
+
+         FSM_STATE_STOP: begin
+            fsm_next_state = FSM_STATE_IDLE;
+         end
+
+         default: begin
+            fsm_next_state = FSM_STATE_STOP;
+         end
+
+      endcase
+   end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================