aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-27 10:43:49 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-27 10:43:49 +0300
commit9f77c4f559daf20e8b495e26003178c57da93fe2 (patch)
treec02185af377ea7941ff47ff419397b24eff0e3c3
parent76e22798dfa410c655dcec555f73bb3025f521e4 (diff)
Work in progress.
-rw-r--r--src/rtl/modexpa7_systolic_multiplier.v8
-rw-r--r--src/rtl/modexpa7_systolic_multiplier_array.v335
-rw-r--r--src/rtl/modexpa7_systolic_multiplier_fix.v1202
3 files changed, 1391 insertions, 154 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index 9d96f98..32ed543 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -292,7 +292,8 @@ module modexpa7_systolic_multiplier #
bram_p
( .clk(clk),
.a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(),
- .b_addr(p_addr_ext_rd), .b_out(p_data_out));
+ .b_addr(p_addr_ext_rd), .b_out(p_data_out)
+ );
/*
@@ -397,13 +398,14 @@ module modexpa7_systolic_multiplier #
.loader_addr_rd (loader_addr_rd),
- .pe_a_wide (),
+ .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}),
.pe_b_wide (pe_b_wide),
+ .a_bram_addr (a_bram_addr),
+
.p_bram_addr (p_addr_ext_wr),
.p_bram_in (p_data_in),
.p_bram_wr (p_wren),
-
.n_num_words (n_num_words_latch),
.p_num_words (p_num_words_latch)
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v
index 029d9d6..22d5aaf 100644
--- a/src/rtl/modexpa7_systolic_multiplier_array.v
+++ b/src/rtl/modexpa7_systolic_multiplier_array.v
@@ -42,23 +42,25 @@ module modexpa7_systolic_multiplier_array #
parameter SYSTOLIC_ARRAY_POWER = 2
)
(
- input clk,
- input rst_n,
+ input clk,
+ input rst_n,
- input ena,
- output rdy,
+ input ena,
+ output rdy,
output [OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0] loader_addr_rd,
- input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide,
- input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide,
+ input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide,
+ input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide,
+
+ output [ OPERAND_ADDR_WIDTH - 1 : 0] a_bram_addr,
- output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr,
- output [ 32 - 1 : 0] p_bram_in,
- output p_bram_wr,
+ output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr,
+ output [ 32 - 1 : 0] p_bram_in,
+ output p_bram_wr,
- input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words,
- input [ OPERAND_ADDR_WIDTH : 0] p_num_words
+ input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words,
+ input [ OPERAND_ADDR_WIDTH : 0] p_num_words
);
@@ -75,7 +77,7 @@ module modexpa7_systolic_multiplier_array #
localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
localparam [ 7: 0] FSM_STATE_MULT_START = 8'h11;
- localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12;
+ localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12;
localparam [ 7: 0] FSM_STATE_MULT_RELOAD = 8'h13;
localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h14;
@@ -138,6 +140,107 @@ module modexpa7_systolic_multiplier_array #
/*
+ * Systolic Cycle Counters
+ */
+
+ // handy values
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+ // counters
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
+
+ // handy increment values
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
+
+ // handy stop flags
+ wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
+ wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
+
+ always @(posedge clk)
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_START,
+ FSM_STATE_MULT_RELOAD:
+ //
+ syst_cnt_load <= syst_cnt_zero;
+
+ FSM_STATE_MULT_CRUNCH:
+ //
+ syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+
+ endcase
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
+ //
+ if (shreg_done_latency)
+ syst_cnt_unload <= syst_cnt_zero;
+ else if (shreg_now_unloading)
+ syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+ //
+ end
+
+
+ /*
+ * Timing Shift Registers
+ */
+
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
+ reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
+
+ wire shreg_done_load = shreg_load[syst_cnt_last];
+ wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
+ wire shreg_done_unload = shreg_unload[syst_cnt_last];
+
+ reg shreg_now_loading;
+ reg shreg_now_latency;
+ reg shreg_now_unloading;
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_MULT_START,
+ FSM_STATE_MULT_RELOAD: begin
+ //
+ shreg_now_loading <= 1'b1;
+ shreg_now_latency <= 1'b1;
+ shreg_now_unloading <= 1'b0;
+ //
+ shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+ shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1};
+ shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+ //
+ end
+ //
+ FSM_STATE_MULT_CRUNCH: begin
+ //
+ shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+ shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+ shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+ //
+ if (shreg_done_load) shreg_now_loading <= 1'b0;
+ if (shreg_done_latency) shreg_now_latency <= 1'b0;
+ if (shreg_done_latency) shreg_now_unloading <= 1'b1;
+ else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
+
+ end
+ //
+ default: begin
+ shreg_now_loading <= 1'b0;
+ shreg_now_latency <= 1'b0;
+ shreg_now_unloading <= 1'b0;
+ end
+ //
+ endcase
+
+
+ /*
* Systolic Array of Processing Elements
*/
reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
@@ -215,195 +318,125 @@ module modexpa7_systolic_multiplier_array #
assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
//
- //assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
- //
- //always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
- //
end
//
endgenerate
-
+
+ /*
+ * FIFO Reset Logic
+ */
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_START: fifo_c_rst <= 1'b1;
+ FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_START: fifo_t_rst <= 1'b1;
+ FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0;
+ endcase
+
+
/*
* Block Memory Interface
*/
// the very first address
- wire [OPERAND_ADDR_WIDTH:0] bram_addr_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_zero = {OPERAND_ADDR_WIDTH {1'b0}};
+ wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
// the very last address
- wire [OPERAND_ADDR_WIDTH:0] bram_addr_last = p_num_words_latch;
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_last = n_num_words_latch;
+ wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_last = p_num_words_latch;
// registers
- reg [OPERAND_ADDR_WIDTH:0] p_addr;
- reg [ 31:0] p_data_in;
- reg p_wren;
+ reg [OPERAND_ADDR_WIDTH - 1 : 0] a_addr;
+ reg [OPERAND_ADDR_WIDTH : 0] p_addr;
+ reg [ 32 - 1 : 0] p_data_in;
+ reg p_wren;
// handy values
- wire [OPERAND_ADDR_WIDTH:0] p_addr_next = p_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] a_addr_next = a_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH : 0] p_addr_next = p_addr + 1'b1;
// handy flags
- wire p_addr_done = (p_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
+ wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire p_addr_done = (p_addr == bram_addr_ext_last) ? 1'b1 : 1'b0;
// map top-level ports to internal registers
+ assign a_bram_addr = a_addr;
assign p_bram_addr = p_addr;
assign p_bram_in = p_data_in;
assign p_bram_wr = p_wren;
-
- /*
- * Systolic Cycle Counters
- */
-
- // handy values
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
- // counters
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
-
- // handy increment values
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
-
- // handy stop flags
- wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_START,
- FSM_STATE_MULT_RELOAD:
- //
- syst_cnt_load <= syst_cnt_zero;
-
- FSM_STATE_MULT_CRUNCH,
- //
- syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-
- endcase
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
- //
- if (shreg_done_latency)
- syst_cnt_unload <= syst_cnt_zero;
- else if (shreg_now_unloading)
- syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
- //
- end
-
-
-
- /*
- * Shift Registers
- */
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
- reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
-
- wire shreg_done_load = shreg_load[syst_cnt_last];
- wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
- wire shreg_done_unload = shreg_unload[syst_cnt_last];
-
- reg shreg_now_loading;
- reg shreg_now_latency;
- reg shreg_now_unloading;
-
+ integer j;
always @(posedge clk)
//
- case (fsm_state)
+ if (fsm_state == FSM_STATE_MULT_CRUNCH)
//
- //FSM_STATE_IDLE: begin
- //shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- //shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
- //shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- //end
- //
- FSM_STATE_MULT_START,
- FSM_STATE_MULT_RELOAD: begin
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
//
- shreg_now_loading <= 1'b1;
- shreg_now_latency <= 1'b1;
- shreg_now_unloading <= 1'b0;
- //
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- //
- end
- //
- FSM_STATE_MULT_CRUNCH: begin
- //
- shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
- shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
- shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
- //
- if (shreg_done_load) shreg_now_loading <= 1'b0;
- if (shreg_done_latency) shreg_now_latency <= 1'b0;
- if (shreg_done_latency) shreg_now_unloading <= 1'b1;
- else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
-
- end
- //
- default: begin
- shreg_now_loading <= 1'b0;
- shreg_now_latency <= 1'b0;
- shreg_now_unloading <= 1'b0;
- end
- //
- endcase
+ if (shreg_now_loading) begin
+ pe_a[j] <= (p_addr > {1'b0, a_addr}) ? 32'd0 : pe_a_wide[32 * (j + 1) - 1 -: 32];
+ pe_b[j] <= pe_b_wide[32 * (j + 1) - 1 -: 32];
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ end
+// /*
+// *
+// */
+// always @(posedge clk)
+// //
+// case (fsm_next_state)
+// FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
+// default: p_wren <= 1'b0;
+// endcase
+//
/*
- *
- */
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
- default: p_wren <= 1'b0;
- endcase
-
- /*
- *
+ * Block Memory Address Control
*/
- always @(posedge clk)
+ always @(posedge clk) begin
//
case (fsm_state)
FSM_STATE_MULT_START: p_addr <= bram_addr_zero;
FSM_STATE_MULT_RELOAD: p_addr <= p_addr_next;
+ endcase
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_START: a_addr <= bram_addr_zero;
+ FSM_STATE_MULT_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
endcase
+ //
+ end
/*
- * Loader Control
+ * Loader Address Control
*/
reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr;
assign loader_addr_rd = loader_addr;
- integer j;
always @(posedge clk)
//
case (fsm_next_state)
-
- FSM_STATE_MULT_START,
+ //
+ FSM_STATE_MULT_START,
FSM_STATE_MULT_RELOAD:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr <= syst_cnt_zero;
-
+ loader_addr <= syst_cnt_zero;
+ //
FSM_STATE_MULT_CRUNCH:
//
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-
+ loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+ //
endcase
@@ -433,7 +466,7 @@ module modexpa7_systolic_multiplier_array #
//
FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_CRUNCH;
+ else fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_RELOAD: if (p_addr_done) fsm_next_state = FSM_STATE_MULT_FINAL;
else fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP;
diff --git a/src/rtl/modexpa7_systolic_multiplier_fix.v b/src/rtl/modexpa7_systolic_multiplier_fix.v
new file mode 100644
index 0000000..40b2144
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier_fix.v
@@ -0,0 +1,1202 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier #
+ (
+ //
+ // This sets the address widths of memory buffers. Internal data
+ // width is 32 bits, so for e.g. 2048-bit operands buffers must store
+ // 2048 / 32 = 64 words, and these need 6-bit address bus, because
+ // 2 ** 6 = 64.
+ //
+ parameter OPERAND_ADDR_WIDTH = 4,
+
+ //
+ // Explain.
+ //
+ parameter SYSTOLIC_ARRAY_POWER = 1
+ )
+ (
+ input clk,
+ input rst_n,
+
+ input ena,
+ output rdy,
+
+ output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,
+
+ input [ 32-1:0] a_bram_out,
+ input [ 32-1:0] b_bram_out,
+ input [ 32-1:0] n_bram_out,
+ input [ 32-1:0] n_coeff_bram_out,
+
+ output [ 32-1:0] r_bram_in,
+ output r_bram_wr,
+
+ input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
+ );
+
+
+ //
+ // Include Settings
+ //
+ `include "pe/modexpa7_primitive_switch.v"
+ `include "modexpa7_settings.v"
+
+
+ //
+ // FSM Declaration
+ //
+ localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
+
+ localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;
+
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;
+
+ localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;
+
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;
+
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;
+
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;
+
+ localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71;
+ localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72;
+ localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73;
+
+ localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
+
+ //
+ // FSM State / Next State
+ //
+ reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
+ reg [ 7: 0] fsm_next_state;
+
+
+ //
+ // Enable Delay and Trigger
+ //
+ reg ena_dly = 1'b0;
+
+ /* delay enable by one clock cycle */
+ always @(posedge clk) ena_dly <= ena;
+
+ /* trigger new operation when enable goes high */
+ wire ena_trig = ena && !ena_dly;
+
+
+ //
+ // Ready Flag Logic
+ //
+ reg rdy_reg = 1'b1;
+ assign rdy = rdy_reg;
+
+ always @(posedge clk or negedge rst_n)
+
+ /* reset flag */
+ if (rst_n == 1'b0) rdy_reg <= 1'b1;
+ else begin
+
+ /* clear flag when operation is started */
+ if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
+
+ /* set flag after operation is finished */
+ if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
+
+ end
+
+
+ //
+ // Parameters Latch
+ //
+ reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;
+
+ /* save number of words in a and b when new operation starts */
+ always @(posedge clk)
+ //
+ if (fsm_next_state == FSM_STATE_LOAD_B_START)
+ ab_num_words_latch <= ab_num_words;
+
+
+ //
+ // Systolic Cycle Counters
+ //
+
+ /* handy values */
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+ /* counters */
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
+
+ /* handy increment values */
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
+
+ /* handy stop flags */
+ wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0;
+ wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
+ wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
+
+ /* delayed load counter */
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
+ always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
+
+
+ //
+ // Multiplier Iteration Counter
+ //
+
+ /* handy values */
+ wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
+ wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
+
+ /* counter */
+ reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;
+
+ /* handy increment value and stop flag */
+ wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
+ wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
+
+
+ //
+ // Initialization Counter Control Logic
+ //
+ always @(posedge clk) begin
+ //
+ case (fsm_state)
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero;
+
+ FSM_STATE_LOAD_B_SHIFT,
+ FSM_STATE_LOAD_N_COEFF_SHIFT,
+ FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next;
+ endcase
+ //
+ case (fsm_state)
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero;
+
+ FSM_STATE_LOAD_B_WRITE,
+ FSM_STATE_LOAD_N_COEFF_WRITE,
+ FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
+ endcase
+ //
+ end
+
+
+ //
+ // Operand Loader
+ //
+
+ /*
+ * Explain how parallelized loader works here...
+ *
+ */
+
+ /* loader banks */
+ localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
+ localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
+ localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;
+
+ /* loader input */
+ reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
+
+ /* loader output */
+ wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];
+
+ /* generate parallelized loader */
+
+ //
+ // Loader currently stores B, N_COEFF and N, it can be coded another way
+ // to initially store B, then AB, then Q. Some memory can be saved thay way.
+ // Maybe later...
+ //
+
+ genvar i;
+ generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+ //
+ begin : gen_bram_1rw_readfirst_loader
+ //
+ bram_1rw_readfirst #
+ (
+ .MEM_WIDTH (32),
+ .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2)
+ )
+ bram_loader
+ (
+ .clk (clk),
+ .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}),
+ .a_wr (loader_wren[i]),
+ .a_in (loader_din[i]),
+ .a_out (loader_dout[i])
+ );
+ //
+ end
+ //
+ endgenerate
+
+
+ //
+ // Block Memory Addresses
+ //
+
+ /*
+ * Explain why there are two memory sizes.
+ *
+ */
+
+ /* the very first addresses */
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}};
+ wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+
+ /* the very last addresses */
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch};
+ wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1};
+
+ /* address registers */
+ reg [OPERAND_ADDR_WIDTH-1:0] a_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
+ reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext;
+ reg [OPERAND_ADDR_WIDTH-1:0] q_addr;
+ reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext;
+ reg [OPERAND_ADDR_WIDTH-1:0] s_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] sn_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
+
+ /* handy increment values */
+ wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
+
+ /* handy stop flags */
+ wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+ wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+ wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
+
+ /* delayed B address */
+ reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
+ always @(posedge clk) b_addr_dly <= b_addr;
+
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly;
+ always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
+
+ reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly;
+ always @(posedge clk) n_addr_dly <= n_addr;
+
+ /* map registers to top-level ports */
+ assign a_bram_addr = a_addr;
+ assign b_bram_addr = b_addr;
+ assign n_coeff_bram_addr = n_coeff_addr;
+ assign n_bram_addr = n_addr;
+ assign r_bram_addr = r_addr;
+
+
+ //
+ // Flag
+ //
+ reg flag_select_s;
+
+
+ //
+ // Memory Address Control Logic
+ //
+ always @(posedge clk) begin
+ //
+ case (fsm_next_state)
+ FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero;
+ FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero;
+ FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero;
+
+ FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next;
+ FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next;
+ FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
+ endcase
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_Q_N_RELOAD:
+ if (qn_addr_ext == {1'b0, bram_addr_last})
+ n_addr <= bram_addr_zero;
+ else if (qn_addr_ext > {1'b0, bram_addr_last})
+ n_addr <= n_addr_next;
+
+ endcase
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
+ FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
+ endcase
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
+ FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
+ endcase
+ //
+ end
+
+
+ //
+ // Internal Memories
+ //
+
+ /* memory inputs */
+ reg [31: 0] ab_data_in;
+ reg [31: 0] q_data_in;
+ reg [31: 0] qn_data_in;
+ wire [31: 0] s_data_in;
+ wire [31: 0] sn_data_in;
+ reg [31: 0] r_data_in;
+
+ /* memory outputs */
+ wire [31: 0] ab_data_out;
+ wire [31: 0] q_data_out;
+ wire [31: 0] qn_data_out;
+ wire [31: 0] s_data_out;
+ wire [31: 0] sn_data_out;
+
+ /* write enables */
+ reg ab_wren;
+ reg q_wren;
+ reg qn_wren;
+ reg s_wren;
+ reg sn_wren;
+ reg r_wren;
+
+ /* map */
+ assign r_bram_in = r_data_in;
+ assign r_bram_wr = r_wren;
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+ bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+ bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
+
+
+ //
+ // Wide Operand Loader
+ //
+ integer j;
+
+ /* shift logic */
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_B_SHIFT: begin
+
+ /* update the rightmost part of loader buffer */
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+
+ /* shift the loader buffer to the left */
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_din[j-1] <= loader_din[j];
+
+ end
+ //
+ FSM_STATE_LOAD_N_COEFF_SHIFT: begin
+
+ /* update the rightmost part of loader buffer */
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
+
+ /* shift the loader buffer to the left */
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_din[j-1] <= loader_din[j];
+
+ end
+ //
+ FSM_STATE_LOAD_N_SHIFT: begin
+
+ /* update the rightmost part of loader buffer */
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
+
+ /* shift the loader buffer to the left */
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_din[j-1] <= loader_din[j];
+
+ end
+ //
+ endcase
+
+
+ /* write enable logic */
+ always @(posedge clk)
+ //
+ case (fsm_next_state)
+
+ FSM_STATE_LOAD_B_WRITE,
+ FSM_STATE_LOAD_N_COEFF_WRITE,
+ FSM_STATE_LOAD_N_WRITE:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_wren[j] <= 1'b1;
+
+ default:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_wren[j] <= 1'b0;
+
+ endcase
+
+ /* loader address update logic */
+ always @(posedge clk) begin
+ //
+ case (fsm_state)
+
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_LOAD_N_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= syst_cnt_zero;
+
+ FSM_STATE_LOAD_B_WRITE,
+ FSM_STATE_LOAD_N_COEFF_WRITE,
+ FSM_STATE_LOAD_N_WRITE:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
+
+ endcase
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START,
+ FSM_STATE_MULT_A_B_RELOAD,
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+ FSM_STATE_MULT_Q_N_RELOAD:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= syst_cnt_zero;
+
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
+ endcase
+ //
+ case (fsm_next_state)
+
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_MULT_A_B_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
+
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_MULT_AB_N_COEFF_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
+
+ FSM_STATE_LOAD_N_START,
+ FSM_STATE_MULT_Q_N_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
+
+ endcase
+ //
+ end
+
+
+ //
+ // Systolic Array of Processing Elements
+ //
+ reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [31: 0] pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];
+
+
+ //
+ // These can be turned into a FIFO (maybe later?)...
+ //
+ //reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
+ //reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
+
+ reg fifo_c_rst;
+ reg fifo_t_rst;
+
+ wire fifo_c_wren;
+ wire fifo_c_rden;
+
+ wire fifo_t_wren;
+ wire fifo_t_rden;
+
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din;
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout;
+
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din;
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout;
+
+ /**/
+ modexpa7_simple_fifo #
+ (
+ .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
+ .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
+ )
+ fifo_c
+ (
+ .clk (clk),
+ .rst (fifo_c_rst),
+ .wr_en (fifo_c_wren),
+ .d_in (fifo_c_din),
+ .rd_en (fifo_c_rden),
+ .d_out (fifo_c_dout)
+ );
+
+ modexpa7_simple_fifo #
+ (
+ .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
+ .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
+ )
+ fifo_t
+ (
+ .clk (clk),
+ .rst (fifo_t_rst),
+ .wr_en (fifo_t_wren),
+ .d_in (fifo_t_din),
+ .rd_en (fifo_t_rden),
+ .d_out (fifo_t_dout)
+ );
+
+ generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+ begin : modexpa7_systolic_pe_multiplier
+ modexpa7_systolic_pe systolic_pe_inst
+ (
+ .clk (clk),
+ .a (pe_a[i]),
+ .b (pe_b[i]),
+ .t (pe_t[i]),
+ .c_in (pe_c_in[i]),
+ .p (pe_p[i]),
+ .c_out (pe_c_out[i])
+ );
+ assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
+ assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
+ assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
+ always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
+ end
+ endgenerate
+
+
+
+
+
+ //
+ // Shift Registers
+ //
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
+ reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
+
+ wire shreg_done_load = shreg_load[syst_cnt_last];
+ wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
+ wire shreg_done_unload = shreg_unload[syst_cnt_last];
+
+ reg shreg_now_loading;
+ reg shreg_now_latency;
+ reg shreg_now_unloading;
+
+ reg shreg_done_latency_dly;
+
+ always @(posedge clk)
+ shreg_done_latency_dly <= shreg_done_latency;
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_LOAD_N_FINAL: begin
+ shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+ shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
+ shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+ end
+ //
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START,
+ FSM_STATE_MULT_A_B_RELOAD,
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+ FSM_STATE_MULT_Q_N_RELOAD: begin
+ shreg_now_loading <= 1'b1;
+ shreg_now_latency <= 1'b1;
+ shreg_now_unloading <= 1'b0;
+ shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+ shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
+ shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+ end
+ //
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: begin
+ shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+ shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+ shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+
+ if (shreg_done_load) shreg_now_loading <= 1'b0;
+ if (shreg_done_latency) shreg_now_latency <= 1'b0;
+ if (shreg_done_latency) shreg_now_unloading <= 1'b1;
+ else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
+
+ end
+ //
+ default: begin
+ shreg_now_loading <= 1'b0;
+ shreg_now_latency <= 1'b0;
+ shreg_now_unloading <= 1'b0;
+ end
+ //
+ endcase
+
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START: fifo_c_rst <= 1'b1;
+
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START: fifo_t_rst <= 1'b1;
+
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0;
+ endcase
+
+
+ reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] pe_p_msb_dly;
+
+ always @(posedge clk)
+ //
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j];
+
+ wire [31: 0] pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;
+ assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};
+
+
+
+ reg shreg_now_unloading_dly;
+ always @(posedge clk)
+ shreg_now_unloading_dly <= shreg_now_unloading;
+
+ assign fifo_c_wren = shreg_now_unloading_dly;
+ assign fifo_c_rden = shreg_now_loading;
+
+ assign fifo_t_wren = shreg_now_unloading_dly;
+ assign fifo_t_rden = shreg_now_loading;
+
+
+
+
+ always @(posedge clk) begin
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
+ FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
+ FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero;
+ ab_addr_ext <= bram_addr_ext_zero;
+ end
+
+ FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
+ FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next;
+ ab_addr_ext <= ab_addr_ext_next;
+ end
+ endcase
+ //
+ case (fsm_state)
+
+ FSM_STATE_MULT_Q_N_RELOAD: begin
+ if (qn_addr_ext == {1'b0, bram_addr_last}) begin
+ s_addr <= bram_addr_zero;
+ sn_addr <= bram_addr_zero;
+ end
+
+ if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
+ s_addr <= s_addr_next;
+ sn_addr <= sn_addr_next;
+ end
+
+ if (qn_addr_ext == bram_addr_ext_last) begin
+ s_addr <= bram_addr_zero;
+ sn_addr <= bram_addr_zero;
+ end
+
+ end
+
+ FSM_STATE_MULT_Q_N_FINAL,
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE: begin
+ s_addr <= !s_addr_done ? s_addr_next : s_addr;
+ sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
+ end
+
+ endcase
+
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next;
+ endcase
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero;
+ FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr;
+ endcase
+
+ //
+ end
+
+ always @(posedge clk) begin
+ //
+ if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
+ ab_wren <= shreg_done_latency_dly;
+ ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+ end else begin
+ ab_wren <= 1'b0;
+ ab_data_in <= 32'hXXXXXXXX;
+ end
+ //
+ if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
+ q_wren <= shreg_done_latency_dly;
+ q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+ end else begin
+ q_wren <= 1'b0;
+ q_data_in <= 32'hXXXXXXXX;
+ end
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
+ qn_wren <= shreg_done_latency_dly;
+ qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+ end else begin
+ qn_wren <= 1'b0;
+ qn_data_in <= 32'hXXXXXXXX;
+ end
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START: r_wren <= 1'b1;
+ FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done;
+ default: r_wren <= 1'b0;
+ endcase
+ //
+ end
+
+
+ always @(posedge clk)
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START,
+ FSM_STATE_MULT_A_B_RELOAD,
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+ FSM_STATE_MULT_Q_N_RELOAD:
+ //
+ syst_cnt_load <= syst_cnt_zero;
+
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH:
+ //
+ syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+
+ endcase
+
+
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: begin
+
+ if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero;
+ else if (shreg_now_unloading)
+ syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+
+ end
+ endcase
+
+
+ //
+ // T and C_IN can be moved to a separate code block
+ //
+ always @(posedge clk) begin
+ //
+ if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ //
+ if (shreg_now_loading) begin
+ pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
+ pe_b[j] <= loader_dout[j];
+ //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
+ //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ //pe_t[j] <= 32'hXXXXXXXX;
+ //pe_c_in[j] <= 32'hXXXXXXXX;
+ end
+ //
+ if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ //
+ if (shreg_now_loading) begin
+ pe_a[j] <= ab_data_out;
+ pe_b[j] <= loader_dout[j];
+ //pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
+ //pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ //pe_t[j] <= 32'hXXXXXXXX;
+ //pe_c_in[j] <= 32'hXXXXXXXX;
+ end
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ //
+ if (shreg_now_loading) begin
+ pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
+ pe_b[j] <= loader_dout[j];
+ //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
+ //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ //pe_t[j] <= 32'hXXXXXXXX;
+ //pe_c_in[j] <= 32'hXXXXXXXX;
+ end
+ //
+
+ //
+ end
+
+
+ //
+ // Adder
+ //
+ /*
+ * This adder is used to calculate S = AB + QN.
+ *
+ */
+ reg add1_ce; // clock enable
+ reg [31: 0] add1_s; // sum output
+ wire add1_c_in; // carry input
+ wire [31: 0] add1_a; // A-input
+ reg [31: 0] add1_b; // B-input
+ reg add1_c_in_mask; // flag to not carry anything into the very first word
+ reg add1_c_out; // carry output
+
+ /* add masking into carry feedback chain */
+ assign add1_c_in = add1_c_out & ~add1_c_in_mask;
+
+ /* mask carry for the very first word of N */
+ //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+
+ always @(posedge clk)
+ //
+ if (add1_ce)
+ //
+ {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
+
+ assign add1_a = qn_data_in;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
+ else
+ add1_b <= 32'hXXXXXXXX;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
+ else
+ add1_c_in_mask <= 1'b0;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ add1_ce <= shreg_done_latency_dly;
+ else
+ add1_ce <= 1'b0;
+
+
+ assign s_data_in = add1_s;
+ assign sn_data_in = sub1_d;
+
+ always @(posedge clk) begin
+ //
+ s_wren <= add1_ce;
+ sn_wren <= sub1_ce;
+ end
+
+
+
+ //
+ // Subtractor
+ //
+ /*
+ * This subtractor is used to calculate SN = S - N.
+ *
+ */
+ reg sub1_ce; // clock enable
+ reg [31: 0] sub1_d; // difference output
+ wire sub1_b_in; // borrow input
+ wire [31: 0] sub1_a; // A-input
+ reg [31: 0] sub1_b; // B-input
+ reg sub1_b_in_mask; // flag to not borrow anything from the very first word
+ reg sub1_b_out; // borrow output
+
+ /* add masking into borrow feedback chain */
+ assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
+
+ always @(posedge clk)
+ //
+ if (sub1_ce)
+ //
+ {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
+
+ assign sub1_a = add1_s;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
+ else
+ sub1_b <= 32'hXXXXXXXX;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
+ else
+ sub1_b_in_mask <= 1'b0;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
+ else
+ sub1_ce <= 1'b0;
+
+
+ assign s_data_in = add1_s;
+
+ always @(posedge clk)
+ //
+ s_wren <= add1_ce;
+
+
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
+ flag_select_s <= sub1_b_out & ~add1_c_out;
+
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE:
+ r_data_in <= flag_select_s ? s_data_out : sn_data_out;
+ endcase
+
+
+
+ //
+ // FSM Process
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
+ else fsm_state <= fsm_next_state;
+
+
+ //
+ // FSM Transition Logic
+ //
+ always @* begin
+ //
+ fsm_next_state = FSM_STATE_STOP;
+ //
+ case (fsm_state)
+
+ FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
+ else fsm_next_state = FSM_STATE_IDLE;
+ //
+ FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+ FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
+ else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+ FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
+ else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+ FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
+ //
+ FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+ FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
+ else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+ FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
+ else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+ FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START;
+ //
+ FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+ FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
+ else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+ FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
+ else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+ FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START;
+ //
+ FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+ FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
+ else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+ FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+ FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
+ //
+ FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
+ else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+ FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START;
+ //
+ FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+ FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
+ else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+ FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+ FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
+ //
+ FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
+ FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
+ else fsm_next_state = FSM_STATE_SAVE_WRITE;
+ FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
+ //
+ FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
+
+ endcase
+ //
+ end
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================