aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/rtl/modexpa7_systolic_multiplier.v152
-rw-r--r--src/rtl/modexpa7_systolic_multiplier_array.v452
2 files changed, 565 insertions, 39 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index a1e141e..9d96f98 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -143,6 +143,7 @@ module modexpa7_systolic_multiplier #
* Parameters Latch
*/
reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
+ reg [OPERAND_ADDR_WIDTH :0] p_num_words_latch;
// save number of words in n when new operation starts
always @(posedge clk)
@@ -200,20 +201,25 @@ module modexpa7_systolic_multiplier #
// loader input
- reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg loader_wren[0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_wr;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_rd;
+ reg loader_wren;
reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
// loader output
- wire [ 32-1:0] loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [ 32-1:0] loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];
+
+ // array_input
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] pe_a_wide;
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] pe_b_wide;
// generate parallelized loader
genvar i;
generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
//
- begin : gen_bram_1rw_readfirst_loader
+ begin : gen_bram_1rw_1ro_readfirst_loader
//
- bram_1rw_readfirst #
+ bram_1rw_1ro_readfirst #
(
.MEM_WIDTH (32),
.MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH)
@@ -221,11 +227,15 @@ module modexpa7_systolic_multiplier #
bram_loader
(
.clk (clk),
- .a_addr (loader_addr[i]),
- .a_wr (loader_wren[i]),
+ .a_addr (loader_addr_wr),
+ .a_wr (loader_wren),
.a_in (loader_din[i]),
- .a_out (loader_dout[i])
- );
+ .a_out (),
+ .b_addr (loader_addr_rd),
+ .b_out (loader_dout[i])
+ );
+ //
+ assign pe_b_wide[32 * (i + 1) - 1 -: 32] = loader_dout[i];
//
end
//
@@ -250,22 +260,40 @@ module modexpa7_systolic_multiplier #
// address registers
reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
+ wire [OPERAND_ADDR_WIDTH :0] p_addr_ext_wr;
+ reg [OPERAND_ADDR_WIDTH :0] p_addr_ext_rd;
// handy increment values
- wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] p_addr_ext_rd_next = p_addr_ext_rd + 1'b1; // next value of the extended read address (was derived from b_addr by mistake)
+
+ // write enables
+ wire p_wren;
+
+ // data buses
+ wire [31: 0] p_data_in;
+ wire [31: 0] p_data_out;
// handy stop flags
- wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire p_addr_ext_rd_done = (p_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
// delayed addresses
reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
-
+
always @(posedge clk) b_addr_dly <= b_addr;
// map registers to top-level ports
assign b_bram_addr = b_addr;
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+ bram_p
+ ( .clk(clk),
+ .a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(),
+ .b_addr(p_addr_ext_rd), .b_out(p_data_out));
+
/*
* Loader Data Input
@@ -297,17 +325,8 @@ module modexpa7_systolic_multiplier #
always @(posedge clk)
//
case (fsm_next_state)
-
- FSM_STATE_LOAD_WRITE:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_wren[j] <= 1'b1;
-
- default:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_wren[j] <= 1'b0;
-
+ FSM_STATE_LOAD_WRITE: loader_wren <= 1'b1;
+ default: loader_wren <= 1'b0;
endcase
@@ -317,17 +336,15 @@ module modexpa7_systolic_multiplier #
always @(posedge clk)
//
- case (fsm_state)
+ case (fsm_state)
- FSM_STATE_LOAD_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr[j] <= load_syst_cnt_zero;
-
- FSM_STATE_LOAD_WRITE:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr[j] <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
+ FSM_STATE_LOAD_START:
+ //
+ loader_addr_wr <= load_syst_cnt_zero;
+
+ FSM_STATE_LOAD_WRITE:
+ //
+ loader_addr_wr <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
endcase
@@ -344,12 +361,68 @@ module modexpa7_systolic_multiplier #
//
end
+
+ /*
+ * Multiplier Array
+ */
+ reg pe_array_ena;
+ wire pe_array_rdy;
+
+ always @(posedge clk)
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_START: pe_array_ena <= 1'b1;
+ default: pe_array_ena <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_START: p_num_words_latch <= {n_num_words_latch, 1'b1};
+ endcase
+
+
+ modexpa7_systolic_multiplier_array #
+ (
+ .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH),
+ .SYSTOLIC_ARRAY_POWER (SYSTOLIC_ARRAY_POWER)
+ )
+ systolic_pe_array
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+
+ .ena (pe_array_ena),
+ .rdy (pe_array_rdy),
+
+ .loader_addr_rd (loader_addr_rd),
+
+ .pe_a_wide (),
+ .pe_b_wide (pe_b_wide),
+
+ .p_bram_addr (p_addr_ext_wr),
+ .p_bram_in (p_data_in),
+ .p_bram_wr (p_wren),
+
+
+ .n_num_words (n_num_words_latch),
+ .p_num_words (p_num_words_latch)
+ );
+
+
+
+
+
+
+
+
+
/*
* FSM Process
- */
+ - */
always @(posedge clk or negedge rst_n)
//
if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
@@ -373,13 +446,14 @@ module modexpa7_systolic_multiplier #
else fsm_next_state = FSM_STATE_LOAD_SHIFT;
FSM_STATE_LOAD_WRITE: if (load_syst_cnt_done) fsm_next_state = FSM_STATE_LOAD_FINAL;
else fsm_next_state = FSM_STATE_LOAD_SHIFT;
- FSM_STATE_LOAD_FINAL: fsm_next_state = FSM_STATE_STOP;
+ FSM_STATE_LOAD_FINAL: fsm_next_state = FSM_STATE_MULT_START;
//
- //FSM_STATE_MULT_START:
- //FSM_STATE_MULT_CRUNCH:
- //FSM_STATE_MULT_FINAL:
+ FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH;
+ FSM_STATE_MULT_CRUNCH: if (pe_array_rdy) fsm_next_state = FSM_STATE_MULT_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_CRUNCH;
+ FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP;
//
- FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
+ FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
//
endcase
//
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v
new file mode 100644
index 0000000..029d9d6
--- /dev/null
+++ b/src/rtl/modexpa7_systolic_multiplier_array.v
@@ -0,0 +1,452 @@
+//======================================================================
+//
+// modexpa7_systolic_multiplier_array.v
+// -----------------------------------------------------------------------------
+// Systolic Montgomery multiplier Processing Element Array
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module modexpa7_systolic_multiplier_array #
+ (
+ parameter OPERAND_ADDR_WIDTH = 4, // sizes n_num_words, p_num_words and p_bram_addr
+ parameter SYSTOLIC_ARRAY_POWER = 2 // the wide PE buses carry 2**SYSTOLIC_ARRAY_POWER 32-bit words
+ )
+ (
+ input clk, // all registers update on the rising edge
+ input rst_n, // asynchronous reset, active low (fsm_state and rdy_reg)
+
+ input ena, // a 0 -> 1 transition starts a new operation
+ output rdy, // high after reset and after the STOP state, low while busy
+
+ output [OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0] loader_addr_rd, // driven by the loader_addr register
+
+ input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide, // NOTE(review): not referenced anywhere in the module body shown here
+ input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide, // NOTE(review): not referenced anywhere in the module body shown here
+
+ output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr, // mirrors the p_addr register
+ output [ 32 - 1 : 0] p_bram_in, // mirrors the p_data_in register
+ output p_bram_wr, // mirrors the p_wren register
+
+ input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words, // sampled into n_num_words_latch when an operation starts
+ input [ OPERAND_ADDR_WIDTH : 0] p_num_words // sampled into p_num_words_latch when an operation starts
+ );
+
+
+ /*
+ * Include Settings
+ */
+ `include "pe/modexpa7_primitive_switch.v"
+ `include "modexpa7_settings.v"
+
+
+ /*
+ * FSM Declaration (8-bit state encoding)
+ */
+ localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; // waits for ena_trig
+
+ localparam [ 7: 0] FSM_STATE_MULT_START = 8'h11; // one cycle, always followed by MULT_CRUNCH
+ localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12; // held until shreg_done_unload is set
+ localparam [ 7: 0] FSM_STATE_MULT_RELOAD = 8'h13; // goes to MULT_FINAL when p_addr_done, otherwise back to MULT_CRUNCH
+ localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h14; // one cycle, always followed by STOP
+
+ localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; // sets the ready flag, then returns to IDLE
+
+
+ /*
+ * FSM State / Next State
+ */
+ reg [ 7: 0] fsm_state = FSM_STATE_IDLE; // registered state, also forced to IDLE by rst_n
+ reg [ 7: 0] fsm_next_state; // combinational, computed by the transition logic
+
+
+   /*
+    * Rising-Edge Detector for the Enable Input
+    */
+   reg  ena_dly = 1'b0;    // value of ena one clock cycle ago
+   wire ena_trig;          // single-cycle pulse on a 0 -> 1 transition of ena
+
+   // remember the previous value of enable
+   always @(posedge clk)
+       ena_dly <= ena;
+
+   // enable is high now, but was low on the previous clock
+   assign ena_trig = ena & ~ena_dly;
+
+
+   /*
+    * Ready Flag Logic
+    *
+    * rdy is high after reset, drops when an operation is triggered in the
+    * IDLE state and is raised again once the FSM passes through STOP.
+    */
+   reg rdy_reg = 1'b1;
+   assign rdy = rdy_reg;
+
+   always @(posedge clk or negedge rst_n)
+       //
+       if (!rst_n)
+           // asynchronous reset: report ready
+           rdy_reg <= 1'b1;
+       else
+           case (fsm_state)
+               // clear flag when operation is started
+               FSM_STATE_IDLE: rdy_reg <= ~ena_trig;
+               // set flag after operation is finished
+               FSM_STATE_STOP: rdy_reg <= 1'b1;
+           endcase
+
+
+   /*
+    * Parameters Latch
+    *
+    * Word counts are captured once per operation so that later changes
+    * on the input ports do not affect the operation in flight.
+    */
+   reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
+   reg [OPERAND_ADDR_WIDTH  :0] p_num_words_latch;
+
+   // capture both word counts on the trigger pulse while idle
+   always @(posedge clk)
+       //
+       if (fsm_state == FSM_STATE_IDLE)
+           if (ena_trig) begin
+               p_num_words_latch <= p_num_words;
+               n_num_words_latch <= n_num_words;
+           end
+
+
+ /*
+ * Systolic Array of Processing Elements (one entry per PE, 32 bits each)
+ */
+ reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1]; // connected to PE port .a; NOTE(review): never assigned in the code visible here
+ reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1]; // connected to PE port .b; NOTE(review): never assigned in the code visible here
+ wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1]; // connected to PE port .t; driven from a slice of fifo_t_dout
+ wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1]; // connected to PE port .c_in; driven from a slice of fifo_c_dout
+ wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1]; // connected to PE port .p
+ wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1]; // connected to PE port .c_out
+
+
+ /*
+ * FIFOs (fifo_c feeds pe_c_in, fifo_t feeds pe_t; each is as wide as the whole PE array)
+ */
+ reg fifo_c_rst; // NOTE(review): no assignment to this register in the code visible here
+ reg fifo_t_rst; // NOTE(review): no assignment to this register in the code visible here
+
+ wire fifo_c_wren; // NOTE(review): no driver in the code visible here
+ wire fifo_c_rden; // NOTE(review): no driver in the code visible here
+
+ wire fifo_t_wren; // NOTE(review): no driver in the code visible here
+ wire fifo_t_rden; // NOTE(review): no driver in the code visible here
+
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din; // NOTE(review): only referenced by a commented-out assign in the generate loop
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout; // sliced into pe_c_in[i]
+
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din; // NOTE(review): no driver in the code visible here
+ wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout; // sliced into pe_t[i]
+
+ modexpa7_simple_fifo #
+ (
+ .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH), // one 32-bit word per PE
+ .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
+ )
+ fifo_c
+ (
+ .clk (clk),
+ .rst (fifo_c_rst),
+ .wr_en (fifo_c_wren),
+ .d_in (fifo_c_din),
+ .rd_en (fifo_c_rden),
+ .d_out (fifo_c_dout)
+ );
+
+ modexpa7_simple_fifo #
+ (
+ .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH), // one 32-bit word per PE
+ .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
+ )
+ fifo_t
+ (
+ .clk (clk),
+ .rst (fifo_t_rst),
+ .wr_en (fifo_t_wren),
+ .d_in (fifo_t_din),
+ .rd_en (fifo_t_rden),
+ .d_out (fifo_t_dout)
+ );
+
+ genvar i;
+ generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+ // instantiate one processing element per array position
+ begin : gen_modexpa7_systolic_pe
+ //
+ modexpa7_systolic_pe systolic_pe_inst
+ (
+ .clk (clk),
+ .a (pe_a[i]),
+ .b (pe_b[i]),
+ .t (pe_t[i]),
+ .c_in (pe_c_in[i]),
+ .p (pe_p[i]),
+ .c_out (pe_c_out[i])
+ );
+ // PE number i takes bits [32*i+31 : 32*i] of each FIFO output word
+ assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
+ assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
+ // disabled code; it refers to pe_c_out_dly, which is not declared in the code visible here
+ //assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
+ //
+ //always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
+ //
+ end
+ //
+ endgenerate
+
+
+ /*
+ * Block Memory Interface (write port towards the p memory)
+ */
+
+ // the very first address
+ wire [OPERAND_ADDR_WIDTH:0] bram_addr_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
+
+ // the very last address (taken from the latched p_num_words value)
+ wire [OPERAND_ADDR_WIDTH:0] bram_addr_last = p_num_words_latch;
+
+ // registers
+ reg [OPERAND_ADDR_WIDTH:0] p_addr; // cleared in MULT_START, incremented in MULT_RELOAD
+ reg [ 31:0] p_data_in; // NOTE(review): never assigned in the code visible here
+ reg p_wren; // set for the cycle in which the FSM enters MULT_RELOAD
+
+ // handy values
+ wire [OPERAND_ADDR_WIDTH:0] p_addr_next = p_addr + 1'b1;
+
+ // handy flags
+ wire p_addr_done = (p_addr == bram_addr_last) ? 1'b1 : 1'b0; // ends the CRUNCH/RELOAD loop of the FSM
+
+
+ // map top-level ports to internal registers
+ assign p_bram_addr = p_addr;
+ assign p_bram_in = p_data_in;
+ assign p_bram_wr = p_wren;
+
+
+   /*
+    * Systolic Cycle Counters and Shift Registers -- Declarations
+    *
+    * Everything used by this section is declared up front: the unload
+    * counter reads shreg_done_latency and shreg_now_unloading, and an
+    * identifier must be declared before the first statement that refers
+    * to it.
+    */
+
+   // handy values
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+   // counters
+   reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
+   reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
+
+   // handy increment values
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next   = syst_cnt_load   + 1'b1;
+   wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
+
+   // handy stop flags
+   wire syst_cnt_load_done   = (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
+   wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
+
+   // one-hot shift registers, a single set bit travels through each of them
+   reg  [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
+   reg  [SYSTOLIC_PE_LATENCY  :0] shreg_latency;
+   reg  [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
+
+   // a phase is over when the travelling bit reaches the tap position
+   wire shreg_done_load    = shreg_load[syst_cnt_last];
+   wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
+   wire shreg_done_unload  = shreg_unload[syst_cnt_last];
+
+   // phase-in-progress flags
+   reg  shreg_now_loading;
+   reg  shreg_now_latency;
+   reg  shreg_now_unloading;
+
+
+   /*
+    * Systolic Cycle Counters
+    */
+
+   // load counter: restarted on START/RELOAD, counts up to syst_cnt_last
+   // and then holds its value while the FSM stays in CRUNCH
+   always @(posedge clk)
+       //
+       case (fsm_next_state)
+           FSM_STATE_MULT_START,
+           FSM_STATE_MULT_RELOAD:
+               //
+               syst_cnt_load <= syst_cnt_zero;
+
+           // label must end with a colon (it used to end with a comma)
+           FSM_STATE_MULT_CRUNCH:
+               //
+               syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+
+       endcase
+
+   // unload counter: restarted when the latency phase completes, then
+   // counts up to syst_cnt_last while unloading is in progress
+   always @(posedge clk)
+       //
+       if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
+           //
+           if (shreg_done_latency)
+               syst_cnt_unload <= syst_cnt_zero;
+           else if (shreg_now_unloading)
+               syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+           //
+       end
+
+
+   /*
+    * Shift Registers
+    */
+   always @(posedge clk)
+       //
+       case (fsm_state)
+           //
+           //FSM_STATE_IDLE: begin
+               //shreg_load    <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+               //shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
+               //shreg_unload  <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+           //end
+           //
+           // seed the load and latency registers, clear the unload register
+           FSM_STATE_MULT_START,
+           FSM_STATE_MULT_RELOAD: begin
+               //
+               shreg_now_loading   <= 1'b1;
+               shreg_now_latency   <= 1'b1;
+               shreg_now_unloading <= 1'b0;
+               //
+               shreg_load    <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+               shreg_latency <= {{SYSTOLIC_PE_LATENCY  {1'b0}}, 1'b1};
+               shreg_unload  <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+               //
+           end
+           //
+           // shift by one position per clock; the bit leaving the latency
+           // register is injected into the unload register
+           FSM_STATE_MULT_CRUNCH: begin
+               //
+               shreg_load    <= {shreg_load   [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+               shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+               shreg_unload  <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+               //
+               if (shreg_done_load)        shreg_now_loading   <= 1'b0;
+               if (shreg_done_latency)     shreg_now_latency   <= 1'b0;
+               if (shreg_done_latency)     shreg_now_unloading <= 1'b1;
+               else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
+
+           end
+           //
+           // any other state: no phase is active
+           default: begin
+               shreg_now_loading   <= 1'b0;
+               shreg_now_latency   <= 1'b0;
+               shreg_now_unloading <= 1'b0;
+           end
+           //
+       endcase
+
+
+
+   /*
+    * P Memory Write Enable
+    *
+    * Registered from fsm_next_state, so p_wren is high during the cycle
+    * the FSM spends in MULT_RELOAD and low in every other cycle.
+    */
+   always @(posedge clk)
+       //
+       if (fsm_next_state == FSM_STATE_MULT_RELOAD)
+           p_wren <= 1'b1;
+       else
+           p_wren <= 1'b0;
+
+   /*
+    * P Memory Write Address
+    *
+    * Starts from zero for every operation and advances by one after each
+    * MULT_RELOAD cycle; holds its value in all other states.
+    */
+   always @(posedge clk)
+       //
+       if (fsm_state == FSM_STATE_MULT_START)
+           p_addr <= bram_addr_zero;
+       else if (fsm_state == FSM_STATE_MULT_RELOAD)
+           p_addr <= p_addr_next;
+
+
+   /*
+    * Loader Control
+    *
+    * loader_addr is the read address presented on loader_addr_rd. It is
+    * updated by the same rule as syst_cnt_load: cleared when the FSM
+    * enters START or RELOAD, incremented up to syst_cnt_last during
+    * CRUNCH. It is a single scalar register, so no per-element loop is
+    * needed here.
+    */
+   reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr;
+
+   assign loader_addr_rd = loader_addr;
+
+   always @(posedge clk)
+       //
+       case (fsm_next_state)
+
+           FSM_STATE_MULT_START,
+           FSM_STATE_MULT_RELOAD:
+               //
+               loader_addr <= syst_cnt_zero;
+
+           FSM_STATE_MULT_CRUNCH:
+               //
+               loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+
+       endcase
+
+
+
+
+
+   /*
+    * FSM State Register (asynchronous active-low reset to IDLE)
+    */
+   always @(posedge clk or negedge rst_n)
+   begin
+       if (!rst_n) fsm_state <= FSM_STATE_IDLE;
+       else        fsm_state <= fsm_next_state;
+   end
+
+
+   /*
+    * FSM Transition Logic (combinational)
+    *
+    * IDLE -> START -> CRUNCH <-> RELOAD -> FINAL -> STOP -> IDLE
+    * Any state value without an explicit entry falls through to STOP.
+    */
+   always @*
+       //
+       case (fsm_state)
+           // wait for the trigger pulse
+           FSM_STATE_IDLE:
+               if (ena_trig) fsm_next_state = FSM_STATE_MULT_START;
+               else          fsm_next_state = FSM_STATE_IDLE;
+           //
+           FSM_STATE_MULT_START:
+               fsm_next_state = FSM_STATE_MULT_CRUNCH;
+           // stay here until the unload phase has completed
+           FSM_STATE_MULT_CRUNCH:
+               if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_RELOAD;
+               else                   fsm_next_state = FSM_STATE_MULT_CRUNCH;
+           // loop back for another pass unless the last address was reached
+           FSM_STATE_MULT_RELOAD:
+               if (p_addr_done) fsm_next_state = FSM_STATE_MULT_FINAL;
+               else             fsm_next_state = FSM_STATE_MULT_CRUNCH;
+           //
+           FSM_STATE_MULT_FINAL:
+               fsm_next_state = FSM_STATE_STOP;
+           //
+           FSM_STATE_STOP:
+               fsm_next_state = FSM_STATE_IDLE;
+           // unknown encoding
+           default:
+               fsm_next_state = FSM_STATE_STOP;
+           //
+       endcase
+
+
+endmodule
+
+//======================================================================
+// End of file
+//======================================================================