aboutsummaryrefslogtreecommitdiff
path: root/src/rtl/modexpa7_systolic_multiplier_array.v
diff options
context:
space:
mode:
Diffstat (limited to 'src/rtl/modexpa7_systolic_multiplier_array.v')
-rw-r--r--src/rtl/modexpa7_systolic_multiplier_array.v335
1 files changed, 184 insertions, 151 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v
index 029d9d6..22d5aaf 100644
--- a/src/rtl/modexpa7_systolic_multiplier_array.v
+++ b/src/rtl/modexpa7_systolic_multiplier_array.v
@@ -42,23 +42,25 @@ module modexpa7_systolic_multiplier_array #
parameter SYSTOLIC_ARRAY_POWER = 2
)
(
- input clk,
- input rst_n,
+ input clk,
+ input rst_n,
- input ena,
- output rdy,
+ input ena,
+ output rdy,
output [OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0] loader_addr_rd,
- input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide,
- input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide,
+ input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide,
+ input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide,
+
+ output [ OPERAND_ADDR_WIDTH - 1 : 0] a_bram_addr,
- output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr,
- output [ 32 - 1 : 0] p_bram_in,
- output p_bram_wr,
+ output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr,
+ output [ 32 - 1 : 0] p_bram_in,
+ output p_bram_wr,
- input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words,
- input [ OPERAND_ADDR_WIDTH : 0] p_num_words
+ input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words,
+ input [ OPERAND_ADDR_WIDTH : 0] p_num_words
);
@@ -75,7 +77,7 @@ module modexpa7_systolic_multiplier_array #
localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
localparam [ 7: 0] FSM_STATE_MULT_START = 8'h11;
- localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12;
+ localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12;
localparam [ 7: 0] FSM_STATE_MULT_RELOAD = 8'h13;
localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h14;
@@ -138,6 +140,107 @@ module modexpa7_systolic_multiplier_array #
/*
+ * Systolic Cycle Counters
+ */
+
+ // constant helpers: an all-zero count, and the final count taken from bits [OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER] of n_num_words_latch
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
+
+ // cycle counters; both are compared against syst_cnt_last by the stop flags below
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
+
+ // counter values plus one, truncated to SYSTOLIC_CNTR_WIDTH bits by the wire width
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
+
+ // stop flags: 1 while the matching counter equals syst_cnt_last, 0 otherwise
+ wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
+ wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
+
+ always @(posedge clk)
+ // load cycle counter; note the selector is fsm_next_state, not fsm_state
+ case (fsm_next_state)
+ FSM_STATE_MULT_START,
+ FSM_STATE_MULT_RELOAD:
+ // next state is MULT_START or MULT_RELOAD: restart the count from zero
+ syst_cnt_load <= syst_cnt_zero;
+
+ FSM_STATE_MULT_CRUNCH:
+ // next state is MULT_CRUNCH: count up by one, then hold once syst_cnt_load_done is set
+ syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+
+ endcase // no default branch: the counter keeps its value for any other next state
+
+ always @(posedge clk)
+ // unload cycle counter; it is only written while fsm_state is MULT_CRUNCH
+ if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
+ // shreg_done_latency clears the count and takes priority; shreg_now_unloading advances it, holding once syst_cnt_unload_done is set
+ if (shreg_done_latency)
+ syst_cnt_unload <= syst_cnt_zero;
+ else if (shreg_now_unloading)
+ syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+ // with neither flag set the counter keeps its value
+ end
+
+
+ /*
+ * Timing Shift Registers
+ */
+
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; // SYSTOLIC_NUM_CYCLES bits wide
+ reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; // SYSTOLIC_PE_LATENCY+1 bits wide
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; // SYSTOLIC_NUM_CYCLES bits wide
+
+ wire shreg_done_load = shreg_load[syst_cnt_last]; // bit of shreg_load selected at run time by syst_cnt_last
+ wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; // top bit of shreg_latency
+ wire shreg_done_unload = shreg_unload[syst_cnt_last]; // bit of shreg_unload selected at run time by syst_cnt_last
+
+ reg shreg_now_loading; // phase flag: load phase active
+ reg shreg_now_latency; // phase flag: latency phase active
+ reg shreg_now_unloading; // phase flag: unload phase active
+
+ always @(posedge clk)
+ // update the three timing shift registers and their phase flags according to the current FSM state
+ case (fsm_state)
+ // MULT_START / MULT_RELOAD: arm the registers for a new pass
+ FSM_STATE_MULT_START,
+ FSM_STATE_MULT_RELOAD: begin
+ // loading and latency flags are set, unloading flag is cleared
+ shreg_now_loading <= 1'b1;
+ shreg_now_latency <= 1'b1;
+ shreg_now_unloading <= 1'b0;
+ // shreg_load and shreg_latency get a single 1 in bit 0; shreg_unload is cleared to all zeroes
+ shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
+ shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1};
+ shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+ // end of the arming branch
+ end
+ // MULT_CRUNCH: advance the registers by one position per clock
+ FSM_STATE_MULT_CRUNCH: begin
+ // shreg_load and shreg_latency shift in 0 at bit 0; shreg_unload shifts in the top bit of shreg_latency
+ shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+ shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+ shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
+ // flag hand-over: done_load ends loading; done_latency ends latency and starts unloading; done_unload ends unloading
+ if (shreg_done_load) shreg_now_loading <= 1'b0;
+ if (shreg_done_latency) shreg_now_latency <= 1'b0;
+ if (shreg_done_latency) shreg_now_unloading <= 1'b1;
+ else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
+
+ end
+ // any other state: clear all three phase flags (the shift registers are not assigned here and keep their values)
+ default: begin
+ shreg_now_loading <= 1'b0;
+ shreg_now_latency <= 1'b0;
+ shreg_now_unloading <= 1'b0;
+ end
+ // end of state decode
+ endcase
+
+
+ /*
* Systolic Array of Processing Elements
*/
reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
@@ -215,195 +318,125 @@ module modexpa7_systolic_multiplier_array #
assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
//
- //assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
- //
- //always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
- //
end
//
endgenerate
-
+
+ /*
+ * FIFO Reset Logic
+ */
+ always @(posedge clk)
+ // fifo_c_rst: set in MULT_START, cleared in MULT_CRUNCH once shreg_done_load is set, unchanged in every other case
+ case (fsm_state)
+ FSM_STATE_MULT_START: fifo_c_rst <= 1'b1;
+ FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ // fifo_t_rst: set in MULT_START, cleared in MULT_CRUNCH once shreg_done_load is set, unchanged in every other case
+ case (fsm_state)
+ FSM_STATE_MULT_START: fifo_t_rst <= 1'b1;
+ FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0;
+ endcase
+
+
/*
* Block Memory Interface
*/
// the very first address
- wire [OPERAND_ADDR_WIDTH:0] bram_addr_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_zero = {OPERAND_ADDR_WIDTH {1'b0}}; // OPERAND_ADDR_WIDTH-bit zero (same width as a_addr)
+ wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_zero = {OPERAND_ADDR_WIDTH+1{1'b0}}; // one bit wider zero (same width as p_addr)
// the very last address
- wire [OPERAND_ADDR_WIDTH:0] bram_addr_last = p_num_words_latch;
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_last = n_num_words_latch; // compared against a_addr by a_addr_done
+ wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_last = p_num_words_latch; // compared against p_addr by p_addr_done
// registers
- reg [OPERAND_ADDR_WIDTH:0] p_addr;
- reg [ 31:0] p_data_in;
- reg p_wren;
+ reg [OPERAND_ADDR_WIDTH - 1 : 0] a_addr; // drives a_bram_addr
+ reg [OPERAND_ADDR_WIDTH : 0] p_addr; // drives p_bram_addr; one bit wider than a_addr
+ reg [ 32 - 1 : 0] p_data_in; // drives p_bram_in
+ reg p_wren; // drives p_bram_wr
// handy values
- wire [OPERAND_ADDR_WIDTH:0] p_addr_next = p_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] a_addr_next = a_addr + 1'b1; // a_addr plus one, truncated to the wire width
+ wire [OPERAND_ADDR_WIDTH : 0] p_addr_next = p_addr + 1'b1; // p_addr plus one, truncated to the wire width
// handy flags
- wire p_addr_done = (p_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
+ wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0; // 1 while a_addr equals bram_addr_last
+ wire p_addr_done = (p_addr == bram_addr_ext_last) ? 1'b1 : 1'b0; // 1 while p_addr equals bram_addr_ext_last
// map top-level ports to internal registers
+ assign a_bram_addr = a_addr; // output port introduced together with the a_addr register
assign p_bram_addr = p_addr;
assign p_bram_in = p_data_in;
assign p_bram_wr = p_wren;
-
- /*
- * Systolic Cycle Counters
- */
-
- // handy values
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
- // counters
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
-
- // handy increment values
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
-
- // handy stop flags
- wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_START,
- FSM_STATE_MULT_RELOAD:
- //
- syst_cnt_load <= syst_cnt_zero;
-
- FSM_STATE_MULT_CRUNCH,
- //
- syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-
- endcase
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_CRUNCH) begin
- //
- if (shreg_done_latency)
- syst_cnt_unload <= syst_cnt_zero;
- else if (shreg_now_unloading)
- syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
- //
- end
-
-
-
- /*
- * Shift Registers
- */
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
- reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
-
- wire shreg_done_load = shreg_load[syst_cnt_last];
- wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
- wire shreg_done_unload = shreg_unload[syst_cnt_last];
-
- reg shreg_now_loading;
- reg shreg_now_latency;
- reg shreg_now_unloading;
-
+ integer j; // loop index over the SYSTOLIC_ARRAY_LENGTH entries of pe_a / pe_b
always @(posedge clk)
//
- case (fsm_state)
+ if (fsm_state == FSM_STATE_MULT_CRUNCH) // pe_a / pe_b are only written while in MULT_CRUNCH
//
- //FSM_STATE_IDLE: begin
- //shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- //shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
- //shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- //end
- //
- FSM_STATE_MULT_START,
- FSM_STATE_MULT_RELOAD: begin
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) // entry j takes bits [32*(j+1)-1 -: 32] of the wide buses
//
- shreg_now_loading <= 1'b1;
- shreg_now_latency <= 1'b1;
- shreg_now_unloading <= 1'b0;
- //
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- //
- end
- //
- FSM_STATE_MULT_CRUNCH: begin
- //
- shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
- shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
- shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
- //
- if (shreg_done_load) shreg_now_loading <= 1'b0;
- if (shreg_done_latency) shreg_now_latency <= 1'b0;
- if (shreg_done_latency) shreg_now_unloading <= 1'b1;
- else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
-
- end
- //
- default: begin
- shreg_now_loading <= 1'b0;
- shreg_now_latency <= 1'b0;
- shreg_now_unloading <= 1'b0;
- end
- //
- endcase
+ if (shreg_now_loading) begin // load phase: capture this entry's operand words
+ pe_a[j] <= (p_addr > {1'b0, a_addr}) ? 32'd0 : pe_a_wide[32 * (j + 1) - 1 -: 32]; // A word is replaced by zero once p_addr exceeds a_addr (a_addr zero-extended to p_addr's width)
+ pe_b[j] <= pe_b_wide[32 * (j + 1) - 1 -: 32]; // B word is always taken from pe_b_wide
+ end else begin // outside the load phase both operand registers are driven to X
+ pe_a[j] <= 32'hXXXXXXXX; // NOTE(review): presumably a don't-care marker to expose stale operand use in simulation -- confirm intent
+ pe_b[j] <= 32'hXXXXXXXX; // NOTE(review): same X assignment as pe_a above -- confirm how the synthesis tool treats it
+ end
+// /*
+// *
+// */
+// always @(posedge clk)
+// //
+// case (fsm_next_state)
+// FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
+// default: p_wren <= 1'b0;
+// endcase
+//
/*
- *
- */
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
- default: p_wren <= 1'b0;
- endcase
-
- /*
- *
+ * Block Memory Address Control
*/
- always @(posedge clk)
+ always @(posedge clk) begin // p_addr is decoded from fsm_state, a_addr from fsm_next_state
//
case (fsm_state)
FSM_STATE_MULT_START: p_addr <= bram_addr_zero;
FSM_STATE_MULT_RELOAD: p_addr <= p_addr_next;
+ endcase
+ // a_addr: cleared when the next state is MULT_START; on MULT_RELOAD it advances by one and then holds once a_addr_done is set
+ case (fsm_next_state)
+ FSM_STATE_MULT_START: a_addr <= bram_addr_zero;
+ FSM_STATE_MULT_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
endcase
+ // NOTE(review): p_addr is [OPERAND_ADDR_WIDTH:0] but is cleared with the narrower bram_addr_zero (zero-extended on assignment); bram_addr_ext_zero is the width-matched constant
+ end
/*
- * Loader Control
+ * Loader Address Control
*/
reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr;
assign loader_addr_rd = loader_addr;
- integer j;
always @(posedge clk)
//
case (fsm_next_state)
-
- FSM_STATE_MULT_START,
+ // next state MULT_START or MULT_RELOAD: restart the loader read address at zero
+ FSM_STATE_MULT_START,
FSM_STATE_MULT_RELOAD:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr <= syst_cnt_zero;
-
+ loader_addr <= syst_cnt_zero;
+ // next state MULT_CRUNCH: follow the load counter (its next value, or its current value once syst_cnt_load_done is set)
FSM_STATE_MULT_CRUNCH:
//
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-
+ loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+ // no default branch: loader_addr keeps its value for any other next state
endcase
@@ -433,7 +466,7 @@ module modexpa7_systolic_multiplier_array #
//
FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_CRUNCH;
+ else fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_RELOAD: if (p_addr_done) fsm_next_state = FSM_STATE_MULT_FINAL;
else fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP;