diff options
Diffstat (limited to 'src/rtl/modexpa7_systolic_multiplier_array.v')
-rw-r--r-- | src/rtl/modexpa7_systolic_multiplier_array.v | 335 |
1 files changed, 184 insertions, 151 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v index 029d9d6..22d5aaf 100644 --- a/src/rtl/modexpa7_systolic_multiplier_array.v +++ b/src/rtl/modexpa7_systolic_multiplier_array.v @@ -42,23 +42,25 @@ module modexpa7_systolic_multiplier_array # parameter SYSTOLIC_ARRAY_POWER = 2 ) ( - input clk, - input rst_n, + input clk, + input rst_n, - input ena, - output rdy, + input ena, + output rdy, output [OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0] loader_addr_rd, - input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide,
- input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide,
+ input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide,
+ input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide,
+
+ output [ OPERAND_ADDR_WIDTH - 1 : 0] a_bram_addr,
- output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr, - output [ 32 - 1 : 0] p_bram_in, - output p_bram_wr,
+ output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr, + output [ 32 - 1 : 0] p_bram_in, + output p_bram_wr,
- input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words, - input [ OPERAND_ADDR_WIDTH : 0] p_num_words + input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words, + input [ OPERAND_ADDR_WIDTH : 0] p_num_words ); @@ -75,7 +77,7 @@ module modexpa7_systolic_multiplier_array # localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; localparam [ 7: 0] FSM_STATE_MULT_START = 8'h11; - localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12; + localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12;
localparam [ 7: 0] FSM_STATE_MULT_RELOAD = 8'h13; localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h14;
@@ -138,6 +140,107 @@ module modexpa7_systolic_multiplier_array # /* + * Systolic Cycle Counters
+ *
+ * syst_cnt_last is n_num_words_latch with its SYSTOLIC_ARRAY_POWER low bits dropped.
+ * syst_cnt_load is cleared whenever the next FSM state is MULT_START or
+ * MULT_RELOAD and, while the next state is MULT_CRUNCH, counts up until it
+ * equals syst_cnt_last, where it holds.
+ * syst_cnt_unload is only updated in MULT_CRUNCH: cleared on shreg_done_latency,
+ * otherwise counts up (holding at syst_cnt_last) while shreg_now_unloading is set.
+ *
+ * NOTE(review): shreg_done_latency and shreg_now_unloading are used here but
+ * declared later, in the Timing Shift Registers section; confirm the target
+ * tools accept this use-before-declaration.
+ */ + + // handy values + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; + + // counters + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; + + // handy increment values + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; + + // handy stop flags + wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; + wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; +
+ always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_MULT_START, + FSM_STATE_MULT_RELOAD: + // + syst_cnt_load <= syst_cnt_zero; + + FSM_STATE_MULT_CRUNCH: + // + syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; + + endcase + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_CRUNCH) begin + // + if (shreg_done_latency)
+ syst_cnt_unload <= syst_cnt_zero; + else if (shreg_now_unloading) + syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
+ // + end +
+
+ /*
+ * Timing Shift Registers
+ *
+ * MULT_START / MULT_RELOAD: shreg_load and shreg_latency are preset to a single
+ * 1 in bit 0 and shreg_unload is cleared; the loading and latency flags are
+ * set and the unloading flag is cleared.
+ * MULT_CRUNCH: all three registers shift left by one bit per clock, and the top
+ * bit of shreg_latency is shifted into bit 0 of shreg_unload.
+ *   shreg_done_load    (shreg_load[syst_cnt_last])          clears shreg_now_loading
+ *   shreg_done_latency (shreg_latency[SYSTOLIC_PE_LATENCY]) clears shreg_now_latency
+ *                                                           and sets shreg_now_unloading
+ *   shreg_done_unload  (shreg_unload[syst_cnt_last])        clears shreg_now_unloading
+ * Any other state: all three shreg_now_* flags are cleared.
+ */
+
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; + reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; + + wire shreg_done_load = shreg_load[syst_cnt_last]; + wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; + wire shreg_done_unload = shreg_unload[syst_cnt_last]; + + reg shreg_now_loading; + reg shreg_now_latency; + reg shreg_now_unloading; + + always @(posedge clk) + // + case (fsm_state)
+ // + FSM_STATE_MULT_START,
+ FSM_STATE_MULT_RELOAD: begin
+ // + shreg_now_loading <= 1'b1; + shreg_now_latency <= 1'b1; + shreg_now_unloading <= 1'b0;
+ // + shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; + shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1}; + shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
+ // + end + // + FSM_STATE_MULT_CRUNCH: begin
+ // + shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+ shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0}; + shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; + // + if (shreg_done_load) shreg_now_loading <= 1'b0; + if (shreg_done_latency) shreg_now_latency <= 1'b0; + if (shreg_done_latency) shreg_now_unloading <= 1'b1; + else if (shreg_done_unload) shreg_now_unloading <= 1'b0; + + end + // + default: begin + shreg_now_loading <= 1'b0; + shreg_now_latency <= 1'b0; + shreg_now_unloading <= 1'b0; + end + // + endcase
+
+
+ /* * Systolic Array of Processing Elements */ reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1]; @@ -215,195 +318,125 @@ module modexpa7_systolic_multiplier_array # assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32]; assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
// - //assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
- // - //always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
- // end
// endgenerate -
+
+ /*
+ * FIFO Reset Logic
+ *
+ * fifo_c_rst and fifo_t_rst are driven identically: set in MULT_START,
+ * cleared in MULT_CRUNCH once shreg_done_load is seen, and left unchanged
+ * in every other state (the case statements have no default branch).
+ */
+ always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_START: fifo_c_rst <= 1'b1; + FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0; + endcase + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_START: fifo_t_rst <= 1'b1; + FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0; + endcase
+
+
/*
* Block Memory Interface
*
* a_addr is OPERAND_ADDR_WIDTH bits wide and its last value is n_num_words_latch;
* p_addr is one bit wider and its last value is p_num_words_latch.
* bram_addr_ext_zero and bram_addr_ext_last are the one-bit-wider counterparts
* of bram_addr_zero and bram_addr_last.
*/
// the very first address (plain width and one-bit-wider "ext" width)
- wire [OPERAND_ADDR_WIDTH:0] bram_addr_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_zero = {OPERAND_ADDR_WIDTH {1'b0}};
+ wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_zero = {OPERAND_ADDR_WIDTH+1{1'b0}};
// the very last address (latched word counts: n for the plain width, p for the "ext" width)
- wire [OPERAND_ADDR_WIDTH:0] bram_addr_last = p_num_words_latch;
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_last = n_num_words_latch;
+ wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_last = p_num_words_latch;
// registers behind the a_bram_addr, p_bram_addr, p_bram_in and p_bram_wr ports
- reg [OPERAND_ADDR_WIDTH:0] p_addr;
- reg [ 31:0] p_data_in;
- reg p_wren;
+ reg [OPERAND_ADDR_WIDTH - 1 : 0] a_addr;
+ reg [OPERAND_ADDR_WIDTH : 0] p_addr;
+ reg [ 32 - 1 : 0] p_data_in;
+ reg p_wren;
// handy values (current address plus one)
- wire [OPERAND_ADDR_WIDTH:0] p_addr_next = p_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH - 1 : 0] a_addr_next = a_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH : 0] p_addr_next = p_addr + 1'b1;
// handy flags (set while the address equals its last value)
- wire p_addr_done = (p_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
+ wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire p_addr_done = (p_addr == bram_addr_ext_last) ? 1'b1 : 1'b0;
// map top-level ports to internal registers
+ assign a_bram_addr = a_addr;
assign p_bram_addr = p_addr;
assign p_bram_in = p_data_in;
assign p_bram_wr = p_wren;
-
- /* - * Systolic Cycle Counters - */ - - // handy values - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; - - // counters - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; - - // handy increment values - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; - - // handy stop flags - wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; - wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; -
- always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_MULT_START, - FSM_STATE_MULT_RELOAD: - // - syst_cnt_load <= syst_cnt_zero; - - FSM_STATE_MULT_CRUNCH, - // - syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; - - endcase - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_CRUNCH) begin - // - if (shreg_done_latency)
- syst_cnt_unload <= syst_cnt_zero; - else if (shreg_now_unloading) - syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
- // - end -
-
-
- /*
- * Shift Registers
- */
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; - reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; - reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; - - wire shreg_done_load = shreg_load[syst_cnt_last]; - wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; - wire shreg_done_unload = shreg_unload[syst_cnt_last]; - - reg shreg_now_loading; - reg shreg_now_latency; - reg shreg_now_unloading; - + integer j;
// PE operand loader (the added lines below): in MULT_CRUNCH, while
// shreg_now_loading is set, every pe_a[j] / pe_b[j] takes 32-bit word j of
// pe_a_wide / pe_b_wide, except that pe_a[j] is forced to zero when
// p_addr > {1'b0, a_addr}; when not loading, both are assigned all-X.
// The removed lines interleaved here are the old copy of the shift-register logic.
always @(posedge clk) // - case (fsm_state)
+ if (fsm_state == FSM_STATE_MULT_CRUNCH) // - //FSM_STATE_IDLE: begin - //shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; - //shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0}; - //shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; - //end - // - FSM_STATE_MULT_START, - FSM_STATE_MULT_RELOAD: begin
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) // - shreg_now_loading <= 1'b1; - shreg_now_latency <= 1'b1; - shreg_now_unloading <= 1'b0;
- // - shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; - shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1}; - shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- // - end - // - FSM_STATE_MULT_CRUNCH: begin
- // - shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0}; - shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0}; - shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; - // - if (shreg_done_load) shreg_now_loading <= 1'b0; - if (shreg_done_latency) shreg_now_latency <= 1'b0; - if (shreg_done_latency) shreg_now_unloading <= 1'b1; - else if (shreg_done_unload) shreg_now_unloading <= 1'b0; - - end - // - default: begin - shreg_now_loading <= 1'b0; - shreg_now_latency <= 1'b0; - shreg_now_unloading <= 1'b0; - end - // - endcase + if (shreg_now_loading) begin + pe_a[j] <= (p_addr > {1'b0, a_addr}) ? 32'd0 : pe_a_wide[32 * (j + 1) - 1 -: 32]; + pe_b[j] <= pe_b_wide[32 * (j + 1) - 1 -: 32]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + end
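+// /*
+// * p_wren (p_bram_wr) strobe control -- disabled: this block is commented out. NOTE(review): no other driver of p_wren is visible in this diff; confirm one exists.
+// */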
+// always @(posedge clk)
+// //
+// case (fsm_next_state)
+// FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
+// default: p_wren <= 1'b0;
+// endcase
+//
/*
- *
- */
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
- default: p_wren <= 1'b0;
- endcase
-
- /*
- *
+ * Block Memory Address Control
+ *
+ * p_addr follows the current state: cleared in MULT_START, incremented in
+ * MULT_RELOAD. a_addr follows the next state: cleared when MULT_START is
+ * next, and incremented when MULT_RELOAD is next until it reaches
+ * bram_addr_last, where it holds.
+ *
+ * NOTE(review): p_addr is OPERAND_ADDR_WIDTH+1 bits wide but is cleared with
+ * bram_addr_zero (OPERAND_ADDR_WIDTH bits) rather than bram_addr_ext_zero;
+ * the assignment zero-extends, so the value is the same, but confirm the
+ * narrower constant is intended.
*/
- always @(posedge clk)
+ always @(posedge clk) begin
//
case (fsm_state) FSM_STATE_MULT_START: p_addr <= bram_addr_zero; FSM_STATE_MULT_RELOAD: p_addr <= p_addr_next; + endcase
+ //
+ case (fsm_next_state) + FSM_STATE_MULT_START: a_addr <= bram_addr_zero; + FSM_STATE_MULT_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr; endcase + // + end /*
- * Loader Control
+ * Loader Address Control
*/
// loader_addr (exported as loader_addr_rd) is updated from fsm_next_state:
// cleared when MULT_START or MULT_RELOAD is next; when MULT_CRUNCH is next it
// takes syst_cnt_load_next until syst_cnt_load_done, then syst_cnt_load.
// The removed lines wrapped the same assignments in a for-loop over j.
reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr; assign loader_addr_rd = loader_addr;
- integer j; always @(posedge clk)
//
case (fsm_next_state)
- - FSM_STATE_MULT_START, + // + FSM_STATE_MULT_START,
FSM_STATE_MULT_RELOAD: - // - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - loader_addr <= syst_cnt_zero; - + loader_addr <= syst_cnt_zero; + // FSM_STATE_MULT_CRUNCH: // - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
- + loader_addr <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+ // endcase @@ -433,7 +466,7 @@ module modexpa7_systolic_multiplier_array # // FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH; FSM_STATE_MULT_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_CRUNCH; + else fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_RELOAD: if (p_addr_done) fsm_next_state = FSM_STATE_MULT_FINAL;
else fsm_next_state = FSM_STATE_MULT_CRUNCH; FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP;
|