aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-10 15:31:25 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-10 15:31:25 +0300
commit71b75290bf2ade9a4022bad93dc80bfb77f87f40 (patch)
treeebd329a15278f6959053496f158c9f81f7b1c1da
parent0da71205b28d07cc832732b28e8893c46fbf6cad (diff)
* made separate file for low-level settings
* turned crazy triple multiplier array into one array with input mux
-rw-r--r--src/rtl/modexpa7_settings.v6
-rw-r--r--src/rtl/modexpa7_systolic_multiplier.v1298
-rw-r--r--src/rtl/pe/modexpa7_adder32.v2
-rw-r--r--src/rtl/pe/modexpa7_primitive_switch.v (renamed from src/rtl/pe/modexpa7_lowlevel_settings.v)1
-rw-r--r--src/rtl/pe/modexpa7_subtractor32.v2
-rw-r--r--src/rtl/pe/modexpa7_systolic_pe.v2
-rw-r--r--src/tb/tb_systolic_multiplier.v3
7 files changed, 670 insertions, 644 deletions
diff --git a/src/rtl/modexpa7_settings.v b/src/rtl/modexpa7_settings.v
new file mode 100644
index 0000000..0ec6978
--- /dev/null
+++ b/src/rtl/modexpa7_settings.v
@@ -0,0 +1,6 @@
+localparam SYSTOLIC_PE_LATENCY = 4; // sizes the shreg_latency shift register in the multiplier; presumably the pipeline depth of one processing element in clock cycles -- TODO confirm
+
+localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER; // width of the systolic cycle counters: operand address bits left after the low SYSTOLIC_ARRAY_POWER bits are dropped
+localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER; // number of processing elements (and loader memories) generated for the array
+localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH; // number of distinct systolic cycle counter values; sizes the load/unload shift registers
+
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index cb1c716..56e7be3 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -40,16 +40,16 @@ module modexpa7_systolic_multiplier #
(
//
// This sets the address widths of memory buffers. Internal data
- // width is 32 bits, so for e.g. 1024-bit operands buffers must store
- // 1024 / 32 = 32 words, and these need 5-bit address bus, because
- // 2 ** 5 = 32.
+ // width is 32 bits, so for e.g. 2048-bit operands buffers must store
+ // 2048 / 32 = 64 words, and these need a 6-bit address bus, because
+ // 2 ** 6 = 64.
//
- parameter OPERAND_ADDR_WIDTH = 5,
+ parameter OPERAND_ADDR_WIDTH = 4,
//
- // This sets the width of the systolic cycle counter. TODO: Explain.
+ // Systolic array length is 2 ** SYSTOLIC_ARRAY_POWER processing elements. TODO: Explain.
//
- parameter SYSTOLIC_ARRAY_POWER = 3
+ parameter SYSTOLIC_ARRAY_POWER = 2
)
(
input clk,
@@ -72,801 +72,819 @@ module modexpa7_systolic_multiplier #
output [ 32-1:0] r_bram_in,
output r_bram_wr,
- input [OPERAND_ADDR_WIDTH-1:0] n_num_words
+ input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
);
-
+
//
- // Constants
+ // Include Settings
//
- localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
- localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER;
- localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH;
-
- localparam SYSTOLIC_PE_LATENCY = 4;
-
+ `include "pe/modexpa7_primitive_switch.v"
+ `include "modexpa7_settings.v"
+
//
// FSM Declaration
//
- localparam [ 3: 0] FSM_STATE_IDLE = 4'd0;
- localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1;
- localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2;
- localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3;
- localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4;
- localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5;
- localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6;
- localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7;
- localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8;
- localparam [ 3: 0] FSM_STATE_STOP = 4'd9;
-
- reg [ 3: 0] fsm_state = FSM_STATE_IDLE;
- reg [ 3: 0] fsm_next_state;
+ localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
-
- //
- // Enable Delay (Trigger)
- //
- reg ena_dly = 1'b0;
- wire ena_trig = ena && !ena_dly;
- always @(posedge clk) ena_dly <= ena;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
+ localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;
-
- //
- // Parameters Latch
- //
- reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;
- always @(posedge clk)
- //
- if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR)
- n_num_words_latch <= n_num_words;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
+ localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
+ localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;
- //
- // Addresses
- //
- localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
+ localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;
+
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
+ localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;
+ localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
//
- // BRAM Addresses
+ // FSM State / Next State
//
- reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg;
- reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg;
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg;
- reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg;
- reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg;
- reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg;
+ reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
+ reg [ 7: 0] fsm_next_state;
- wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr = s_bram_addr_reg;
-
- reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly;
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly;
- reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly;
- reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly;
-
- wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next = b_bram_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next = a_bram_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next = n_bram_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next = s_bram_addr + 1'b1;
+
+ //
+ // Enable Delay and Trigger
+ //
+ reg ena_dly = 1'b0;
- wire b_bram_addr_done =
- (b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ /* delay enable by one clock cycle */
+ always @(posedge clk) ena_dly <= ena;
- wire s_bram_addr_done =
- (s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ /* trigger new operation when enable goes high */
+ wire ena_trig = ena && !ena_dly;
+
- assign b_bram_addr = b_bram_addr_reg;
- assign a_bram_addr = a_bram_addr_reg;
- assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
- assign n_bram_addr = n_bram_addr_reg;
- assign r_bram_addr = r_bram_addr_reg;
+ //
+ // Ready Flag Logic
+ //
+ reg rdy_reg = 1'b1;
+ assign rdy = rdy_reg;
- always @(posedge clk) b_bram_addr_dly <= b_bram_addr;
- always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr;
- always @(posedge clk) n_bram_addr_dly <= n_bram_addr;
- always @(posedge clk) s_bram_addr_dly <= s_bram_addr;
+ always @(posedge clk or negedge rst_n)
+
+ /* reset flag */
+ if (rst_n == 1'b0) rdy_reg <= 1'b1;
+ else begin
+
+ /* clear flag when operation is started */
+ if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
+
+ /* set flag after operation is finished */
+ if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
+
+ end
+
- always @(posedge clk)
//
- case (fsm_next_state)
- FSM_STATE_INIT_ZERO_ADDR: b_bram_addr_reg <= bram_addr_zero;
- FSM_STATE_INIT_NEXT_ADDR: b_bram_addr_reg <= b_bram_addr_next;
- endcase
-
- always @(posedge clk)
- case (fsm_next_state)
- FSM_STATE_SAVE_ZERO_ADDR: s_bram_addr_reg <= bram_addr_zero;
- FSM_STATE_SAVE_NEXT_ADDR: s_bram_addr_reg <= s_bram_addr_next;
- endcase
-
- always @(posedge clk)
+ // Parameters Latch
//
- case (fsm_next_state)
- FSM_STATE_INIT_LAST_ADDR: a_bram_addr_reg <= bram_addr_zero;
- FSM_STATE_PIPE_RELOAD: a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
- endcase
+ reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;
+ /* save number of words in a and b when new operation starts */
always @(posedge clk)
//
- case (fsm_next_state)
- FSM_STATE_INIT_ZERO_ADDR: n_coeff_bram_addr_reg <= bram_addr_zero;
- FSM_STATE_INIT_NEXT_ADDR: n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
- endcase
-
-
-
-
+ if (fsm_next_state == FSM_STATE_LOAD_B_START)
+ ab_num_words_latch <= ab_num_words;
+
+
//
- // Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
+ // Systolic Cycle Counters
//
- wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
-
- reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb;
- reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb;
+
+ /* handy values */
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
- wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next =
- {pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
+ /* counters */
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
+
+ /* handy increment values */
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
+ wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
- wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next =
- {pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};
+ /* handy stop flags */
+ wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0;
+ wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
+ wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
- wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
- wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];
+ /* delayed load counter */
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
+ always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
- always @(posedge clk)
- //
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
- //
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR,
- FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start;
- FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
- pe_latency_ab_lsb : pe_latency_ab_lsb_next;
- endcase
//
- // Buffers
+ // Multiplier Iteration Counter
//
- integer i, j;
-
- reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
- reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
- reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+
+ /* handy values */
+ wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
+ wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
- always @(posedge clk)
+ /* counter */
+ reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;
+
+ /* handy increment value and stop flag */
+ wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
+ wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
+
+
//
- case (fsm_state)
- FSM_STATE_INIT_ZERO_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- b_buf[i][j] <= 32'd0;
-
- FSM_STATE_INIT_NEXT_ADDR,
- FSM_STATE_INIT_LAST_ADDR:
- b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
- endcase
-
- always @(posedge clk)
+ // Initialization Counter Control Logic
+ //
+ always @(posedge clk) begin
//
case (fsm_state)
- FSM_STATE_INIT_ZERO_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- n_coeff_buf[i][j] <= 32'd0;
-
- FSM_STATE_INIT_NEXT_ADDR,
- FSM_STATE_INIT_LAST_ADDR:
- n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero;
+
+ FSM_STATE_LOAD_B_SHIFT,
+ FSM_STATE_LOAD_N_COEFF_SHIFT,
+ FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next;
endcase
-
- always @(posedge clk)
//
case (fsm_state)
- FSM_STATE_INIT_ZERO_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- n_buf[i][j] <= 32'd0;
-
- FSM_STATE_INIT_NEXT_ADDR,
- FSM_STATE_INIT_LAST_ADDR:
- n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero;
+
+ FSM_STATE_LOAD_B_WRITE,
+ FSM_STATE_LOAD_N_COEFF_WRITE,
+ FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
endcase
-
-
-
-
+ //
+ end
//
- // Cycle Counters
+ // Operand Loader
//
- reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab;
- reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_q;
- reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn;
- reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_s;
-
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];
-
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
-
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_half = {1'b0, n_num_words};
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_last = {n_num_words, 1'b1};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
- wire mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
- wire mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0;
- wire mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
- wire mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0;
+ /*
+ * Explain how parallelized loader works here...
+ *
+ */
- wire syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0;
-
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab_next = mult_cnt_ab + 1'b1;
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_q_next = mult_cnt_q + 1'b1;
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn_next = mult_cnt_qn + 1'b1;
- wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_s_next = mult_cnt_s + 1'b1;
+ /* loader banks */
+ localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
+ localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
+ localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
-
+ /* loader input */
+ reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
- always @(posedge clk)
+ /* loader output */
+ wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];
+
+ /* generate parallelized loader */
+
//
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
- //
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR,
- FSM_STATE_PIPE_RELOAD: syst_cnt <= syst_cnt_zero;
- FSM_STATE_PIPE_CRUNCH: syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
- endcase
-
- always @(posedge clk)
+ // Loader currently stores B, N_COEFF and N; it can be coded another way
+ // to initially store B, then AB, then Q. Some memory can be saved that way.
+ // Maybe later...
//
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
- //
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR: mult_cnt_ab <= mult_cnt_zero;
- FSM_STATE_PIPE_RELOAD: mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
- endcase
-
- always @(posedge clk)
+
+ genvar i;
+ generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
//
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
+ begin : gen_bram_1rw_readfirst_loader
//
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR: mult_cnt_q <= mult_cnt_zero;
- FSM_STATE_PIPE_RELOAD: if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
- endcase
+ bram_1rw_readfirst #
+ (
+ .MEM_WIDTH (32),
+ .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2)
+ )
+ bram_loader
+ (
+ .clk (clk),
+ .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}),
+ .a_wr (loader_wren[i]),
+ .a_in (loader_din[i]),
+ .a_out (loader_dout[i])
+ );
+ //
+ end
+ //
+ endgenerate
+
- always @(posedge clk)
//
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
- //
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR: mult_cnt_qn <= mult_cnt_zero;
- FSM_STATE_PIPE_RELOAD: if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
- endcase
-
- always @(posedge clk)
+ // Block Memory Addresses
//
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
- //
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR: mult_cnt_s <= mult_cnt_zero;
- FSM_STATE_PIPE_RELOAD: if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_qn : mult_cnt_s_next;
- endcase
+ /*
+ * Explain why there are two memory sizes.
+ *
+ */
+
+ /* the very first addresses */
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}};
+ wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
+
+ /* the very last addresses */
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch};
+ wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1};
+
+ /* address registers */
+ reg [OPERAND_ADDR_WIDTH-1:0] a_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
+ reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext;
+ reg [OPERAND_ADDR_WIDTH-1:0] q_addr;
+ reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext;
+ /* handy increment values */
+ wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
+
+ /* handy stop flags */
+ wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+ wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+
+ /* delayed B, N_COEFF and N addresses */
+ reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
+ always @(posedge clk) b_addr_dly <= b_addr;
+
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly;
+ always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
+
+ reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly;
+ always @(posedge clk) n_addr_dly <= n_addr;
+
+ /* map registers to top-level ports */
+ assign a_bram_addr = a_addr;
+ assign b_bram_addr = b_addr;
+ assign n_coeff_bram_addr = n_coeff_addr;
+ assign n_bram_addr = n_addr;
+
+
+ //
+ // Memory Address Control Logic
+ //
always @(posedge clk) begin
- syst_cnt_dly[0] <= syst_cnt;
- for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
- syst_cnt_dly[i] <= syst_cnt_dly[i-1];
+ //
+ case (fsm_next_state)
+ FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero;
+ FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero;
+ FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero;
+
+ FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next;
+ FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next;
+ FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
+ endcase
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
+ FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
+ endcase
+ //
end
+
//
- // Systolic Array
+ // Internal Memories
//
- wire [31: 0] mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
- wire [31: 0] mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
- wire [31: 0] mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
- wire [31: 0] mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
+ /* memory inputs */
+ reg [31: 0] ab_data_in;
+ reg [31: 0] q_data_in;
+ reg [31: 0] qn_data_in;
- wire [31: 0] mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
- wire [31: 0] mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
-
- wire [31: 0] mul_ab_a = (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
- reg [31: 0] mul_q_a_int;
- reg [31: 0] mul_q_a;
- reg [31: 0] mul_qn_a_int;
- reg [31: 0] mul_qn_a;
-
- reg [31: 0] t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
- reg [31: 0] c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ /* memory outputs */
+ wire [31: 0] ab_data_out;
+ wire [31: 0] q_data_out;
+ wire [31: 0] qn_data_out;
- reg [31: 0] t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
- reg [31: 0] c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ /* write enables */
+ reg ab_wren;
+ reg q_wren;
+ reg qn_wren;
- reg [31: 0] t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
- reg [31: 0] c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+ bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
- genvar syst;
- generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
- begin : gen_mul
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
- modexpa7_systolic_pe mul_ab_inst
- (
- .clk (clk),
- .a (mul_ab_a),
- .b (b_buf[syst_cnt][syst]),
- .t (t_ab[syst_cnt][syst]),
- .c_in (c_ab_in[syst_cnt][syst]),
-
- .p (mul_ab_p[syst]),
- .c_out (mul_ab_c_out[syst])
- );
-
- modexpa7_systolic_pe mul_q_inst
- (
- .clk (clk),
- .a (mul_q_a),
- .b (n_coeff_buf[syst_cnt][syst]),
- .t (t_q[syst_cnt][syst]),
- .c_in (c_q_in[syst_cnt][syst]),
-
- .p (mul_q_p[syst]),
- .c_out (mul_q_c_out[syst])
- );
-
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
+ bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
- modexpa7_systolic_pe mul_qn_inst
- (
- .clk (clk),
- .a (mul_qn_a),
- .b (n_buf[syst_cnt][syst]),
- .t (t_qn[syst_cnt][syst]),
- .c_in (c_qn_in[syst_cnt][syst]),
-
- .p (mul_qn_p[syst]),
- .c_out (mul_qn_c_out[syst])
- );
-
- end
- endgenerate
//
- // c_ab
- //
- always @(posedge clk)
+ // Wide Operand Loader
//
- case (fsm_state)
-
- FSM_STATE_INIT_LAST_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- c_ab_in[i][j] <= 32'd0;
-
- FSM_STATE_PIPE_CRUNCH:
- if (pe_latency_ab_lsb_done)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
- endcase
+ integer j;
- //
- // c_q
- //
- always @(posedge clk)
- //
- case (fsm_state)
-
- FSM_STATE_INIT_LAST_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- c_q_in[i][j] <= 32'd0;
-
- FSM_STATE_PIPE_CRUNCH:
- if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
- endcase
-
- //
- // c_qn
- //
+ /* shift logic */
always @(posedge clk)
//
case (fsm_state)
-
- FSM_STATE_INIT_LAST_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- c_qn_in[i][j] <= 32'd0;
-
- FSM_STATE_PIPE_CRUNCH:
- if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
- endcase
+ //
+ FSM_STATE_LOAD_B_SHIFT: begin
- //
- // t_ab
- //
- always @(posedge clk)
- //
- case (fsm_state)
+ /* update the rightmost part of loader buffer */
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+
+ /* shift the loader buffer to the left */
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_din[j-1] <= loader_din[j];
+
+ end
+ //
+ FSM_STATE_LOAD_N_COEFF_SHIFT: begin
- FSM_STATE_INIT_LAST_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- t_ab[i][j] <= 32'd0;
-
- FSM_STATE_PIPE_CRUNCH:
- if (pe_latency_ab_lsb_done) begin
- if (syst_cnt_latency > syst_cnt_zero)
- t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
- end
+ /* update the rightmost part of loader buffer */
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
+
+ /* shift the loader buffer to the left */
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_din[j-1] <= loader_din[j];
+
+ end
+ //
+ FSM_STATE_LOAD_N_SHIFT: begin
+
+ /* update the rightmost part of loader buffer */
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
+ /* shift the loader buffer to the left */
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_din[j-1] <= loader_din[j];
+
+ end
+ //
endcase
+
-
- //
- // t_q
- //
+ /* write enable logic */
always @(posedge clk)
//
- case (fsm_state)
+ case (fsm_next_state)
- FSM_STATE_INIT_LAST_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- t_q[i][j] <= 32'd0;
-
- FSM_STATE_PIPE_CRUNCH:
- if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
- if (syst_cnt_latency > syst_cnt_zero)
- t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
- end
-
+ FSM_STATE_LOAD_B_WRITE,
+ FSM_STATE_LOAD_N_COEFF_WRITE,
+ FSM_STATE_LOAD_N_WRITE:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_wren[j] <= 1'b1;
+
+ default:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_wren[j] <= 1'b0;
+
endcase
-
- //
- // t_qn
- //
- always @(posedge clk)
+ /* loader address update logic */
+ always @(posedge clk) begin
//
case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR:
- for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- t_qn[i][j] <= 32'd0;
-
- FSM_STATE_PIPE_CRUNCH:
- if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
- if (syst_cnt_latency > syst_cnt_zero)
- t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
- end
-
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_LOAD_N_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= syst_cnt_zero;
+
+ FSM_STATE_LOAD_B_WRITE,
+ FSM_STATE_LOAD_N_COEFF_WRITE,
+ FSM_STATE_LOAD_N_WRITE:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
+
+ endcase
+ //
+ case (fsm_next_state)
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START,
+ FSM_STATE_MULT_A_B_RELOAD,
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+ FSM_STATE_MULT_Q_N_RELOAD:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= syst_cnt_zero;
+
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
endcase
+ //
+ case (fsm_next_state)
+
+ FSM_STATE_LOAD_B_START,
+ FSM_STATE_MULT_A_B_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
+
+ FSM_STATE_LOAD_N_COEFF_START,
+ FSM_STATE_MULT_AB_N_COEFF_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
+
+ FSM_STATE_LOAD_N_START,
+ FSM_STATE_MULT_Q_N_START:
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
+ endcase
//
- // Latency 2
+ end
+
+
//
- always @(posedge clk)
+ // Systolic Array of Processing Elements
//
- if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
- //
- case (fsm_state)
- FSM_STATE_INIT_LAST_ADDR,
- FSM_STATE_PIPE_RELOAD: pe_latency_ab_msb <= pe_latency_start;
- FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done)
- pe_latency_ab_msb <= pe_latency_ab_msb_done ?
- pe_latency_ab_msb : pe_latency_ab_msb_next;
- endcase
-
+ reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1];
+ reg [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1];
+ wire [31: 0] pe_c_out[0:SYSTOLIC_ARRAY_LENGTH-1];
+
//
- // Adder
+ // These can be turned into a FIFO (maybe later?)...
//
- reg pe_add_ce;
- reg [31: 0] pe_add_a0;
- reg [31: 0] pe_add_a1;
- reg [31: 0] pe_add_a2;
- reg [31: 0] pe_add_b0;
+ reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
+ reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
- reg pe_add_c_in;
- wire [31: 0] pe_add_s;
- wire pe_add_c_out;
+ generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+ begin : modexpa7_systolic_pe_multiplier
+ modexpa7_systolic_pe systolic_pe_inst
+ (
+ .clk (clk),
+ .a (pe_a[i]),
+ .b (pe_b[i]),
+ .t (pe_t[i]),
+ .c_in (pe_c_in[i]),
+ .p (pe_p[i]),
+ .c_out (pe_c_out[i])
+ );
+ end
+ endgenerate
- reg pe_sub_ce;
- reg [31: 0] pe_sub_a0;
- reg [31: 0] pe_sub_b0;
- reg pe_sub_b_in;
- wire [31: 0] pe_sub_d;
- wire pe_sub_b_out;
-
- always @(posedge clk)
- pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
-
- always @(posedge clk)
- pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);
+
+
+ //
+ // Shift Registers
+ //
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
+ reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
+ reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
- always @(posedge clk)
- //
- if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
- pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
+ wire shreg_done_load = shreg_load[syst_cnt_last];
+ wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
+ wire shreg_done_unload = shreg_unload[syst_cnt_last];
- always @(posedge clk)
- //
- if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
- pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
-
+ reg shreg_now_loading;
+ reg shreg_now_latency;
+ reg shreg_now_unloading;
- modexpa7_adder32 pe_add_inst
- (
- .clk (clk),
- .ce (pe_add_ce),
- .a (pe_add_a2),
- .b (pe_add_b0),
- .c_in (pe_add_c_in),
- .s (pe_add_s),
- .c_out (pe_add_c_out)
- );
-
- modexpa7_subtractor32 pe_sub_inst
- (
- .clk (clk),
- .ce (pe_sub_ce),
- .a (pe_sub_a0),
- .b (pe_sub_b0),
- .b_in (pe_sub_b_in),
- .d (pe_sub_d),
- .b_out (pe_sub_b_out)
- );
+ reg shreg_done_latency_dly;
always @(posedge clk)
- //
- if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
- pe_add_a0 <= mul_ab_p[0];
- pe_add_a1 <= pe_add_a0;
- pe_add_a2 <= pe_add_a1;
- end
+ shreg_done_latency_dly <= shreg_done_latency;
always @(posedge clk)
//
- if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- pe_sub_a0 <= pe_add_s;
-
- always @(posedge clk)
- //
- if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- pe_add_b0 <= mul_qn_p[0];
-
- always @(posedge clk)
- //
- if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
-
-
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_INIT_ZERO_ADDR: n_bram_addr_reg <= bram_addr_zero;
- FSM_STATE_INIT_NEXT_ADDR: n_bram_addr_reg <= n_bram_addr_next;
- FSM_STATE_PIPE_RELOAD: begin
- if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
- if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
+ case (fsm_state)
+ // START/RELOAD of any of the three multiplications: raise the loading and latency flags and seed the registers
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START,
+ FSM_STATE_MULT_A_B_RELOAD,
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+ FSM_STATE_MULT_Q_N_RELOAD: begin
+ shreg_now_loading <= 1'b1;
+ shreg_now_latency <= 1'b1;
+ shreg_now_unloading <= 1'b0;
+ shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; // only bit 0 set
+ shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1}; // only bit 0 set
+ shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; // all bits cleared
+ end
+ // CRUNCH of any of the three multiplications: shift every register left by one position per clock
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: begin
+ shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
+ shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
+ shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; // top bit of shreg_latency enters at bit 0
+
+ if (shreg_done_load) shreg_now_loading <= 1'b0;
+ if (shreg_done_latency) shreg_now_latency <= 1'b0;
+ if (shreg_done_latency) shreg_now_unloading <= 1'b1; // unloading begins when the latency bit reaches the top
+ else if (shreg_done_unload) shreg_now_unloading <= 1'b0; // and ends when the unload bit reaches its tap
+
+ end
+ // every other state: clear the three flags; the shift registers are not assigned and keep their value
+ default: begin
+ shreg_now_loading <= 1'b0;
+ shreg_now_latency <= 1'b0;
+ shreg_now_unloading <= 1'b0;
end
+ //
endcase
+
+
+
+ always @(posedge clk) begin // address registers: ab_addr_ext, q_addr, qn_addr_ext
//
- // Ready Flag Logic
- //
- reg rdy_reg = 1'b1;
- assign rdy = rdy_reg;
-
- always @(posedge clk or negedge rst_n)
- //
- if (rst_n == 1'b0) rdy_reg <= 1'b1;
- else begin
- if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
- if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
- end
-
-
- //
- //
- //
- always @(posedge clk)
- //
- if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- mul_q_a_int <= mul_ab_p[0];
-
- always @(posedge clk)
+ case (fsm_state) // keyed on the current state: START zeroes an address, RELOAD advances it
+ FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
+ FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
+ FSM_STATE_MULT_Q_N_START: qn_addr_ext <= bram_addr_ext_zero;
+
+ FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
+ FSM_STATE_MULT_Q_N_RELOAD: qn_addr_ext <= qn_addr_ext_next;
+
+ endcase
//
- if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- mul_qn_a_int <= mul_q_p[0];
-
- always @(posedge clk)
+ case (fsm_next_state) // keyed on the NEXT state: ab_addr_ext is zeroed/advanced again during the AB*N_COEFF phase
+ FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next;
+ endcase
//
- if (fsm_state == FSM_STATE_PIPE_RELOAD)
- mul_q_a <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?..
+ case (fsm_next_state) // keyed on the NEXT state: q_addr is zeroed/advanced again during the Q*N phase
+ FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero;
+ FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr; // holds once q_addr_done is high
+ endcase
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_PIPE_RELOAD)
- mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
-
- //
- // Debug
- //
- //always @(posedge clk) begin
- //
- //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- //$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
- //
- //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
- //$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
- //
- //if (fsm_state == FSM_STATE_PIPE_RELOAD)
- //$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
//
- //if (fsm_state == FSM_STATE_PIPE_RELOAD)
- //$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
- //
- //end
-
+ end
- wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd;
- reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr;
- wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
- reg s_bram_en;
-
- wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd;
- reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr;
- wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
- reg sn_bram_en;
-
- assign s_bram_addr_rd = s_bram_addr;
- assign sn_bram_addr_rd = s_bram_addr;
-
- wire [31: 0] s_bram_din;
- wire [31: 0] s_bram_dout;
-
- wire [31: 0] sn_bram_din;
- wire [31: 0] sn_bram_dout;
-
- assign s_bram_din = pe_add_s;
- assign sn_bram_din = pe_sub_d;
-
- always @(posedge clk)
+ always @(posedge clk) begin // write strobes and write data for ab, q and qn; all three take their data from pe_p[0]
//
- s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
-
- always @(posedge clk)
+ if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin // A*B phase drives the ab write port
+ ab_wren <= shreg_done_latency_dly; // write strobe follows the delayed latency-done flag
+ ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; // X when no write is issued
+ end else begin
+ ab_wren <= 1'b0;
+ ab_data_in <= 32'hXXXXXXXX;
+ end
//
- sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
-
- always @(posedge clk) begin
+ if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin // AB*N_COEFF phase drives the q write port
+ q_wren <= shreg_done_latency_dly;
+ q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+ end else begin
+ q_wren <= 1'b0;
+ q_data_in <= 32'hXXXXXXXX;
+ end
//
- if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
- if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
- end
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin // Q*N phase drives the qn write port
+ qn_wren <= shreg_done_latency_dly;
+ qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
+ end else begin
+ qn_wren <= 1'b0;
+ qn_data_in <= 32'hXXXXXXXX;
+ end
- always @(posedge clk) begin
//
- if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
- if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
end
- bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_s (.clk(clk),
- .a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
- .b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
-
- bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_sn (.clk(clk),
- .a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
- .b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
-
-
- reg r_bram_en;
always @(posedge clk)
//
- case (fsm_state)
- FSM_STATE_SAVE_ZERO_ADDR,
- FSM_STATE_SAVE_NEXT_ADDR: r_bram_en <= 1'b1;
- default: r_bram_en <= 1'b0;
+ case (fsm_next_state) // load counter is keyed on the NEXT state
+ FSM_STATE_MULT_A_B_START,
+ FSM_STATE_MULT_AB_N_COEFF_START,
+ FSM_STATE_MULT_Q_N_START,
+ FSM_STATE_MULT_A_B_RELOAD,
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD,
+ FSM_STATE_MULT_Q_N_RELOAD:
+ // entering START/RELOAD: restart the load counter from zero
+ syst_cnt_load <= syst_cnt_zero;
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH:
+ // entering/staying in CRUNCH: advance until syst_cnt_load_done, then hold
+ syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
+
endcase
+
+ always @(posedge clk)
+ // unload counter: only updated in the three CRUNCH states, holds its value otherwise
+ case (fsm_state)
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: begin
- reg r_bram_wr_reg;
-
- assign r_bram_wr = r_bram_wr_reg;
+ if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; // restart from zero when the latency bit reaches the top
+ else if (shreg_now_unloading)
+ syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; // advance until syst_cnt_unload_done, then hold
+
+ end
+ endcase
always @(posedge clk)
//
- r_bram_wr_reg <= r_bram_en;
-
-
- wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;
-
+ case (fsm_state) // PE outputs are captured only in the three CRUNCH states
+ FSM_STATE_MULT_A_B_CRUNCH,
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
+ FSM_STATE_MULT_Q_N_CRUNCH: begin
- reg [31: 0] r_bram_in_reg;
-
- assign r_bram_in = r_bram_in_reg;
+ if (shreg_now_unloading) // store every PE's c_out in the row selected by syst_cnt_unload
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ pe_c_out_mem[syst_cnt_unload][j] <= pe_c_out[j];
+
+ if (shreg_now_unloading) begin
+
+ for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) // products of PEs 1..N-1 are stored one slot lower (j-1)
+ pe_t_mem[syst_cnt_unload][j-1] <= pe_p[j];
+
+ if (syst_cnt_unload > syst_cnt_zero) // pe_p[0] goes to the last slot of the previous row
+ pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0];
+ else // at row zero: the last slot of row syst_cnt_last is cleared instead
+ pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0;
+
+ end
+ end
+ endcase
- always @(posedge clk)
+
//
- if (r_bram_en)
- r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
-
- always @(posedge clk)
+ // T and C_IN can be moved to a separate code block
+ //
+ always @(posedge clk) begin
+ // A*B phase: the same A word is broadcast to every PE, B comes per-PE from loader_dout
+ if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ // inputs are valid only while shreg_now_loading is set, otherwise they are driven to X
+ if (shreg_now_loading) begin
+ pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out; // zero once ab_addr_ext exceeds a_addr
+ pe_b[j] <= loader_dout[j];
+ pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j]; // zero while a_addr is zero, else stored value
+ pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j]; // zero while a_addr is zero, else stored value
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ pe_t[j] <= 32'hXXXXXXXX;
+ pe_c_in[j] <= 32'hXXXXXXXX;
+ end
+ // AB*N_COEFF phase: A input is ab_data_out, with no zero masking
+ if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ //
+ if (shreg_now_loading) begin
+ pe_a[j] <= ab_data_out;
+ pe_b[j] <= loader_dout[j];
+ pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j]; // zero while ab_addr_ext is zero
+ pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j]; // zero while ab_addr_ext is zero
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ pe_t[j] <= 32'hXXXXXXXX;
+ pe_c_in[j] <= 32'hXXXXXXXX;
+ end
+ // Q*N phase: A input is q_data_out, zeroed once qn_addr_ext exceeds q_addr
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ //
+ for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
+ //
+ if (shreg_now_loading) begin
+ pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
+ pe_b[j] <= loader_dout[j];
+ pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j]; // zero while q_addr is zero
+ pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j]; // zero while q_addr is zero
+ end else begin
+ pe_a[j] <= 32'hXXXXXXXX;
+ pe_b[j] <= 32'hXXXXXXXX;
+ pe_t[j] <= 32'hXXXXXXXX;
+ pe_c_in[j] <= 32'hXXXXXXXX;
+ end
//
- if (r_bram_en)
- r_bram_addr_reg <= s_bram_addr_dly;
-
//
- // FSM Transition Logic
+ end
+
+
+ //
+ // FSM Process: state register, reset to IDLE asynchronously when rst_n is low, otherwise takes fsm_next_state each clock
+ //
always @(posedge clk or negedge rst_n)
//
if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
else fsm_state <= fsm_next_state;
+
+ //
+ // FSM Transition Logic
+ // (combinational; fsm_next_state defaults to STOP for any state not listed in the case below)
always @* begin
//
fsm_next_state = FSM_STATE_STOP;
//
case (fsm_state)
-
- FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
+
+ FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
else fsm_next_state = FSM_STATE_IDLE;
-
- FSM_STATE_INIT_ZERO_ADDR: fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
-
- FSM_STATE_INIT_NEXT_ADDR: if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
- else fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
-
- FSM_STATE_INIT_LAST_ADDR: fsm_next_state = FSM_STATE_PIPE_CRUNCH;
-
- FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) fsm_next_state = pe_latency_ab_msb_done ?
- FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
- else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
-
- FSM_STATE_PIPE_RELOAD: if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
- else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
-
- FSM_STATE_SAVE_ZERO_ADDR: fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
-
- FSM_STATE_SAVE_NEXT_ADDR: if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
- else fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
-
- FSM_STATE_SAVE_LAST_ADDR: fsm_next_state = FSM_STATE_STOP;
-
+ // LOAD_B phase: SHIFT until mult_cnt_done, then WRITE; WRITE loops back to SHIFT until syst_cnt_init_done
+ FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+ FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
+ else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+ FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
+ else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
+ FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
+ // LOAD_N_COEFF phase: same SHIFT/WRITE loop as LOAD_B
+ FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+ FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
+ else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+ FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
+ else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
+ FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START;
+ // LOAD_N phase: same SHIFT/WRITE loop as LOAD_B
+ FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+ FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
+ else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+ FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
+ else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
+ FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START;
+ // MULT_A_B phase: CRUNCH until shreg_done_unload, then RELOAD; RELOAD loops back to CRUNCH until ab_addr_ext_done
+ FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+ FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
+ else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+ FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
+ FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
+ // MULT_AB_N_COEFF phase: same CRUNCH/RELOAD loop, terminated by q_addr_done
+ FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+ FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
+ else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+ FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
+ FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START;
+ // MULT_Q_N phase: same CRUNCH/RELOAD loop, terminated by qn_addr_ext_done; FINAL leads to STOP
+ FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+ FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
+ else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+ FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
+ else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
+ FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_STOP;
+ // STOP always returns to IDLE
FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
-
+
endcase
- end
+ //
+ end
endmodule
diff --git a/src/rtl/pe/modexpa7_adder32.v b/src/rtl/pe/modexpa7_adder32.v
index ad296b1..04f8a18 100644
--- a/src/rtl/pe/modexpa7_adder32.v
+++ b/src/rtl/pe/modexpa7_adder32.v
@@ -51,7 +51,7 @@ module modexpa7_adder32
//
// Include Primitive Selector
//
- `include "modexpa7_lowlevel_settings.v"
+ `include "modexpa7_primitive_switch.v"
//
diff --git a/src/rtl/pe/modexpa7_lowlevel_settings.v b/src/rtl/pe/modexpa7_primitive_switch.v
index 93f5f34..d38069b 100644
--- a/src/rtl/pe/modexpa7_lowlevel_settings.v
+++ b/src/rtl/pe/modexpa7_primitive_switch.v
@@ -12,4 +12,5 @@
`define SUBTRACTOR32_PRIMITIVE subtractor32_generic
`define SYSTOLIC_PE_PRIMITIVE systolic_pe_generic
+
`endif
diff --git a/src/rtl/pe/modexpa7_subtractor32.v b/src/rtl/pe/modexpa7_subtractor32.v
index 75b9c13..a43d670 100644
--- a/src/rtl/pe/modexpa7_subtractor32.v
+++ b/src/rtl/pe/modexpa7_subtractor32.v
@@ -51,7 +51,7 @@ module modexpa7_subtractor32
//
// Include Primitive Selector
//
- `include "modexpa7_lowlevel_settings.v"
+ `include "modexpa7_primitive_switch.v"
//
diff --git a/src/rtl/pe/modexpa7_systolic_pe.v b/src/rtl/pe/modexpa7_systolic_pe.v
index 22e6874..b284134 100644
--- a/src/rtl/pe/modexpa7_systolic_pe.v
+++ b/src/rtl/pe/modexpa7_systolic_pe.v
@@ -51,7 +51,7 @@ module modexpa7_systolic_pe
//
// Include Primitive Selector
//
- `include "modexpa7_lowlevel_settings.v"
+ `include "modexpa7_primitive_switch.v"
//
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index 21e319a..9df492e 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -176,7 +176,7 @@ module tb_systolic_multiplier;
.r_bram_in (core_r_data),
.r_bram_wr (core_r_wren),
- .n_num_words (n_num_words)
+ .ab_num_words (n_num_words)
);
@@ -273,6 +273,7 @@ module tb_systolic_multiplier;
b = ab_modulo; // prepare for next round
+ #1000000;
end
// final step, display results