aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> 2017-08-06 21:46:35 +0300
committer: Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> 2017-08-06 21:46:35 +0300
commit: f96ad01980fc4d0ed40f6ffb0fbb7c2006421c18 (patch)
tree: e9fed452b5f0c9ccbadfd6e9c536b999b3b096f2
parent: 9f77c4f559daf20e8b495e26003178c57da93fe2 (diff)
* Moved systolic processing element array into a separate module.
* Finished top-level wrapper module.
-rw-r--r-- src/rtl/modexpa7_exponentiator.v | 4
-rw-r--r-- src/rtl/modexpa7_systolic_multiplier.v | 424
-rw-r--r-- src/rtl/modexpa7_systolic_multiplier_array.v | 72
-rw-r--r-- src/rtl/modexpa7_systolic_multiplier_fix.v | 1202
-rw-r--r-- src/rtl/modexpa7_systolic_multiplier_old.v | 1260
-rw-r--r-- src/rtl/modexpa7_wrapper.v | 130
-rw-r--r-- src/tb/tb_exponentiator.v | 4
-rw-r--r-- src/tb/tb_systolic_multiplier.v | 4
-rw-r--r-- src/tb/tb_wrapper.v | 123
9 files changed, 638 insertions, 2585 deletions
diff --git a/src/rtl/modexpa7_exponentiator.v b/src/rtl/modexpa7_exponentiator.v
index cda6882..b33360a 100644
--- a/src/rtl/modexpa7_exponentiator.v
+++ b/src/rtl/modexpa7_exponentiator.v
@@ -665,7 +665,7 @@ module modexpa7_exponentiator #
.r_bram_in (pp_data_in),
.r_bram_wr (pp_wren),
- .ab_num_words (m_num_words_latch)
+ .n_num_words (m_num_words_latch)
);
modexpa7_systolic_multiplier #
@@ -695,7 +695,7 @@ module modexpa7_exponentiator #
.r_bram_in (tp_data_in),
.r_bram_wr (tp_wren),
- .ab_num_words (m_num_words_latch)
+ .n_num_words (m_num_words_latch)
);
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index 32ed543..7293998 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -96,14 +96,26 @@ module modexpa7_systolic_multiplier #
localparam [ 7: 0] FSM_STATE_MULT_START = 8'h21;
localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h22;
localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h23;
+
+ localparam [ 7: 0] FSM_STATE_ADD_START = 8'h31; // entered from MULT_FINAL once mult_phase_done is set
+ localparam [ 7: 0] FSM_STATE_ADD_CRUNCH = 8'h32; // repeats until ab_addr_ext_rd_done
+ localparam [ 7: 0] FSM_STATE_ADD_UNLOAD = 8'h33; // single cycle, followed by SUB_UNLOAD
+ localparam [ 7: 0] FSM_STATE_SUB_UNLOAD = 8'h34; // single cycle, followed by ADD_FINAL
+ localparam [ 7: 0] FSM_STATE_ADD_FINAL = 8'h35; // flag_select_s is sampled in this state
+
+ localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h41; // r_addr is reset to zero in this state
+ localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h42; // repeats until s_addr_done
+ localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h43; // single cycle, followed by STOP
localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
+
/*
- * FSM State / Next State
+ * FSM State / Next State / Previous State
*/
reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
reg [ 7: 0] fsm_next_state;
+ reg [ 7: 0] fsm_prev_state;
/*
@@ -153,6 +165,31 @@ module modexpa7_systolic_multiplier #
/*
+ * Multiplication Phase
+ */
+ localparam [ 1: 0] MULT_PHASE_A_B = 2'd1; // array output is written to bram_ab (see ab_wren)
+ localparam [ 1: 0] MULT_PHASE_AB_N_COEFF = 2'd2; // array output is written to bram_q (see q_wren)
+ localparam [ 1: 0] MULT_PHASE_Q_N = 2'd3; // array output is written to bram_qn (see qn_wren)
+ localparam [ 1: 0] MULT_PHASE_STALL = 2'd0; // no multiplication pending; raises mult_phase_done
+
+ reg [ 1: 0] mult_phase; // current phase; has no reset value, loaded on ena_trig below
+
+ wire mult_phase_done = (mult_phase == MULT_PHASE_STALL) ? 1'b1 : 1'b0; // steers MULT_FINAL to ADD_START instead of LOAD_START
+
+ always @(posedge clk)
+ // phase sequencer keyed on the upcoming FSM state: A_B -> AB_N_COEFF -> Q_N -> STALL
+ case (fsm_next_state)
+ FSM_STATE_LOAD_START: if (ena_trig) mult_phase <= MULT_PHASE_A_B; // only a fresh trigger restarts the sequence
+ FSM_STATE_MULT_FINAL:
+ case (mult_phase)
+ MULT_PHASE_A_B: mult_phase <= MULT_PHASE_AB_N_COEFF;
+ MULT_PHASE_AB_N_COEFF: mult_phase <= MULT_PHASE_Q_N;
+ MULT_PHASE_Q_N: mult_phase <= MULT_PHASE_STALL;
+ endcase // STALL has no branch, so the phase holds its value
+ endcase
+
+
+ /*
* Counters
*/
@@ -258,41 +295,130 @@ module modexpa7_systolic_multiplier #
wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {n_num_words_latch};
wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {n_num_words_latch, 1'b1};
- // address registers
+ // address registers
reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_addr; // drives n_bram_addr
wire [OPERAND_ADDR_WIDTH :0] p_addr_ext_wr;
- reg [OPERAND_ADDR_WIDTH :0] p_addr_ext_rd;
+ wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_wr; // bram_ab write address, taken from p_addr_ext_wr
+ reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext_rd; // bram_ab read address
+ wire [OPERAND_ADDR_WIDTH-1:0] q_addr_wr; // bram_q write address, p_addr_ext_wr without its top bit
+ wire [OPERAND_ADDR_WIDTH-1:0] q_addr_rd; // bram_q read address, tied to a_bram_addr
+ wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_wr; // bram_qn write address, taken from p_addr_ext_wr
+ reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd; // bram_qn read address
+ reg [OPERAND_ADDR_WIDTH-1:0] s_addr; // address of the single-port bram_s
+ reg [OPERAND_ADDR_WIDTH-1:0] sn_addr; // address of the single-port bram_sn
+ reg [OPERAND_ADDR_WIDTH-1:0] r_addr; // drives r_bram_addr
// handy increment values
- wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] p_addr_ext_rd_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_rd_next = ab_addr_ext_rd + 1'b1; // extended (one bit wider) counter
+ wire [OPERAND_ADDR_WIDTH-1:0] q_addr_rd_next = q_addr_rd + 1'b1;
+ wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd_next = qn_addr_ext_rd + 1'b1; // extended (one bit wider) counter
+ wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
// write enables
wire p_wren;
+ wire ab_wren; // p_wren gated by MULT_PHASE_A_B
+ wire q_wren; // p_wren gated by MULT_PHASE_AB_N_COEFF
+ wire qn_wren; // p_wren gated by MULT_PHASE_Q_N
+ reg s_wren; // registered, derived from fsm_state
+ reg sn_wren; // registered, follows s_wren one cycle later
+ reg r_wren; // registered, drives r_bram_wr
// data buses
wire [31: 0] p_data_in;
- wire [31: 0] p_data_out;
+ wire [31: 0] ab_data_in; // = p_data_in
+ wire [31: 0] ab_data_out; // feeds the loader and adder input A
+ wire [31: 0] q_data_in; // = p_data_in
+ wire [31: 0] q_data_out; // multiplier A-operand during MULT_PHASE_Q_N
+ wire [31: 0] qn_data_in; // = p_data_in
+ wire [31: 0] qn_data_out; // adder input B
+ wire [31: 0] s_data_in; // adder sum
+ wire [31: 0] s_data_out;
+ wire [31: 0] sn_data_in; // subtractor difference
+ wire [31: 0] sn_data_out;
+ wire [31: 0] r_data_in; // s_data_out or sn_data_out, chosen by flag_select_s
// handy stop flags
- wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire p_addr_ext_rd_done = (p_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0;
+ wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire ab_addr_ext_rd_done = (ab_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0; // compared against the extended last address; ends ADD_CRUNCH
+ wire q_addr_rd_done = (q_addr_rd == bram_addr_last) ? 1'b1 : 1'b0;
+ wire qn_addr_ext_rd_done = (qn_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0; // compared against the extended last address
+ wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0; // ends SAVE_WRITE
+ wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
// delayed addresses
reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly; // n_addr, one clock later
+ reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext_rd_dly; // ab_addr_ext_rd, one clock later
+ reg [OPERAND_ADDR_WIDTH : 0] qn_addr_ext_rd_dly1; // qn_addr_ext_rd, one clock later
+ reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd_dly2; // qn_addr_ext_rd, two clocks later
+ reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd_dly3; // qn_addr_ext_rd, three clocks later
- always @(posedge clk) b_addr_dly <= b_addr;
-
+ always @(posedge clk) b_addr_dly <= b_addr;
+ always @(posedge clk) n_addr_dly <= n_addr;
+ always @(posedge clk) ab_addr_ext_rd_dly <= ab_addr_ext_rd;
+ always @(posedge clk) qn_addr_ext_rd_dly1 <= qn_addr_ext_rd; // first stage of a three-deep delay chain
+ always @(posedge clk) qn_addr_ext_rd_dly2 <= qn_addr_ext_rd_dly1;
+ always @(posedge clk) qn_addr_ext_rd_dly3 <= qn_addr_ext_rd_dly2;
// map registers to top-level ports
assign b_bram_addr = b_addr;
+ assign n_bram_addr = n_addr;
+ assign r_bram_addr = r_addr;
+
+ // fan the single array write port (p_addr_ext_wr, p_data_in, p_wren) out to the AB, Q and QN memories
+ assign ab_addr_ext_wr = p_addr_ext_wr[OPERAND_ADDR_WIDTH :0];
+ assign q_addr_wr = p_addr_ext_wr[OPERAND_ADDR_WIDTH-1:0]; // Q memory is one address bit narrower, top bit dropped
+ assign qn_addr_ext_wr = p_addr_ext_wr[OPERAND_ADDR_WIDTH :0];
+ assign r_bram_wr = r_wren;
+
+ assign ab_data_in = p_data_in;
+ assign q_data_in = p_data_in;
+ assign qn_data_in = p_data_in;
+ assign r_bram_in = r_data_in;
+
+ assign ab_wren = p_wren && (mult_phase == MULT_PHASE_A_B); // the three phase compares are mutually exclusive,
+ assign q_wren = p_wren && (mult_phase == MULT_PHASE_AB_N_COEFF); // so at most one memory is written per cycle
+ assign qn_wren = p_wren && (mult_phase == MULT_PHASE_Q_N);
+
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) // one address bit more than bram_q
+ bram_ab // written only during MULT_PHASE_A_B (see ab_wren)
+ ( .clk(clk),
+ .a_addr(ab_addr_ext_wr), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(), // port A: write side, read output left unconnected
+ .b_addr(ab_addr_ext_rd), .b_out(ab_data_out) // port B: read side
+ );
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_q // written only during MULT_PHASE_AB_N_COEFF (see q_wren)
+ ( .clk(clk),
+ .a_addr(q_addr_wr), .a_wr(q_wren), .a_in(q_data_in), .a_out(), // port A: write side, read output left unconnected
+ .b_addr(q_addr_rd), .b_out(q_data_out) // port B: read side, addressed by a_bram_addr
+ );
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_p
+ bram_qn // written only during MULT_PHASE_Q_N (see qn_wren)
( .clk(clk),
- .a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(),
- .b_addr(p_addr_ext_rd), .b_out(p_data_out)
+ .a_addr(qn_addr_ext_wr), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(), // port A: write side, read output left unconnected
+ .b_addr(qn_addr_ext_rd), .b_out(qn_data_out) // port B: read side
+ );
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_s // single-port storage for the adder sum (s_data_in = add1_s)
+ ( .clk(clk),
+ .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)
+ );
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_sn // single-port storage for the subtractor difference (sn_data_in = sub1_d)
+ ( .clk(clk),
+ .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out)
);
@@ -308,10 +434,24 @@ module modexpa7_systolic_multiplier #
//
FSM_STATE_LOAD_SHIFT: begin
- // update the rightmost part of loader buffer
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+ // update the rightmost part of loader buffer; the data source depends on the multiplication phase
+ case (mult_phase)
- // shift the loader buffer to the left
+ MULT_PHASE_A_B: // source is b_bram_out, zero once b_addr_dly exceeds bram_addr_last
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
+ (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
+
+ MULT_PHASE_AB_N_COEFF: // source is ab_data_out, limited to addresses up to bram_addr_last
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
+ (ab_addr_ext_rd_dly <= {1'b0, bram_addr_last}) ? ab_data_out : {32{1'b0}};
+
+ MULT_PHASE_Q_N: // source is n_bram_out, zero once n_addr_dly exceeds bram_addr_last
+ loader_din[SYSTOLIC_ARRAY_LENGTH-1] <=
+ (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
+
+ endcase
+
+ // shift the loader buffer to the left
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
loader_din[j-1] <= loader_din[j];
@@ -348,16 +488,60 @@ module modexpa7_systolic_multiplier #
loader_addr_wr <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
endcase
-
/*
+ * Result Select Flag (1: result is read from S, 0: result is read from SN)
+ */
+ reg flag_select_s; // updated in FSM_STATE_ADD_FINAL
+
+ assign r_data_in = flag_select_s ? s_data_out : sn_data_out; // word written to the result port r_bram_in
+
+
+ /*
* Memory Address Control Logic
*/
always @(posedge clk) begin
//
- case (fsm_next_state)
- FSM_STATE_LOAD_START: b_addr <= bram_addr_zero;
- FSM_STATE_LOAD_SHIFT: b_addr <= b_addr_next;
+ case (fsm_next_state) // internal read addresses of AB and QN, keyed on the upcoming state
+
+ FSM_STATE_LOAD_START: begin
+ ab_addr_ext_rd <= bram_addr_ext_zero;
+ end
+
+ FSM_STATE_LOAD_SHIFT: begin
+ ab_addr_ext_rd <= ab_addr_ext_rd_next; // advances in every phase, although only AB_N_COEFF loads from ab_data_out
+ end
+
+ FSM_STATE_ADD_START: begin // AB and QN are read in lock-step during the addition
+ ab_addr_ext_rd <= bram_addr_ext_zero;
+ qn_addr_ext_rd <= bram_addr_ext_zero;
+ end
+
+ FSM_STATE_ADD_CRUNCH: begin
+ ab_addr_ext_rd <= ab_addr_ext_rd_next;
+ qn_addr_ext_rd <= qn_addr_ext_rd_next;
+ end
+
+ endcase
+ //
+ case (fsm_next_state) // addresses of the external B and N operand memories
+
+ FSM_STATE_LOAD_START: begin
+ b_addr <= bram_addr_zero;
+ n_addr <= bram_addr_zero;
+ end
+
+ FSM_STATE_LOAD_SHIFT: begin // both advance regardless of which one the current phase loads from
+ b_addr <= b_addr_next;
+ n_addr <= n_addr_next;
+ end
+
+ FSM_STATE_ADD_CRUNCH,
+ FSM_STATE_ADD_UNLOAD: begin // n_addr restarts when the delayed QN address hits bram_addr_last, then counts while it is beyond
+ if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_last}) n_addr <= bram_addr_zero;
+ else if (qn_addr_ext_rd_dly1 > {1'b0, bram_addr_last}) n_addr <= n_addr_next;
+ end
+
endcase
//
end
@@ -378,10 +562,27 @@ module modexpa7_systolic_multiplier #
always @(posedge clk)
//
- case (fsm_next_state)
- FSM_STATE_MULT_START: p_num_words_latch <= {n_num_words_latch, 1'b1};
- endcase
+ if (fsm_next_state == FSM_STATE_MULT_START)
+ // product length handed to the array depends on the phase
+ case (mult_phase)
+ MULT_PHASE_A_B: p_num_words_latch <= {n_num_words_latch, 1'b1}; // same value as bram_addr_ext_last
+ MULT_PHASE_AB_N_COEFF: p_num_words_latch <= {1'b0, n_num_words_latch}; // zero-extended bram_addr_last
+ MULT_PHASE_Q_N: p_num_words_latch <= {n_num_words_latch, 1'b1}; // same value as bram_addr_ext_last
+ endcase
+ assign n_coeff_bram_addr = a_bram_addr; // N_COEFF and Q are read with the same address
+ assign q_addr_rd = a_bram_addr; // that the array issues for operand A
+
+ reg [31: 0] a_data_out; // combinational (assigned in always @*), despite the reg type
+
+ always @*
+ // choose the word broadcast to the array as operand A according to the phase
+ case (mult_phase)
+ MULT_PHASE_A_B: a_data_out = a_bram_out;
+ MULT_PHASE_AB_N_COEFF: a_data_out = n_coeff_bram_out;
+ MULT_PHASE_Q_N: a_data_out = q_data_out;
+ default: a_data_out = {32{1'bX}}; // MULT_PHASE_STALL: value is left undefined (X)
+ endcase
modexpa7_systolic_multiplier_array #
(
@@ -398,7 +599,7 @@ module modexpa7_systolic_multiplier #
.loader_addr_rd (loader_addr_rd),
- .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}),
+ .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_data_out}}),
.pe_b_wide (pe_b_wide),
.a_bram_addr (a_bram_addr),
@@ -411,24 +612,174 @@ module modexpa7_systolic_multiplier #
.p_num_words (p_num_words_latch)
);
+ /*
+ * Adder (32-bit; carry output is fed back to carry input through a mask)
+ */
+
+ reg add1_ce; // clock enable
+ wire [31: 0] add1_s; // sum output
+ wire add1_c_in; // carry input
+ wire [31: 0] add1_a; // A-input
+ wire [31: 0] add1_b; // B-input
+ reg add1_c_in_mask; // flag to not carry anything into the very first word
+ wire add1_c_out; // carry output
+ modexpa7_adder32 add1_inst // adds ab_data_out and qn_data_out (see the add1_a/add1_b mapping)
+ (
+ .clk (clk),
+ .ce (add1_ce),
+ .a (add1_a),
+ .b (add1_b),
+ .c_in (add1_c_in),
+ .s (add1_s),
+ .c_out (add1_c_out)
+ );
+ /*
+ * Subtractor (32-bit; borrow output is fed back to borrow input through a mask)
+ */
+ reg sub1_ce; // clock enable
+ wire [31: 0] sub1_d; // difference output
+ wire sub1_b_in; // borrow input
+ wire [31: 0] sub1_a; // A-input
+ wire [31: 0] sub1_b; // B-input
+ reg sub1_b_in_mask; // flag to not borrow anything from the very first word
+ wire sub1_b_out; // borrow output
+ modexpa7_subtractor32 sub1_inst // subtracts sub1_b from the adder sum (sub1_a = add1_s)
+ (
+ .clk (clk),
+ .ce (sub1_ce),
+ .a (sub1_a),
+ .b (sub1_b),
+ .b_in (sub1_b_in),
+ .d (sub1_d),
+ .b_out (sub1_b_out)
+ );
+
+ // add masking into carry feedback chain
+ assign add1_c_in = add1_c_out & ~add1_c_in_mask;
+ // add masking into borrow feedback chain
+ assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
+
+ // mask carry for the very first words of AB and QN
+ always @(posedge clk)
+ // mask is high for the one cycle that follows FSM_STATE_ADD_START
+ add1_c_in_mask <= (fsm_state == FSM_STATE_ADD_START) ? 1'b1 : 1'b0;
+ // mask borrow for the very first words of S and N
+ always @(posedge clk)
+ // the subtractor runs one cycle behind the adder, so its mask is the adder mask delayed by one clock
+ sub1_b_in_mask <= add1_c_in_mask;
+
+
+ // map adder inputs
+ assign add1_a = ab_data_out;
+ assign add1_b = qn_data_out;
+
+ // map subtractor inputs
+ assign sub1_a = add1_s;
+ assign sub1_b = (qn_addr_ext_rd_dly2 <= {1'b0, bram_addr_last}) ? 32'd0 : n_bram_out; // subtract zero up to bram_addr_last, N words beyond it
+
+ // clock enable
+ always @(posedge clk) begin
+ //
+ case (fsm_state)
+ FSM_STATE_ADD_START,
+ FSM_STATE_ADD_CRUNCH: add1_ce <= 1'b1;
+ default: add1_ce <= 1'b0;
+ endcase
+ // subtractor enable trails the adder enable by one clock
+ sub1_ce <= add1_ce;
+ //
+ end
+
+ // map outputs
+ assign s_data_in = add1_s;
+ assign sn_data_in = sub1_d;
+
+ // write enables for the S, SN and result (R) memories, all registered from fsm_state
+ always @(posedge clk) begin
+ //
+ case (fsm_state)
+ FSM_STATE_ADD_CRUNCH,
+ FSM_STATE_ADD_UNLOAD: s_wren <= 1'b1;
+ default: s_wren <= 1'b0;
+ endcase
+ // SN write enable is s_wren delayed by one clock inside the listed states
+ case (fsm_state)
+ FSM_STATE_ADD_CRUNCH,
+ FSM_STATE_ADD_UNLOAD,
+ FSM_STATE_SUB_UNLOAD,
+ FSM_STATE_ADD_FINAL: sn_wren <= s_wren;
+ default: sn_wren <= 1'b0;
+ endcase
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE: r_wren <= 1'b1;
+ default: r_wren <= 1'b0;
+ endcase
+ //
+ end
+
+ // address control for the S, SN and result (R) memories
+ always @(posedge clk) begin
+ //
+ case (fsm_state)
+ FSM_STATE_ADD_CRUNCH,
+ FSM_STATE_ADD_UNLOAD: begin // reset when the 1-cycle-delayed QN address is zero, count while the 2-cycle-delayed one is beyond bram_addr_last
+ if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_zero}) s_addr <= bram_addr_zero;
+ else if (qn_addr_ext_rd_dly2 > {1'b0, bram_addr_last}) s_addr <= s_addr_next;
+ end
+ FSM_STATE_ADD_FINAL: s_addr <= bram_addr_zero; // rewind before the save phase reads S back
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE: s_addr <= s_addr_next;
+ endcase
+ // same scheme as s_addr, using QN address taps that are one cycle later
+ case (fsm_state)
+ FSM_STATE_ADD_CRUNCH,
+ FSM_STATE_ADD_UNLOAD,
+ FSM_STATE_SUB_UNLOAD: begin
+ if (qn_addr_ext_rd_dly2 == {1'b0, bram_addr_zero}) sn_addr <= bram_addr_zero;
+ else if (qn_addr_ext_rd_dly3 > {1'b0, bram_addr_last}) sn_addr <= sn_addr_next;
+ end
+ FSM_STATE_ADD_FINAL: sn_addr <= bram_addr_zero; // rewind before the save phase reads SN back
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE: sn_addr <= sn_addr_next;
+ endcase
+ // result address starts at zero and advances while writing
+ case (fsm_state)
+ FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
+ FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
+ endcase
+ //
+ end
+
+
+ /*
+ * Flag Update Logic (selects S when the subtraction borrowed and the addition did not carry)
+ */
+ always @(posedge clk)
+ // sampled once per operation, while the FSM is in ADD_FINAL
+ if (fsm_state == FSM_STATE_ADD_FINAL)
+ flag_select_s <= sub1_b_out & ~add1_c_out;
-
-
/*
* FSM Process
- - */
+ */
always @(posedge clk or negedge rst_n)
- //
+ // asynchronous active-low reset returns the FSM to IDLE
if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
- else fsm_state <= fsm_next_state;
+ else fsm_state <= fsm_next_state;
+
+ always @(posedge clk)
+ // one-clock-delayed copy of fsm_state; not affected by rst_n
+ fsm_prev_state <= fsm_state;
/*
@@ -453,7 +804,20 @@ module modexpa7_systolic_multiplier #
FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH;
FSM_STATE_MULT_CRUNCH: if (pe_array_rdy) fsm_next_state = FSM_STATE_MULT_FINAL;
else fsm_next_state = FSM_STATE_MULT_CRUNCH;
- FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP;
+ FSM_STATE_MULT_FINAL: if (mult_phase_done) fsm_next_state = FSM_STATE_ADD_START; // all phases finished: go on to the addition
+ else fsm_next_state = FSM_STATE_LOAD_START; // otherwise reload for the next multiplication phase
+ //
+ FSM_STATE_ADD_START: fsm_next_state = FSM_STATE_ADD_CRUNCH;
+ FSM_STATE_ADD_CRUNCH: if (ab_addr_ext_rd_done) fsm_next_state = FSM_STATE_ADD_UNLOAD; // AB read address reached the extended last address
+ else fsm_next_state = FSM_STATE_ADD_CRUNCH;
+ FSM_STATE_ADD_UNLOAD: fsm_next_state = FSM_STATE_SUB_UNLOAD;
+ FSM_STATE_SUB_UNLOAD: fsm_next_state = FSM_STATE_ADD_FINAL;
+ FSM_STATE_ADD_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
+ //
+ FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
+ FSM_STATE_SAVE_WRITE: if (s_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL; // S address reached bram_addr_last
+ else fsm_next_state = FSM_STATE_SAVE_WRITE;
+ FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
//
FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
//
diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v
index 22d5aaf..754203d 100644
--- a/src/rtl/modexpa7_systolic_multiplier_array.v
+++ b/src/rtl/modexpa7_systolic_multiplier_array.v
@@ -195,11 +195,15 @@ module modexpa7_systolic_multiplier_array #
wire shreg_done_load = shreg_load[syst_cnt_last];
wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
- wire shreg_done_unload = shreg_unload[syst_cnt_last];
-
+ wire shreg_done_unload = shreg_unload[syst_cnt_last];
+
reg shreg_now_loading;
reg shreg_now_latency;
reg shreg_now_unloading;
+
+ reg shreg_done_latency_dly; // shreg_done_latency delayed by one clock; qualifies the p_data_in capture
+ always @(posedge clk)
+ shreg_done_latency_dly <= shreg_done_latency;
always @(posedge clk)
//
@@ -257,17 +261,22 @@ module modexpa7_systolic_multiplier_array #
reg fifo_c_rst;
reg fifo_t_rst;
- wire fifo_c_wren;
+ reg fifo_c_wren; // now registered from shreg_now_unloading
wire fifo_c_rden;
- wire fifo_t_wren;
+ reg fifo_t_wren; // now registered from shreg_now_unloading
wire fifo_t_rden;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din;
+ reg [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din; // now registered per word from pe_c_out
wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout;
wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din;
wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout;
+
+ wire [32 * 1 - 1 : 0] fifo_t_din_msb; // topmost word of fifo_t_din, combinational
+ reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] fifo_t_din_lsb; // all remaining words of fifo_t_din, registered
+
+ assign fifo_t_din = {fifo_t_din_msb, fifo_t_din_lsb};
modexpa7_simple_fifo #
(
@@ -317,10 +326,26 @@ module modexpa7_systolic_multiplier_array #
//
assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
+ //
+ always @(posedge clk)
+ fifo_c_din[32 * (i + 1) - 1 -: 32] <= pe_c_out[i]; // word i of the C FIFO input is the registered pe_c_out[i]
+ //
+ end
+ //
+ endgenerate
+
+ generate for (i=1; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
+ // starts at i=1: pe_p[0] is handled separately through fifo_t_din_msb
+ begin : gen_modexpa7_fifo_t_lsb
+ //
+ always @(posedge clk)
+ fifo_t_din_lsb[32 * i - 1 -: 32] <= pe_p[i]; // pe_p[i] lands in word i-1, i.e. one word position lower
//
end
//
- endgenerate
+ endgenerate
+
+ assign fifo_t_din_msb = shreg_now_unloading ? pe_p[0] : 32'd0; // top word is pe_p[0] while unloading, zero otherwise
/*
@@ -340,6 +365,15 @@ module modexpa7_systolic_multiplier_array #
FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0;
endcase
+ /*
+ * FIFO Read/Write Enables
+ */
+ assign fifo_c_rden = shreg_now_loading; // both FIFOs are read while loading
+ assign fifo_t_rden = shreg_now_loading;
+
+ always @(posedge clk) fifo_c_wren <= shreg_now_unloading; // write enables are registered, one clock after shreg_now_unloading
+ always @(posedge clk) fifo_t_wren <= shreg_now_unloading;
+
/*
* Block Memory Interface
@@ -390,16 +424,22 @@ module modexpa7_systolic_multiplier_array #
-// /*
-// *
-// */
-// always @(posedge clk)
-// //
-// case (fsm_next_state)
-// FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
-// default: p_wren <= 1'b0;
-// endcase
-//
+ /*
+ * Product Write Enable and Write Data
+ */
+ always @(posedge clk)
+ // p_wren is high only in the cycle where the FSM is in MULT_RELOAD
+ case (fsm_next_state)
+ FSM_STATE_MULT_RELOAD: p_wren <= 1'b1;
+ default: p_wren <= 1'b0;
+ endcase
+
+
+ always @(posedge clk)
+ // capture the output word of the first processing element; held otherwise
+ if ((fsm_state == FSM_STATE_MULT_CRUNCH) && shreg_done_latency_dly)
+ p_data_in <= pe_p[0];
+
+
/*
* Block Memory Address Control
*/
diff --git a/src/rtl/modexpa7_systolic_multiplier_fix.v b/src/rtl/modexpa7_systolic_multiplier_fix.v
deleted file mode 100644
index 40b2144..0000000
--- a/src/rtl/modexpa7_systolic_multiplier_fix.v
+++ /dev/null
@@ -1,1202 +0,0 @@
-//======================================================================
-//
-// modexpa7_systolic_multiplier.v
-// -----------------------------------------------------------------------------
-// Systolic Montgomery multiplier.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2017, NORDUnet A/S All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// - Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may
-// be used to endorse or promote products derived from this software
-// without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//======================================================================
-
-module modexpa7_systolic_multiplier #
- (
- //
- // This sets the address widths of memory buffers. Internal data
- // width is 32 bits, so for e.g. 2048-bit operands buffers must store
- // 2048 / 32 = 64 words, and these need 6-bit address bus, because
- // 2 ** 6 = 64.
- //
- parameter OPERAND_ADDR_WIDTH = 4,
-
- //
- // Explain.
- //
- parameter SYSTOLIC_ARRAY_POWER = 1
- )
- (
- input clk,
- input rst_n,
-
- input ena,
- output rdy,
-
- output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,
-
- input [ 32-1:0] a_bram_out,
- input [ 32-1:0] b_bram_out,
- input [ 32-1:0] n_bram_out,
- input [ 32-1:0] n_coeff_bram_out,
-
- output [ 32-1:0] r_bram_in,
- output r_bram_wr,
-
- input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
- );
-
-
- //
- // Include Settings
- //
- `include "pe/modexpa7_primitive_switch.v"
- `include "modexpa7_settings.v"
-
-
- //
- // FSM Declaration
- //
- localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
-
- localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
- localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
- localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
- localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;
-
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;
-
- localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
- localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
- localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
- localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;
-
- localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;
-
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;
-
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;
-
- localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71;
- localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72;
- localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73;
-
- localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
-
- //
- // FSM State / Next State
- //
- reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
- reg [ 7: 0] fsm_next_state;
-
-
- //
- // Enable Delay and Trigger
- //
- reg ena_dly = 1'b0;
-
- /* delay enable by one clock cycle */
- always @(posedge clk) ena_dly <= ena;
-
- /* trigger new operation when enable goes high */
- wire ena_trig = ena && !ena_dly;
-
-
- //
- // Ready Flag Logic
- //
- reg rdy_reg = 1'b1;
- assign rdy = rdy_reg;
-
- always @(posedge clk or negedge rst_n)
-
- /* reset flag */
- if (rst_n == 1'b0) rdy_reg <= 1'b1;
- else begin
-
- /* clear flag when operation is started */
- if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
-
- /* set flag after operation is finished */
- if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
-
- end
-
-
- //
- // Parameters Latch
- //
- reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;
-
- /* save number of words in a and b when new operation starts */
- always @(posedge clk)
- //
- if (fsm_next_state == FSM_STATE_LOAD_B_START)
- ab_num_words_latch <= ab_num_words;
-
-
- //
- // Systolic Cycle Counters
- //
-
- /* handy values */
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
- /* counters */
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
-
- /* handy increment values */
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
-
- /* handy stop flags */
- wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
- /* delayed load counter */
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
- always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
-
-
- //
- // Multiplier Iteration Counter
- //
-
- /* handy values */
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
-
- /* counter */
- reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;
-
- /* handy increment value and stop flag */
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
- wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
-
-
- //
- // Initialization Counter Control Logic
- //
- always @(posedge clk) begin
- //
- case (fsm_state)
- FSM_STATE_LOAD_B_START,
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero;
-
- FSM_STATE_LOAD_B_SHIFT,
- FSM_STATE_LOAD_N_COEFF_SHIFT,
- FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next;
- endcase
- //
- case (fsm_state)
- FSM_STATE_LOAD_B_START,
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero;
-
- FSM_STATE_LOAD_B_WRITE,
- FSM_STATE_LOAD_N_COEFF_WRITE,
- FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
- endcase
- //
- end
-
-
- //
- // Operand Loader
- //
-
- /*
- * Explain how parallelized loader works here...
- *
- */
-
- /* loader banks */
- localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
- localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
- localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;
-
- /* loader input */
- reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
-
- /* loader output */
- wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];
-
- /* generate parallelized loader */
-
- //
- // Loader currently stores B, N_COEFF and N, it can be coded another way
- // to initially store B, then AB, then Q. Some memory can be saved thay way.
- // Maybe later...
- //
-
- genvar i;
- generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
- //
- begin : gen_bram_1rw_readfirst_loader
- //
- bram_1rw_readfirst #
- (
- .MEM_WIDTH (32),
- .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2)
- )
- bram_loader
- (
- .clk (clk),
- .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}),
- .a_wr (loader_wren[i]),
- .a_in (loader_din[i]),
- .a_out (loader_dout[i])
- );
- //
- end
- //
- endgenerate
-
-
- //
- // Block Memory Addresses
- //
-
- /*
- * Explain why there are two memory sizes.
- *
- */
-
- /* the very first addresses */
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}};
- wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
-
- /* the very last addresses */
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch};
- wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1};
-
- /* address registers */
- reg [OPERAND_ADDR_WIDTH-1:0] a_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
- reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext;
- reg [OPERAND_ADDR_WIDTH-1:0] q_addr;
- reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext;
- reg [OPERAND_ADDR_WIDTH-1:0] s_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] sn_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
-
- /* handy increment values */
- wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
-
- /* handy stop flags */
- wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
- wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
- wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
- /* delayed B address */
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
- always @(posedge clk) b_addr_dly <= b_addr;
-
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly;
- always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
-
- reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly;
- always @(posedge clk) n_addr_dly <= n_addr;
-
- /* map registers to top-level ports */
- assign a_bram_addr = a_addr;
- assign b_bram_addr = b_addr;
- assign n_coeff_bram_addr = n_coeff_addr;
- assign n_bram_addr = n_addr;
- assign r_bram_addr = r_addr;
-
-
- //
- // Flag
- //
- reg flag_select_s;
-
-
- //
- // Memory Address Control Logic
- //
- always @(posedge clk) begin
- // Read addresses of the external B, N_COEFF and N memories while the loader
- // is being filled: cleared when a LOAD_*_START state is about to be entered,
- // incremented when a LOAD_*_SHIFT state is about to be entered (this case is
- // keyed on fsm_next_state, not on fsm_state).
- case (fsm_next_state)
- FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero;
- FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero;
- FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero;
-
- FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next;
- FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next;
- FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
- endcase
- // n_addr is driven a second time during MULT_Q_N_RELOAD: it restarts at zero
- // when qn_addr_ext equals the last non-extended operand address and is
- // incremented while qn_addr_ext is beyond that address.
- // NOTE(review): presumably this lines N words up with the subtractor, whose
- // B-input is loaded from n_bram_out -- confirm.
- case (fsm_state)
- FSM_STATE_MULT_Q_N_RELOAD:
- if (qn_addr_ext == {1'b0, bram_addr_last})
- n_addr <= bram_addr_zero;
- else if (qn_addr_ext > {1'b0, bram_addr_last})
- n_addr <= n_addr_next;
-
- endcase
- // Write address of the result memory: cleared in SAVE_START, incremented in
- // every SAVE_WRITE cycle.
- case (fsm_state)
- FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
- FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
- endcase
- // Read address of the external A memory: cleared when MULT_A_B_START is about
- // to be entered, incremented when MULT_A_B_RELOAD is about to be entered and
- // held once a_addr_done is set.
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
- FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
- endcase
- //
- end
-
-
- //
- // Internal Memories
- //
-
- /* memory inputs */
- reg [31: 0] ab_data_in;
- reg [31: 0] q_data_in;
- reg [31: 0] qn_data_in;
- wire [31: 0] s_data_in;
- wire [31: 0] sn_data_in;
- reg [31: 0] r_data_in;
-
- /* memory outputs */
- wire [31: 0] ab_data_out;
- wire [31: 0] q_data_out;
- wire [31: 0] qn_data_out;
- wire [31: 0] s_data_out;
- wire [31: 0] sn_data_out;
-
- /* write enables */
- reg ab_wren;
- reg q_wren;
- reg qn_wren;
- reg s_wren;
- reg sn_wren;
- reg r_wren;
-
- /* map */
- assign r_bram_in = r_data_in;
- assign r_bram_wr = r_wren;
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
-
-
- //
- // Wide Operand Loader
- //
- integer j;
-
- /* shift logic */
- always @(posedge clk)
-     //
-     // All three LOAD_*_SHIFT states do the same thing: the buffer moves one word
-     // towards index 0 and a fresh word enters at the highest index. Only the
-     // source of the fresh word depends on the state. A word is replaced with
-     // zeroes when the delayed read address is beyond the last operand address.
-     //
-     case (fsm_state)
-         //
-         FSM_STATE_LOAD_B_SHIFT,
-         FSM_STATE_LOAD_N_COEFF_SHIFT,
-         FSM_STATE_LOAD_N_SHIFT: begin
-
-             /* move every word one position to the left */
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH-1; j=j+1)
-                 loader_din[j] <= loader_din[j+1];
-
-             /* feed the rightmost position from the memory being loaded */
-             case (fsm_state)
-                 FSM_STATE_LOAD_B_SHIFT:
-                     loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : 32'd0;
-                 FSM_STATE_LOAD_N_COEFF_SHIFT:
-                     loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : 32'd0;
-                 FSM_STATE_LOAD_N_SHIFT:
-                     loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : 32'd0;
-             endcase
-
-         end
-         //
-     endcase
-
-
- /* write enable logic */
- always @(posedge clk)
-     //
-     // The write strobes of all loader banks are raised together whenever one of
-     // the LOAD_*_WRITE states is about to be entered and lowered otherwise.
-     //
-     if ((fsm_next_state == FSM_STATE_LOAD_B_WRITE) ||
-         (fsm_next_state == FSM_STATE_LOAD_N_COEFF_WRITE) ||
-         (fsm_next_state == FSM_STATE_LOAD_N_WRITE))
-         //
-         for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-             loader_wren[j] <= 1'b1;
-     else
-         //
-         for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-             loader_wren[j] <= 1'b0;
-
- /* loader address update logic */
- always @(posedge clk) begin
-     //
-     // Lower address bits while the loader banks are being filled: restart at
-     // zero in every LOAD_*_START state, step to the next word in every
-     // LOAD_*_WRITE state and hold once the init counter has reached its last
-     // value.
-     //
-     case (fsm_state)
-
-         FSM_STATE_LOAD_B_START,
-         FSM_STATE_LOAD_N_COEFF_START,
-         FSM_STATE_LOAD_N_START:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_lsb[j] <= syst_cnt_zero;
-
-         FSM_STATE_LOAD_B_WRITE,
-         FSM_STATE_LOAD_N_COEFF_WRITE,
-         FSM_STATE_LOAD_N_WRITE:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-
-     endcase
-     //
-     // Lower address bits while the loader banks are being read back during the
-     // multiplications: restart at zero when a MULT_*_START or MULT_*_RELOAD state
-     // is about to be entered, then follow the load counter while a MULT_*_CRUNCH
-     // state is about to be entered.
-     //
-     case (fsm_next_state)
-         FSM_STATE_MULT_A_B_START,
-         FSM_STATE_MULT_AB_N_COEFF_START,
-         FSM_STATE_MULT_Q_N_START,
-         FSM_STATE_MULT_A_B_RELOAD,
-         FSM_STATE_MULT_AB_N_COEFF_RELOAD,
-         FSM_STATE_MULT_Q_N_RELOAD:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_lsb[j] <= syst_cnt_zero;
-
-         FSM_STATE_MULT_A_B_CRUNCH,
-         FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-         FSM_STATE_MULT_Q_N_CRUNCH:
-             //
-             // Hold at the load counter's own value once it is done. The hold value
-             // used to be syst_cnt_init, which only gave the right address because
-             // that counter happens to rest at its last value after loading.
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-     endcase
-     //
-     // Upper address bits select the bank (B, N_COEFF or N). The bank is chosen
-     // when the matching LOAD_*_START or MULT_*_START state is about to be
-     // entered and is kept until the next selection.
-     //
-     case (fsm_next_state)
-
-         FSM_STATE_LOAD_B_START,
-         FSM_STATE_MULT_A_B_START:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
-
-         FSM_STATE_LOAD_N_COEFF_START,
-         FSM_STATE_MULT_AB_N_COEFF_START:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
-
-         FSM_STATE_LOAD_N_START,
-         FSM_STATE_MULT_Q_N_START:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
-
-     endcase
-     //
- end
-
-
- //
- // Systolic Array of Processing Elements
- //
- reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [31: 0] pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];
-
-
- //
- // These can be turned into a FIFO (maybe later?)...
- //
- //reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
- //reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-
- reg fifo_c_rst;
- reg fifo_t_rst;
-
- wire fifo_c_wren;
- wire fifo_c_rden;
-
- wire fifo_t_wren;
- wire fifo_t_rden;
-
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout;
-
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout;
-
- /**/
- modexpa7_simple_fifo #
- (
- .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
- .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
- )
- fifo_c
- (
- .clk (clk),
- .rst (fifo_c_rst),
- .wr_en (fifo_c_wren),
- .d_in (fifo_c_din),
- .rd_en (fifo_c_rden),
- .d_out (fifo_c_dout)
- );
-
- modexpa7_simple_fifo #
- (
- .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
- .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
- )
- fifo_t
- (
- .clk (clk),
- .rst (fifo_t_rst),
- .wr_en (fifo_t_wren),
- .d_in (fifo_t_din),
- .rd_en (fifo_t_rden),
- .d_out (fifo_t_dout)
- );
-
- generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
- begin : modexpa7_systolic_pe_multiplier
- modexpa7_systolic_pe systolic_pe_inst
- (
- .clk (clk),
- .a (pe_a[i]),
- .b (pe_b[i]),
- .t (pe_t[i]),
- .c_in (pe_c_in[i]),
- .p (pe_p[i]),
- .c_out (pe_c_out[i])
- );
- assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
- assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
- assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
- always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
- end
- endgenerate
-
-
-
-
-
- //
- // Shift Registers
- //
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
- reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
-
- wire shreg_done_load = shreg_load[syst_cnt_last];
- wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
- wire shreg_done_unload = shreg_unload[syst_cnt_last];
-
- reg shreg_now_loading;
- reg shreg_now_latency;
- reg shreg_now_unloading;
-
- reg shreg_done_latency_dly;
-
- always @(posedge clk)
- shreg_done_latency_dly <= shreg_done_latency;
-
- always @(posedge clk)
- // Sequencer for one pass through the systolic array, built from three shift
- // registers (load, latency, unload) and three "now_*" phase flags.
- case (fsm_state)
- // all three shift registers are cleared right before the first multiplication
- FSM_STATE_LOAD_N_FINAL: begin
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- end
- // start of a pass: a single 1 is placed into bit 0 of the load and latency
- // registers, the unload register is emptied, the loading and latency flags
- // are raised and the unloading flag is lowered
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD: begin
- shreg_now_loading <= 1'b1;
- shreg_now_latency <= 1'b1;
- shreg_now_unloading <= 1'b0;
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- end
- // during a pass: every register shifts left by one bit per cycle, the bit
- // leaving the top of the latency register enters the unload register
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: begin
- shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
- shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
- shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
-
- // phase flags: loading ends on shreg_done_load, the latency phase ends and
- // unloading begins on shreg_done_latency, unloading ends on shreg_done_unload
- // (shreg_done_latency has priority over shreg_done_unload)
- if (shreg_done_load) shreg_now_loading <= 1'b0;
- if (shreg_done_latency) shreg_now_latency <= 1'b0;
- if (shreg_done_latency) shreg_now_unloading <= 1'b1;
- else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
-
- end
- // any other state: only the phase flags are cleared, the shift registers
- // keep their contents
- default: begin
- shreg_now_loading <= 1'b0;
- shreg_now_latency <= 1'b0;
- shreg_now_unloading <= 1'b0;
- end
- //
- endcase
-
-
- always @(posedge clk)
-     //
-     // Both FIFOs share one reset schedule: reset is asserted in every
-     // MULT_*_START state and released in a MULT_*_CRUNCH state once
-     // shreg_done_load is seen. In all other states the resets keep their value.
-     //
-     case (fsm_state)
-
-         FSM_STATE_MULT_A_B_START,
-         FSM_STATE_MULT_AB_N_COEFF_START,
-         FSM_STATE_MULT_Q_N_START: begin
-             fifo_c_rst <= 1'b1;
-             fifo_t_rst <= 1'b1;
-         end
-
-         FSM_STATE_MULT_A_B_CRUNCH,
-         FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-         FSM_STATE_MULT_Q_N_CRUNCH:
-             if (shreg_done_load) begin
-                 fifo_c_rst <= 1'b0;
-                 fifo_t_rst <= 1'b0;
-             end
-
-     endcase
-
-
- reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] pe_p_msb_dly;
-
- always @(posedge clk)
- //
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j];
-
- wire [31: 0] pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;
- assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};
-
-
-
- reg shreg_now_unloading_dly;
- always @(posedge clk)
- shreg_now_unloading_dly <= shreg_now_unloading;
-
- assign fifo_c_wren = shreg_now_unloading_dly;
- assign fifo_c_rden = shreg_now_loading;
-
- assign fifo_t_wren = shreg_now_unloading_dly;
- assign fifo_t_rden = shreg_now_loading;
-
-
-
-
- always @(posedge clk) begin
- // Addresses of the internal AB, Q and QN memories while each of them is being
- // filled: cleared in the matching MULT_*_START state and incremented in the
- // matching MULT_*_RELOAD state. During the Q*N multiplication ab_addr_ext is
- // stepped in parallel with qn_addr_ext.
- case (fsm_state)
- FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
- FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero;
- ab_addr_ext <= bram_addr_ext_zero;
- end
-
- FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
- FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next;
- ab_addr_ext <= ab_addr_ext_next;
- end
- endcase
- // Addresses of the S and SN memories.
- case (fsm_state)
-
- // while Q*N is running: cleared when qn_addr_ext equals the last
- // non-extended address, incremented while qn_addr_ext lies strictly between
- // that address and the last extended address, cleared again when
- // qn_addr_ext equals the last extended address
- FSM_STATE_MULT_Q_N_RELOAD: begin
- if (qn_addr_ext == {1'b0, bram_addr_last}) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
-
- if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
- s_addr <= s_addr_next;
- sn_addr <= sn_addr_next;
- end
-
- if (qn_addr_ext == bram_addr_ext_last) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
-
- end
-
- // while the result is being saved: incremented every cycle and held once
- // the last address has been reached
- FSM_STATE_MULT_Q_N_FINAL,
- FSM_STATE_SAVE_START,
- FSM_STATE_SAVE_WRITE: begin
- s_addr <= !s_addr_done ? s_addr_next : s_addr;
- sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
- end
-
- endcase
-
- // ab_addr_ext during the AB*N_COEFF multiplication: cleared / incremented
- // when the START / RELOAD state is about to be entered. This case is placed
- // after the fsm_state case above, so if both assign ab_addr_ext in the same
- // cycle this assignment is the one that takes effect.
- case (fsm_next_state)
- FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next;
- endcase
- // q_addr during the Q*N multiplication: cleared when the START state is about
- // to be entered, incremented when the RELOAD state is about to be entered and
- // held once q_addr_done is set.
- case (fsm_next_state)
- FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr;
- endcase
-
- //
- end
-
- always @(posedge clk) begin
- // AB memory: during MULT_A_B_CRUNCH the write strobe follows
- // shreg_done_latency_dly and the data word is taken from the output of
- // processing element 0; otherwise the strobe is low and the data is X.
- if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
- ab_wren <= shreg_done_latency_dly;
- ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- ab_wren <= 1'b0;
- ab_data_in <= 32'hXXXXXXXX;
- end
- // Q memory: same scheme, active during MULT_AB_N_COEFF_CRUNCH.
- if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
- q_wren <= shreg_done_latency_dly;
- q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- q_wren <= 1'b0;
- q_data_in <= 32'hXXXXXXXX;
- end
- // QN memory: same scheme, active during MULT_Q_N_CRUNCH.
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
- qn_wren <= shreg_done_latency_dly;
- qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- qn_wren <= 1'b0;
- qn_data_in <= 32'hXXXXXXXX;
- end
- // Result memory: the write strobe is raised in SAVE_START, stays high in
- // SAVE_WRITE until r_addr_done is set and is low in every other state.
- case (fsm_state)
- FSM_STATE_SAVE_START: r_wren <= 1'b1;
- FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done;
- default: r_wren <= 1'b0;
- endcase
- //
- end
-
-
- always @(posedge clk)
-     //
-     // Load counter: restarted when a MULT_*_START or MULT_*_RELOAD state is about
-     // to be entered, advanced (up to its last value) when a MULT_*_CRUNCH state
-     // is about to be entered, untouched otherwise.
-     //
-     if ((fsm_next_state == FSM_STATE_MULT_A_B_START) ||
-         (fsm_next_state == FSM_STATE_MULT_AB_N_COEFF_START) ||
-         (fsm_next_state == FSM_STATE_MULT_Q_N_START) ||
-         (fsm_next_state == FSM_STATE_MULT_A_B_RELOAD) ||
-         (fsm_next_state == FSM_STATE_MULT_AB_N_COEFF_RELOAD) ||
-         (fsm_next_state == FSM_STATE_MULT_Q_N_RELOAD))
-         //
-         syst_cnt_load <= syst_cnt_zero;
-     //
-     else if ((fsm_next_state == FSM_STATE_MULT_A_B_CRUNCH) ||
-              (fsm_next_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) ||
-              (fsm_next_state == FSM_STATE_MULT_Q_N_CRUNCH))
-         //
-         syst_cnt_load <= syst_cnt_load_done ? syst_cnt_load : syst_cnt_load_next;
-
-
-
- always @(posedge clk)
-     //
-     // Unload counter, only active in the three MULT_*_CRUNCH states: restarted
-     // when shreg_done_latency is seen, then advanced (up to its last value)
-     // for as long as the unloading flag is set.
-     //
-     if ((fsm_state == FSM_STATE_MULT_A_B_CRUNCH) ||
-         (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) ||
-         (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)) begin
-
-         if (shreg_done_latency)
-             syst_cnt_unload <= syst_cnt_zero;
-         else if (shreg_now_unloading)
-             syst_cnt_unload <= syst_cnt_unload_done ? syst_cnt_unload : syst_cnt_unload_next;
-
-     end
-
-
- //
- // T and C_IN can be moved to a separate code block
- //
- always @(posedge clk)
-     //
-     // A and B inputs of the processing elements. They are only driven in the
-     // three MULT_*_CRUNCH states: while the loading flag is set every element
-     // receives the same A word and its own B word from the loader, otherwise
-     // both inputs are set to X. The T and C_IN inputs are not handled here,
-     // they are wired to the FIFO outputs.
-     //
-     case (fsm_state)
-
-         FSM_STATE_MULT_A_B_CRUNCH,
-         FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
-         FSM_STATE_MULT_Q_N_CRUNCH:
-             //
-             for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
-                 //
-                 if (shreg_now_loading) begin
-
-                     /* the A word depends on which product is being calculated */
-                     case (fsm_state)
-                         FSM_STATE_MULT_A_B_CRUNCH:
-                             pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
-                         FSM_STATE_MULT_AB_N_COEFF_CRUNCH:
-                             pe_a[j] <= ab_data_out;
-                         FSM_STATE_MULT_Q_N_CRUNCH:
-                             pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
-                     endcase
-
-                     /* the B word always comes from the loader bank of this element */
-                     pe_b[j] <= loader_dout[j];
-
-                 end else begin
-                     pe_a[j] <= 32'hXXXXXXXX;
-                     pe_b[j] <= 32'hXXXXXXXX;
-                 end
-
-     endcase
-
-
- //
- // Adder
- //
- /*
- * This adder is used to calculate S = AB + QN.
- *
- */
- reg add1_ce; // clock enable
- reg [31: 0] add1_s; // sum output
- wire add1_c_in; // carry input
- wire [31: 0] add1_a; // A-input
- reg [31: 0] add1_b; // B-input
- reg add1_c_in_mask; // flag to not carry anything into the very first word
- reg add1_c_out; // carry output
-
- /* add masking into carry feedback chain */
- assign add1_c_in = add1_c_out & ~add1_c_in_mask;
-
- /* mask carry for the very first word of N */
- //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
-
- always @(posedge clk)
- //
- if (add1_ce)
- //
- {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
-
- assign add1_a = qn_data_in;
-
- always @(posedge clk)
-     //
-     // Adder control registers, all qualified by the same condition. During
-     // MULT_Q_N_CRUNCH the clock enable follows shreg_done_latency_dly, the
-     // B-input is loaded from the AB memory and the carry mask is raised for the
-     // word at extended address zero. In every other state the adder is
-     // disabled and its B-input is set to X.
-     //
-     if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
-         add1_b         <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
-         add1_c_in_mask <= shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero);
-         add1_ce        <= shreg_done_latency_dly;
-     end else begin
-         add1_b         <= 32'hXXXXXXXX;
-         add1_c_in_mask <= 1'b0;
-         add1_ce        <= 1'b0;
-     end
-
-
- //
- // Subtractor
- //
- /*
-  * This subtractor is used to calculate SN = S - N.
-  *
-  * The declarations below are placed ahead of the S / SN memory hookup, because
-  * the hookup refers to sub1_d and sub1_ce and identifiers have to be declared
-  * before they are used.
-  *
-  */
- reg sub1_ce; // clock enable
- reg [31: 0] sub1_d; // difference output
- wire sub1_b_in; // borrow input
- wire [31: 0] sub1_a; // A-input
- reg [31: 0] sub1_b; // B-input
- reg sub1_b_in_mask; // flag to not borrow anything from the very first word
- reg sub1_b_out; // borrow output
-
- /* add masking into borrow feedback chain */
- assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-
- /* one word of the difference per enabled cycle, borrow kept for the next word */
- always @(posedge clk)
-     //
-     if (sub1_ce)
-         //
-         {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
-
- /* the minuend is the word that has just left the adder */
- assign sub1_a = add1_s;
-
- /* the subtrahend is read from the external N memory */
- always @(posedge clk)
-     //
-     if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-         sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
-     else
-         sub1_b <= 32'hXXXXXXXX;
-
- /* no borrow is taken into the word that follows the last non-extended address */
- always @(posedge clk)
-     //
-     if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-         sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
-     else
-         sub1_b_in_mask <= 1'b0;
-
- /* the subtractor runs one cycle behind the adder, only for qn_addr_ext beyond q_addr */
- always @(posedge clk)
-     //
-     if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
-         sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
-     else
-         sub1_ce <= 1'b0;
-
-
- //
- // S and SN Memory Hookup
- //
- /*
-  * Each signal is driven from exactly one place: s_data_in and s_wren used to
-  * be driven a second time by duplicated code after the subtractor.
-  *
-  */
- assign s_data_in = add1_s;
- assign sn_data_in = sub1_d;
-
- /* write strobes are the clock enables delayed by one cycle */
- always @(posedge clk) begin
-     //
-     s_wren <= add1_ce;
-     sn_wren <= sub1_ce;
- end
-
-
-
- always @(posedge clk) begin
-     //
-     // The selection flag is captured in MULT_Q_N_FINAL: it is set when the
-     // subtractor ended with a borrow while the adder ended without a carry.
-     //
-     if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
-         flag_select_s <= sub1_b_out & ~add1_c_out;
-     //
-     // While the result is being saved, the flag picks the word that goes to
-     // the result memory: S when it is set, SN when it is clear.
-     //
-     if ((fsm_state == FSM_STATE_SAVE_START) || (fsm_state == FSM_STATE_SAVE_WRITE))
-         r_data_in <= flag_select_s ? s_data_out : sn_data_out;
-     //
- end
-
-
-
- //
- // FSM Process
- //
- always @(posedge clk or negedge rst_n)
-     //
-     // state register with asynchronous active-low reset
-     //
-     if (!rst_n)
-         fsm_state <= FSM_STATE_IDLE;
-     else
-         fsm_state <= fsm_next_state;
-
-
- //
- // FSM Transition Logic
- //
- always @* begin
- // Combinational next-state function. Any state value that is not listed in
- // the case statement below falls back to FSM_STATE_STOP.
- fsm_next_state = FSM_STATE_STOP;
- //
- case (fsm_state)
-
- // wait for the enable trigger
- FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
- else fsm_next_state = FSM_STATE_IDLE;
- // load B: SHIFT repeats until mult_cnt_done, then one WRITE; the SHIFT/WRITE
- // pair repeats until syst_cnt_init_done
- FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
- // load N_COEFF: same sequence as for B
- FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START;
- // load N: same sequence as for B
- FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START;
- // A * B: CRUNCH lasts until shreg_done_unload, then one RELOAD; the
- // CRUNCH/RELOAD pair repeats until ab_addr_ext_done
- FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
- else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
- // AB * N_COEFF: same sequence, the CRUNCH/RELOAD pair repeats until q_addr_done
- FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
- else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START;
- // Q * N: same sequence, the CRUNCH/RELOAD pair repeats until qn_addr_ext_done
- FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
- else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
- // save the result: WRITE repeats until r_addr_done
- FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
- FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
- else fsm_next_state = FSM_STATE_SAVE_WRITE;
- FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
- // STOP lasts one cycle, then the machine returns to IDLE
- FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
-
- endcase
- //
- end
-
-
-endmodule
-
-//======================================================================
-// End of file
-//======================================================================
diff --git a/src/rtl/modexpa7_systolic_multiplier_old.v b/src/rtl/modexpa7_systolic_multiplier_old.v
deleted file mode 100644
index 8b00370..0000000
--- a/src/rtl/modexpa7_systolic_multiplier_old.v
+++ /dev/null
@@ -1,1260 +0,0 @@
-//======================================================================
-//
-// modexpa7_systolic_multiplier.v
-// -----------------------------------------------------------------------------
-// Systolic Montgomery multiplier.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2017, NORDUnet A/S All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-// - Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may
-// be used to endorse or promote products derived from this software
-// without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-//======================================================================
-
-module modexpa7_systolic_multiplier #
- (
- //
- // This sets the address widths of memory buffers. Internal data
- // width is 32 bits, so for e.g. 2048-bit operands buffers must store
- // 2048 / 32 = 64 words, and these need 6-bit address bus, because
- // 2 ** 6 = 64.
- //
- parameter OPERAND_ADDR_WIDTH = 4,
-
- //
- // Explain.
- //
- parameter SYSTOLIC_ARRAY_POWER = 1
- )
- (
- input clk,
- input rst_n,
-
- input ena,
- output rdy,
-
- output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,
-
- input [ 32-1:0] a_bram_out,
- input [ 32-1:0] b_bram_out,
- input [ 32-1:0] n_bram_out,
- input [ 32-1:0] n_coeff_bram_out,
-
- output [ 32-1:0] r_bram_in,
- output r_bram_wr,
-
- input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
- );
-
-
- //
- // Include Settings
- //
- `include "pe/modexpa7_primitive_switch.v"
- `include "modexpa7_settings.v"
-
-
- //
- // FSM Declaration
- //
- localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
-
- localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
- localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
- localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
- localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;
-
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
- localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;
-
- localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
- localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
- localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
- localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;
-
- localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
- localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;
-
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
- localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;
-
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_ADD_S = 8'h63;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_SUB_SN = 8'h64;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h65;
- localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h66;
-
- localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71;
- localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72;
- localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73;
-
- localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
-
- //
- // FSM State / Next State
- //
- reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
- reg [ 7: 0] fsm_next_state;
-
-
- //
- // Enable Delay and Trigger
- //
- reg ena_dly = 1'b0;
-
- /* delay enable by one clock cycle */
- always @(posedge clk) ena_dly <= ena;
-
- /* trigger new operation when enable goes high */
- wire ena_trig = ena && !ena_dly;
-
-
- //
- // Ready Flag Logic
- //
- reg rdy_reg = 1'b1;
- assign rdy = rdy_reg;
-
- always @(posedge clk or negedge rst_n)
-     //
-     if (!rst_n)
-         /* the core reports ready while in reset */
-         rdy_reg <= 1'b1;
-     else
-         //
-         case (fsm_state)
-             /* drop the flag in the cycle the enable trigger is seen */
-             FSM_STATE_IDLE: rdy_reg <= ~ena_trig;
-             /* raise the flag again when the operation has finished */
-             FSM_STATE_STOP: rdy_reg <= 1'b1;
-         endcase
-
-
- //
- // Parameters Latch
- //
- reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;
-
- /* save number of words in a and b when new operation starts */
- always @(posedge clk)
- //
- if (fsm_next_state == FSM_STATE_LOAD_B_START)
- ab_num_words_latch <= ab_num_words;
-
-
- //
- // Systolic Cycle Counters
- //
-
- /* handy values */
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
-
- /* counters */
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
-
- /* handy increment values */
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
- wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
-
- /* handy stop flags */
- wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0;
- wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;
-
- /* delayed load counter */
- reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
- always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;
-
-
- //
- // Multiplier Iteration Counter
- //
-
- /* handy values */
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
-
- /* counter */
- reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;
-
- /* handy increment value and stop flag */
- wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
- wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;
-
-
- //
- // Initialization Counter Control Logic
- //
- /* word counter within one loader write: cleared in LOAD_*_START, advanced in LOAD_*_SHIFT */
- always @(posedge clk)
-     //
-     if ((fsm_state == FSM_STATE_LOAD_B_START) ||
-         (fsm_state == FSM_STATE_LOAD_N_COEFF_START) ||
-         (fsm_state == FSM_STATE_LOAD_N_START))
-         //
-         mult_cnt <= mult_cnt_zero;
-     //
-     else if ((fsm_state == FSM_STATE_LOAD_B_SHIFT) ||
-              (fsm_state == FSM_STATE_LOAD_N_COEFF_SHIFT) ||
-              (fsm_state == FSM_STATE_LOAD_N_SHIFT))
-         //
-         mult_cnt <= mult_cnt_next;
-
- /* loader write counter: cleared in LOAD_*_START, advanced in LOAD_*_WRITE up to its last value */
- always @(posedge clk)
-     //
-     if ((fsm_state == FSM_STATE_LOAD_B_START) ||
-         (fsm_state == FSM_STATE_LOAD_N_COEFF_START) ||
-         (fsm_state == FSM_STATE_LOAD_N_START))
-         //
-         syst_cnt_init <= syst_cnt_zero;
-     //
-     else if ((fsm_state == FSM_STATE_LOAD_B_WRITE) ||
-              (fsm_state == FSM_STATE_LOAD_N_COEFF_WRITE) ||
-              (fsm_state == FSM_STATE_LOAD_N_WRITE))
-         //
-         syst_cnt_init <= syst_cnt_init_done ? syst_cnt_init : syst_cnt_init_next;
-
-
- //
- // Operand Loader
- //
-
- /*
- * Explain how parallelized loader works here...
- *
- */
-
- /* loader banks */
- localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
- localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
- localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;
-
- /* loader input */
- reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
- reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
-
- /* loader output */
- wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];
-
- /* generate parallelized loader */
-
- //
- // Loader currently stores B, N_COEFF and N, it can be coded another way
- // to initially store B, then AB, then Q. Some memory can be saved that way.
- // Maybe later...
- //
-
- genvar i;
- generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
- //
- begin : gen_bram_1rw_readfirst_loader
- //
- bram_1rw_readfirst #
- (
- .MEM_WIDTH (32),
- .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2)
- )
- bram_loader
- (
- .clk (clk),
- .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}),
- .a_wr (loader_wren[i]),
- .a_in (loader_din[i]),
- .a_out (loader_dout[i])
- );
- //
- end
- //
- endgenerate
-
-
- //
- // Block Memory Addresses
- //
-
- /*
- * Explain why there are two memory sizes.
- *
- */
-
- /* the very first addresses */
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}};
- wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
-
- /* the very last addresses */
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch};
- wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1};
-
- /* address registers */
- reg [OPERAND_ADDR_WIDTH-1:0] a_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
- reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext;
- reg [OPERAND_ADDR_WIDTH-1:0] q_addr;
- reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext;
- reg [OPERAND_ADDR_WIDTH-1:0] s_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] sn_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
-
- /* handy increment values */
- wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
-
- /* handy stop flags */
- wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
- wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
- wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
- /* delayed B address */
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
- always @(posedge clk) b_addr_dly <= b_addr;
-
- reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly;
- always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr;
-
- reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly;
- always @(posedge clk) n_addr_dly <= n_addr;
-
- /* map registers to top-level ports */
- assign a_bram_addr = a_addr;
- assign b_bram_addr = b_addr;
- assign n_coeff_bram_addr = n_coeff_addr;
- assign n_bram_addr = n_addr;
- assign r_bram_addr = r_addr;
-
-
- //
- // Flag
- //
- reg flag_select_s;
-
-
- //
- // Memory Address Control Logic
- //
- always @(posedge clk) begin
- //
- case (fsm_next_state)
- FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero;
- FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero;
- FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero;
-
- FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next;
- FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next;
- FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
- endcase
- //
- case (fsm_state)
- FSM_STATE_MULT_Q_N_RELOAD:
- if (qn_addr_ext == {1'b0, bram_addr_last})
- n_addr <= bram_addr_zero;
- else if (qn_addr_ext > {1'b0, bram_addr_last})
- n_addr <= n_addr_next;
-
- endcase
- //
- case (fsm_state)
- FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
- FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
- endcase
- //
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
- FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
- endcase
- //
- end
-
-
- //
- // Internal Memories
- //
-
- /* memory inputs */
- reg [31: 0] ab_data_in;
- reg [31: 0] q_data_in;
- reg [31: 0] qn_data_in;
- wire [31: 0] s_data_in;
- wire [31: 0] sn_data_in;
- reg [31: 0] r_data_in;
-
- /* memory outputs */
- wire [31: 0] ab_data_out;
- wire [31: 0] q_data_out;
- wire [31: 0] qn_data_out;
- wire [31: 0] s_data_out;
- wire [31: 0] sn_data_out;
-
- /* write enables */
- reg ab_wren;
- reg q_wren;
- reg qn_wren;
- reg s_wren;
- reg sn_wren;
- reg r_wren;
-
- /* map */
- assign r_bram_in = r_data_in;
- assign r_bram_wr = r_wren;
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
- bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
-
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
-
-
- //
- // Wide Operand Loader
- //
- integer j;
-
- /* shift logic */
- always @(posedge clk)
- //
- case (fsm_state)
- //
- FSM_STATE_LOAD_B_SHIFT: begin
-
- /* update the rightmost part of loader buffer */
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
-
- /* shift the loader buffer to the left */
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_din[j-1] <= loader_din[j];
-
- end
- //
- FSM_STATE_LOAD_N_COEFF_SHIFT: begin
-
- /* update the rightmost part of loader buffer */
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
-
- /* shift the loader buffer to the left */
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_din[j-1] <= loader_din[j];
-
- end
- //
- FSM_STATE_LOAD_N_SHIFT: begin
-
- /* update the rightmost part of loader buffer */
- loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
-
- /* shift the loader buffer to the left */
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_din[j-1] <= loader_din[j];
-
- end
- //
- endcase
-
-
- /* write enable logic */
- always @(posedge clk)
- //
- case (fsm_next_state)
-
- FSM_STATE_LOAD_B_WRITE,
- FSM_STATE_LOAD_N_COEFF_WRITE,
- FSM_STATE_LOAD_N_WRITE:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_wren[j] <= 1'b1;
-
- default:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_wren[j] <= 1'b0;
-
- endcase
-
- /* loader address update logic */
- always @(posedge clk) begin
- //
- case (fsm_state)
-
- FSM_STATE_LOAD_B_START,
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_LOAD_N_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= syst_cnt_zero;
-
- FSM_STATE_LOAD_B_WRITE,
- FSM_STATE_LOAD_N_COEFF_WRITE,
- FSM_STATE_LOAD_N_WRITE:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
-
- endcase
- //
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= syst_cnt_zero;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init;
- endcase
- //
- case (fsm_next_state)
-
- FSM_STATE_LOAD_B_START,
- FSM_STATE_MULT_A_B_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
-
- FSM_STATE_LOAD_N_COEFF_START,
- FSM_STATE_MULT_AB_N_COEFF_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
-
- FSM_STATE_LOAD_N_START,
- FSM_STATE_MULT_Q_N_START:
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
-
- endcase
- //
- end
-
-
- //
- // Systolic Array of Processing Elements
- //
- reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1];
- wire [31: 0] pe_c_out [0:SYSTOLIC_ARRAY_LENGTH-1];
- reg [31: 0] pe_c_out_dly[0:SYSTOLIC_ARRAY_LENGTH-1];
-
-
- //
- // These can be turned into a FIFO (maybe later?)...
- //
- //reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
- //reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1];
-
- reg fifo_c_rst;
- reg fifo_t_rst;
-
- wire fifo_c_wren;
- wire fifo_c_rden;
-
- wire fifo_t_wren;
- wire fifo_t_rden;
-
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout;
-
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din;
- wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout;
-
- /**/
- modexpa7_simple_fifo #
- (
- .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
- .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
- )
- fifo_c
- (
- .clk (clk),
- .rst (fifo_c_rst),
- .wr_en (fifo_c_wren),
- .d_in (fifo_c_din),
- .rd_en (fifo_c_rden),
- .d_out (fifo_c_dout)
- );
-
- modexpa7_simple_fifo #
- (
- .BUS_WIDTH (32 * SYSTOLIC_ARRAY_LENGTH),
- .DEPTH_BITS (SYSTOLIC_CNTR_WIDTH)
- )
- fifo_t
- (
- .clk (clk),
- .rst (fifo_t_rst),
- .wr_en (fifo_t_wren),
- .d_in (fifo_t_din),
- .rd_en (fifo_t_rden),
- .d_out (fifo_t_dout)
- );
-
- generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
- begin : modexpa7_systolic_pe_multiplier
- modexpa7_systolic_pe systolic_pe_inst
- (
- .clk (clk),
- .a (pe_a[i]),
- .b (pe_b[i]),
- .t (pe_t[i]),
- .c_in (pe_c_in[i]),
- .p (pe_p[i]),
- .c_out (pe_c_out[i])
- );
- assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32];
- assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32];
- assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i];
- always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i];
- end
- endgenerate
-
-
-
-
-
- //
- // Shift Registers
- //
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
- reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
- reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
-
- wire shreg_done_load = shreg_load[syst_cnt_last];
- wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
- wire shreg_done_unload = shreg_unload[syst_cnt_last];
-
- reg shreg_now_loading;
- reg shreg_now_latency;
- reg shreg_now_unloading;
-
- reg shreg_done_latency_dly;
-
- always @(posedge clk)
- shreg_done_latency_dly <= shreg_done_latency;
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_LOAD_N_FINAL: begin
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- end
- //
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD: begin
- shreg_now_loading <= 1'b1;
- shreg_now_latency <= 1'b1;
- shreg_now_unloading <= 1'b0;
- shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
- shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
- shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
- end
- //
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: begin
- shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
- shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
- shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
-
- if (shreg_done_load) shreg_now_loading <= 1'b0;
- if (shreg_done_latency) shreg_now_latency <= 1'b0;
- if (shreg_done_latency) shreg_now_unloading <= 1'b1;
- else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
-
- end
- //
- default: begin
- shreg_now_loading <= 1'b0;
- shreg_now_latency <= 1'b0;
- shreg_now_unloading <= 1'b0;
- end
- //
- endcase
-
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START: fifo_c_rst <= 1'b1;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0;
- endcase
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START: fifo_t_rst <= 1'b1;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0;
- endcase
-
-
- reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] pe_p_msb_dly;
-
- always @(posedge clk)
- //
- for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- pe_p_msb_dly[32 * j - 1 -: 32] <= pe_p[j];
-
- wire [31: 0] pe_p_lsb_masked = shreg_now_unloading ? pe_p[0] : 32'd0;
- assign fifo_t_din = {pe_p_lsb_masked, pe_p_msb_dly};
-
-
-
- reg shreg_now_unloading_dly;
- always @(posedge clk)
- shreg_now_unloading_dly <= shreg_now_unloading;
-
- assign fifo_c_wren = shreg_now_unloading_dly;
- assign fifo_c_rden = shreg_now_loading;
-
- assign fifo_t_wren = shreg_now_unloading_dly;
- assign fifo_t_rden = shreg_now_loading;
-
-
-
-
- always @(posedge clk) begin
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
- FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero;
- ab_addr_ext <= bram_addr_ext_zero;
- end
-
- FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
- FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next;
- ab_addr_ext <= ab_addr_ext_next;
- end
- endcase
- //
- case (fsm_state)
-
- FSM_STATE_MULT_Q_N_RELOAD: begin
- //
- if (qn_addr_ext == {1'b0, bram_addr_last}) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
- //
- if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
- s_addr <= s_addr_next;
- sn_addr <= sn_addr_next;
- end
- //
- if (qn_addr_ext == bram_addr_ext_last) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
- //
- end
- //
- /*
- case (fsm_state)
-
- FSM_STATE_MULT_Q_N_RELOAD: begin
- if (qn_addr_ext == {1'b0, bram_addr_last}) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
-
- if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
- s_addr <= s_addr_next;
- sn_addr <= sn_addr_next;
- end
-
- if (qn_addr_ext == bram_addr_ext_last) begin
- s_addr <= bram_addr_zero;
- sn_addr <= bram_addr_zero;
- end
-
- end
-
- FSM_STATE_MULT_Q_N_FINAL,
- FSM_STATE_SAVE_START,
- FSM_STATE_SAVE_WRITE: begin
- s_addr <= !s_addr_done ? s_addr_next : s_addr;
- sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
- end
- */
- endcase
-
- //
- case (fsm_next_state)
- FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next;
- endcase
- //
- case (fsm_next_state)
- FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr;
- endcase
-
- //
- end
-
- always @(posedge clk) begin
- //
- if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
- ab_wren <= shreg_done_latency_dly;
- ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- ab_wren <= 1'b0;
- ab_data_in <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
- q_wren <= shreg_done_latency_dly;
- q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- q_wren <= 1'b0;
- q_data_in <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
- qn_wren <= shreg_done_latency_dly;
- qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
- end else begin
- qn_wren <= 1'b0;
- qn_data_in <= 32'hXXXXXXXX;
- end
- //
- case (fsm_state)
- FSM_STATE_SAVE_START: r_wren <= 1'b1;
- FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done;
- default: r_wren <= 1'b0;
- endcase
- //
- end
-
-
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_MULT_A_B_START,
- FSM_STATE_MULT_AB_N_COEFF_START,
- FSM_STATE_MULT_Q_N_START,
- FSM_STATE_MULT_A_B_RELOAD,
- FSM_STATE_MULT_AB_N_COEFF_RELOAD,
- FSM_STATE_MULT_Q_N_RELOAD:
- //
- syst_cnt_load <= syst_cnt_zero;
-
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH:
- //
- syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
-
- endcase
-
-
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_MULT_A_B_CRUNCH,
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
- FSM_STATE_MULT_Q_N_CRUNCH: begin
-
- if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero;
- else if (shreg_now_unloading)
- syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
-
- end
- endcase
-
-
- //
- // T and C_IN can be moved to a separate code block
- //
- always @(posedge clk) begin
- //
- if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //
- if (shreg_now_loading) begin
- pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
- pe_b[j] <= loader_dout[j];
- //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
- //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
- end else begin
- pe_a[j] <= 32'hXXXXXXXX;
- pe_b[j] <= 32'hXXXXXXXX;
- //pe_t[j] <= 32'hXXXXXXXX;
- //pe_c_in[j] <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //
- if (shreg_now_loading) begin
- pe_a[j] <= ab_data_out;
- pe_b[j] <= loader_dout[j];
- //pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
- //pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
- end else begin
- pe_a[j] <= 32'hXXXXXXXX;
- pe_b[j] <= 32'hXXXXXXXX;
- //pe_t[j] <= 32'hXXXXXXXX;
- //pe_c_in[j] <= 32'hXXXXXXXX;
- end
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- //
- for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
- //
- if (shreg_now_loading) begin
- pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
- pe_b[j] <= loader_dout[j];
- //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly];
- //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly];
- end else begin
- pe_a[j] <= 32'hXXXXXXXX;
- pe_b[j] <= 32'hXXXXXXXX;
- //pe_t[j] <= 32'hXXXXXXXX;
- //pe_c_in[j] <= 32'hXXXXXXXX;
- end
- //
-
- //
- end
-
-
- //
- // Adder
- //
-
- reg add1_ce; // clock enable
- wire [31: 0] add1_s; // sum output
- wire add1_c_in; // carry input
- reg [31: 0] add1_a; // A-input
- reg [31: 0] add1_b; // B-input
- reg add1_c_in_mask; // flag to not carry anything into the very first word
- wire add1_c_out; // carry output
-
- // add masking into carry feedback chain
- assign add1_c_in = add1_c_out & ~add1_c_in_mask;
-
- // mask carry for the very first word of N
- always @(posedge clk)
- //
- if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly)
- add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0;
-
- modexpa7_adder32 add1_inst
- (
- .clk (clk),
- .ce (add1_ce),
- .a (add1_a),
- .b (add1_b),
- .c_in (add1_c_in),
- .s (add1_s),
- .c_out (add1_c_out)
- );
-
- always @(posedge clk)
- //
- add1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_ADD_S) ? 1'b1 : 1'b0;
-
- always @(posedge clk)
- //
- if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin
- add1_a <= pe_p[0];
- add1_b <= ab_data_out;
- end
-
-
- //
- // Subtractor
- //
- /*
- * This subtractor is used to calculate SN = S - N.
- *
- */
-
- reg sub1_ce; // clock enable
- wire [31: 0] sub1_d; // difference output
- wire sub1_b_in; // borrow input
- reg [31: 0] sub1_a; // A-input
- reg [31: 0] sub1_b; // B-input
- reg sub1_b_in_mask; // flag to not borrow anything from the very first word
- wire sub1_b_out; // borrow output
-
- // add masking into borrow feedback chain
- assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-
- // mask carry for the very first word of N TODO!
- //always @(posedge clk)
- //
- //if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly)
- //add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0;
-
- modexpa7_subtractor32 sub1_inst
- (
- .clk (clk),
- .ce (sub1_ce),
- .a (sub1_a),
- .b (sub1_b),
- .b_in (sub1_b_in),
- .d (sub1_d),
- .b_out (sub1_b_out)
- );
-
- always @(posedge clk)
- //
- sub1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr}) ? 1'b1 : 1'b0;
-
- always @*
- sub1_a = add1_s;
-
- always @(posedge clk)
- //
- //if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin
- //add1_a <= pe_p[0];
- //add1_b <= ab_data_out;
- //end
-
-
- /*
- reg sub1_ce; // clock enable
- reg [31: 0] sub1_d; // difference output
- wire sub1_b_in; // borrow input
- wire [31: 0] sub1_a; // A-input
- reg [31: 0] sub1_b; // B-input
- reg sub1_b_in_mask; // flag to not borrow anything from the very first word*/
-// wire sub1_b_out; // borrow output
- /*
-
- // add masking into borrow feedback chain
- assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
-
- always @(posedge clk)
- //
- if (sub1_ce)
- //
- {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
-
- assign sub1_a = add1_s;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
- else
- sub1_b <= 32'hXXXXXXXX;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
- else
- sub1_b_in_mask <= 1'b0;
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
- sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
- else
- sub1_ce <= 1'b0;
- */
-
-
- assign s_data_in = add1_s;
- assign sn_data_in = sub1_d;
-
- always @(posedge clk) begin
- //
- s_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_ADD_S) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0;
- sn_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0;
- //
- end
-
-
- always @(posedge clk)
- //
- if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
- flag_select_s <= sub1_b_out & ~add1_c_out;
-
-
- always @(posedge clk)
- //
- case (fsm_state)
- FSM_STATE_SAVE_START,
- FSM_STATE_SAVE_WRITE:
- r_data_in <= flag_select_s ? s_data_out : sn_data_out;
- endcase
-
-
-
- //
- // FSM Process
- //
- always @(posedge clk or negedge rst_n)
- //
- if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
- else fsm_state <= fsm_next_state;
-
-
- //
- // FSM Transition Logic
- //
- always @* begin
- //
- fsm_next_state = FSM_STATE_STOP;
- //
- case (fsm_state)
-
- FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
- else fsm_next_state = FSM_STATE_IDLE;
- //
- FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
- FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
- //
- FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
- FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START;
- //
- FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
- else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
- else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
- FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START;
- //
- FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
- else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
- FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
- //
- FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
- else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
- else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
- FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START;
- //
- FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_ADD_S;
- else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_ADD_S: fsm_next_state = FSM_STATE_MULT_Q_N_SUB_SN;
- FSM_STATE_MULT_Q_N_SUB_SN: fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
- FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
- else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
- //
- FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
- FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
- else fsm_next_state = FSM_STATE_SAVE_WRITE;
- FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
- //
- FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
-
- endcase
- //
- end
-
-
-endmodule
-
-//======================================================================
-// End of file
-//======================================================================
diff --git a/src/rtl/modexpa7_wrapper.v b/src/rtl/modexpa7_wrapper.v
index 3b749be..090ea8d 100644
--- a/src/rtl/modexpa7_wrapper.v
+++ b/src/rtl/modexpa7_wrapper.v
@@ -35,7 +35,6 @@ module modexpa7_wrapper #
parameter OPERAND_ADDR_WIDTH = 5,
parameter SYSTOLIC_ARRAY_POWER = 2
)
-
(
input clk,
input rst_n,
@@ -62,7 +61,7 @@ module modexpa7_wrapper #
/*
* Output Mux
*/
- wire [31: 0] read_data_regs;
+ reg [31: 0] read_data_regs;
wire [31: 0] read_data_core;
@@ -75,27 +74,31 @@ module modexpa7_wrapper #
localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_CONTROL = 'h08; // {next, init}
localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_STATUS = 'h09; // {valid, ready}
-// localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODE // NOT USED ANYMORE
+ localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODE = 'h10; // {crt, dummy}
localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODULUS_BITS = 'h11; // number of bits in modulus
localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_EXPONENT_BITS = 'h12; // number of bits in exponent
localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_BUFFER_BITS = 'h13; // largest supported number of bits
- localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_ARRAY_BITS = 'h15; // number of bits in systolic array
+ localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_ARRAY_BITS = 'h14; // number of bits in systolic array
localparam CONTROL_INIT_BIT = 0;
localparam CONTROL_NEXT_BIT = 1;
localparam STATUS_READY_BIT = 0;
- localparam STATUS_VALID_BIT = 1;
+ localparam STATUS_VALID_BIT = 1;
+
+ localparam MODE_DUMMY_BIT = 0;
+ localparam MODE_CRT_BIT = 1;
localparam CORE_NAME0 = 32'h6D6F6465; // "mode"
localparam CORE_NAME1 = 32'h78706137; // "xpa7"
- localparam CORE_VERSION = 32'h302E3230; // "0.10"
+ localparam CORE_VERSION = 32'h302E3230; // "0.20"
/*
* Registers
*/
- reg [ 1:0] reg_control;
+ reg [ 1:0] reg_control;
+ reg [ 1:1] reg_mode;
reg [OPERAND_ADDR_WIDTH+5:0] reg_modulus_bits;
reg [OPERAND_ADDR_WIDTH+5:0] reg_exponent_bits;
@@ -142,34 +145,53 @@ module modexpa7_wrapper #
.bus_data_wr (write_data),
.bus_data_rd (read_data_core)
);
-
-
- /*
- * Read Latch
- */
-
- reg [31: 0] read_data_regs;
/*
* Write Checker
*/
-
- // largest supported operand width
- localparam [OPERAND_ADDR_WIDTH+5:0] BUFFER_BITS = {1'b1, {OPERAND_ADDR_WIDTH+4{1'b0}}};
+
+ // smallest and largest supported exponent and modulus widths (in bits)
+ localparam [OPERAND_ADDR_WIDTH+5:0] EXPONENT_MIN_BITS = {{OPERAND_ADDR_WIDTH+4{1'b0}}, 2'b10};
+ localparam [OPERAND_ADDR_WIDTH+5:0] EXPONENT_MAX_BITS = {1'b1, {OPERAND_ADDR_WIDTH+5{1'b0}}};
+
+ localparam [OPERAND_ADDR_WIDTH+5:0] MODULUS_MIN_BITS = {{OPERAND_ADDR_WIDTH-1{1'b0}}, 7'b1000000};
+ localparam [OPERAND_ADDR_WIDTH+5:0] MODULUS_MAX_BITS = {1'b1, {OPERAND_ADDR_WIDTH+5{1'b0}}};
- // check_modulus_bits
+ //
+ // Limits on modulus_bits:
+ //
+ // Must be 64 .. BUFFER_BITS in steps of 32
+ //
function [OPERAND_ADDR_WIDTH+5:0] check_modulus_bits;
input [OPERAND_ADDR_WIDTH+5:0] num_bits;
begin
- //
- //t = num_bits[]
- //if (num_bits > MAX_BITS) write_check_bits = MAX_BITS;
- //else write_check_bits = num_bits;
- //
+ // sanitize num_bits: result is a multiple of 32 within MODULUS_MIN_BITS .. MODULUS_MAX_BITS
+ // store input value
+ check_modulus_bits = num_bits;
+
+ // too large? clamp BEFORE rounding up: an input close to the all-ones value
+ // would otherwise wrap around to zero and end up as MODULUS_MIN_BITS
+ if (check_modulus_bits > MODULUS_MAX_BITS)
+ check_modulus_bits = MODULUS_MAX_BITS;
+
+ // must be multiple of 32: round up by adding 31 and clearing the five low bits
+ check_modulus_bits = check_modulus_bits + 5'd31;
+ check_modulus_bits[4:0] = {5{1'b0}};
+
+ // too small?
+ if (check_modulus_bits < MODULUS_MIN_BITS)
+ check_modulus_bits = MODULUS_MIN_BITS;
+
end
endfunction
+ //
+ // Limits on exponent_bits:
+ //
+ // Must be 2 .. BUFFER_BITS;
+ //
+ //
function [OPERAND_ADDR_WIDTH+5:0] check_exponent_bits;
input [OPERAND_ADDR_WIDTH+5:0] num_bits;
begin
@@ -178,12 +200,12 @@ module modexpa7_wrapper #
check_exponent_bits = num_bits;
// too large?
- if (num_bits > BUFFER_BITS)
- check_exponent_bits = BUFFER_BITS;
+ if (check_exponent_bits > EXPONENT_MAX_BITS)
+ check_exponent_bits = EXPONENT_MAX_BITS;
// too small?
- if (num_bits == {OPERAND_ADDR_WIDTH+5{1'b0}})
- num_bits = {{OPERAND_ADDR_WIDTH+4{1'b0}}, 1'b1};
+ if (check_exponent_bits < EXPONENT_MIN_BITS)
+ check_exponent_bits = EXPONENT_MIN_BITS;
//
end
@@ -194,9 +216,24 @@ module modexpa7_wrapper #
* Internal Quantities Generator
*/
- function [OPERAND_ADDR_WIDTH-1:0] modulus_num_words_core;
- input [OPERAND_ADDR_WIDTH+5:0] num_bits;
+
+ function [OPERAND_ADDR_WIDTH-1:0] get_modulus_num_words_core;
+ input [OPERAND_ADDR_WIDTH+5:0] num_bits;
+ reg [OPERAND_ADDR_WIDTH+5:0] num_words_checked;
begin
+
+ // check number of bits
+ num_words_checked = check_modulus_bits(num_bits);
+
+ // convert number of bits into number of 32-bit words (divide by 32)
+ num_words_checked = {{5{1'b0}}, num_words_checked[OPERAND_ADDR_WIDTH+5:5]};
+
+ // reduce by 1
+ num_words_checked = num_words_checked - 1'b1;
+
+ // return
+ get_modulus_num_words_core = num_words_checked[OPERAND_ADDR_WIDTH-1:0];
+
end
endfunction
@@ -205,14 +242,19 @@ module modexpa7_wrapper #
reg [OPERAND_ADDR_WIDTH+5:0] num_bits_checked;
begin
- // check number of bits (not too large, not too small)
+ // check number of bits
num_bits_checked = check_exponent_bits(num_bits);
- // de
+ // reduce by 1
+ num_bits_checked = num_bits_checked - 1'b1;
+
+ // return
+ get_exponent_num_bits_core = num_bits_checked[OPERAND_ADDR_WIDTH+4:0];
+
end
endfunction
-
+
/*
* Write Interface (External Registers)
*/
@@ -229,7 +271,8 @@ module modexpa7_wrapper #
//
case (address_lsb)
//
- ADDR_CONTROL: reg_control <= write_data[ 1: 0];
+ ADDR_CONTROL: reg_control <= write_data[ 1: 0];
+ ADDR_MODE: reg_mode <= write_data[MODE_CRT_BIT];
ADDR_MODULUS_BITS: reg_modulus_bits <= check_modulus_bits(write_data[OPERAND_ADDR_WIDTH+5:0]);
ADDR_EXPONENT_BITS: reg_exponent_bits <= check_exponent_bits(write_data[OPERAND_ADDR_WIDTH+5:0]);
//
@@ -265,17 +308,20 @@ module modexpa7_wrapper #
//
case (address_lsb)
//
- ADDR_NAME0: tmp_read_data <= CORE_NAME0;
- ADDR_NAME1: tmp_read_data <= CORE_NAME1;
- ADDR_VERSION: tmp_read_data <= CORE_VERSION;
+ ADDR_NAME0: read_data_regs <= CORE_NAME0;
+ ADDR_NAME1: read_data_regs <= CORE_NAME1;
+ ADDR_VERSION: read_data_regs <= CORE_VERSION;
- ADDR_CONTROL: tmp_read_data <= {{30{1'b0}}, reg_control};
- ADDR_STATUS: tmp_read_data <= {{30{1'b0}}, reg_status};
+ ADDR_CONTROL: read_data_regs <= {{30{1'b0}}, reg_control};
+ ADDR_MODE: read_data_regs <= {{30{1'b0}}, reg_mode, 1'b0};
+ ADDR_STATUS: read_data_regs <= {{30{1'b0}}, reg_status};
- ADDR_MODULUS_BITS: tmp_read_data <= {{19{1'b0}}, reg_modulus_bits};
- ADDR_EXPONENT_BITS: tmp_read_data <= {{19{1'b0}}, reg_exponent_bits};
+ ADDR_MODULUS_BITS: read_data_regs <= {{19{1'b0}}, reg_modulus_bits};
+ ADDR_EXPONENT_BITS: read_data_regs <= {{19{1'b0}}, reg_exponent_bits};
+ ADDR_BUFFER_BITS: read_data_regs <= {{26-OPERAND_ADDR_WIDTH {1'b0}}, 1'b1, { OPERAND_ADDR_WIDTH+5{1'b0}}};
+ ADDR_ARRAY_BITS: read_data_regs <= {{26-SYSTOLIC_ARRAY_POWER{1'b0}}, 1'b1, {SYSTOLIC_ARRAY_POWER+5{1'b0}}};
//
- default: tmp_read_data <= {32{1'b0}};
+ default: read_data_regs <= {32{1'b0}};
//
endcase
@@ -294,7 +340,7 @@ module modexpa7_wrapper #
always @(*)
//
- case (address_msb_last)
+ case (address_msb_dly) // NOTE(review): no default branch -- confirm ADDR_MSB_REGS/ADDR_MSB_CORE cover every value of address_msb_dly, otherwise a latch is inferred for read_data_mux
ADDR_MSB_REGS: read_data_mux = read_data_regs;
ADDR_MSB_CORE: read_data_mux = read_data_core;
endcase
diff --git a/src/tb/tb_exponentiator.v b/src/tb/tb_exponentiator.v
index c9a9f7e..16be0a5 100644
--- a/src/tb/tb_exponentiator.v
+++ b/src/tb/tb_exponentiator.v
@@ -160,7 +160,7 @@ module tb_exponentiator;
modexpa7_exponentiator #
(
.OPERAND_ADDR_WIDTH (4), // 32 * (2**4) = 512-bit operands
- .SYSTOLIC_ARRAY_POWER (2) // 2 ** 2 = 4-tap systolic array
+ .SYSTOLIC_ARRAY_POWER (3) // 2 ** 3 = 8-tap systolic array
)
uut
(
@@ -207,7 +207,7 @@ module tb_exponentiator;
rst_n = 1'b1;
#100;
- //test_exponent_384(M_384, D_384, FACTOR_384, N_384, N_COEFF_384, S_384);
+ test_exponent_384(M_384, D_384, FACTOR_384, N_384, N_COEFF_384, S_384);
test_exponent_512(M_512, D_512, FACTOR_512, N_512, N_COEFF_512, S_512);
end
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index e9d532e..96e76d5 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -57,7 +57,7 @@ module tb_systolic_multiplier;
//
// Model Settings
//
- localparam NUM_ROUNDS = 43;
+ localparam NUM_ROUNDS = 1000;
//
@@ -193,7 +193,7 @@ module tb_systolic_multiplier;
#100;
test_systolic_multiplier_384(M_384, N_384, N_COEFF_384, FACTOR_384, COEFF_384);
- //test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512);
+ test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512);
end
diff --git a/src/tb/tb_wrapper.v b/src/tb/tb_wrapper.v
index bd8dbf1..fae0934 100644
--- a/src/tb/tb_wrapper.v
+++ b/src/tb/tb_wrapper.v
@@ -2,43 +2,108 @@
module tb_wrapper;
- // Inputs
+ /*
+ * Settings
+ */
+ localparam USE_OPERAND_ADDR_WIDTH = 7;
+ localparam USE_SYSTOLIC_ARRAY_POWER = 1;
+
+ /*
+ * Clock (100 MHz)
+ */
reg clk;
+ initial clk = 1'b0;
+ always #5 clk = ~clk;
+
+ /*
+ * Reset
+ */
reg rst_n;
- reg cs;
- reg we;
- reg [7:0] address;
- reg [31:0] write_data;
-
- // Outputs
- wire [31:0] read_data;
+
+ /*
+ * Access Bus
+ */
+ reg bus_cs;
+ reg bus_we;
+ reg [USE_OPERAND_ADDR_WIDTH+2:0] bus_addr;
+ reg [ 32-1:0] bus_wr_data;
+ wire [ 32-1:0] bus_rd_data;
- // Instantiate the Unit Under Test (UUT)
- modexpa7_wrapper uut (
- .clk(clk),
- .rst_n(rst_n),
- .cs(cs),
- .we(we),
- .address(address),
- .write_data(write_data),
- .read_data(read_data)
+ modexpa7_wrapper #
+ (
+ .OPERAND_ADDR_WIDTH (USE_OPERAND_ADDR_WIDTH),
+ .SYSTOLIC_ARRAY_POWER (USE_SYSTOLIC_ARRAY_POWER)
+ )
+ uut
+ (
+ .clk (clk),
+
+ .rst_n (rst_n),
+
+ .cs (bus_cs),
+ .we (bus_we),
+ .address (bus_addr),
+ .write_data (bus_wr_data),
+ .read_data (bus_rd_data)
);
+ reg [31: 0] tmp;
initial begin
- // Initialize Inputs
- clk = 0;
+ //
rst_n = 0;
- cs = 0;
- we = 0;
- address = 0;
- write_data = 0;
-
- // Wait 100 ns for global reset to finish
- #100;
-
- // Add stimulus here
-
+ //
+ bus_cs = 0;
+ bus_we = 0;
+ bus_addr = 'bX;
+ bus_wr_data = 'bX;
+ //
+ #200;
+ //
+ rst_n = 1;
+ //
+ read_reg('h00, tmp); // NAME0
+ read_reg('h01, tmp); // NAME1
+ read_reg('h02, tmp); // VERSION
+ //
+ read_reg('h13, tmp); // BUFFER_BITS
+ read_reg('h14, tmp); // ARRAY_BITS
+ //
+ write_reg('h12, 32'd384); // EXPONENT_BITS
+ read_reg ('h12, tmp);
+ //
+ write_reg('h11, 32'd384); // MODULUS_BITS
+ read_reg ('h11, tmp);
+ //
+ //
end
+
+ task read_reg; // Bus read: drives cs and address for #10, then copies bus_rd_data into 'data'
+ input [USE_OPERAND_ADDR_WIDTH+1:0] addr; // address without the top bus_addr bit
+ output [ 32-1:0] data; // value seen on bus_rd_data at the end of the access
+ begin
+ bus_cs = 1; // bus_we is not driven here; it is 0 from initialization and after write_reg
+ bus_addr = {1'b0, addr}; // zero MSB prepended; presumably selects the register bank -- confirm against the wrapper's ADDR_MSB_REGS
+ #10; // one full clk period (clk toggles every #5)
+ bus_cs = 0;
+ bus_addr = 'bX; // address is left unknown (X) while the bus is idle
+ data = bus_rd_data; // sampled after cs is released; assumes the wrapper still holds the read value -- confirm
+ end
+ endtask
+
+ task write_reg; // Bus write: drives cs, we, address and data for #10, then releases the bus
+ input [USE_OPERAND_ADDR_WIDTH+1:0] addr; // address without the top bus_addr bit
+ input [ 32-1:0] data; // value placed on bus_wr_data
+ begin
+ bus_cs = 1;
+ bus_we = 1;
+ bus_addr = {1'b0, addr}; // zero MSB prepended; presumably selects the register bank -- confirm against the wrapper's ADDR_MSB_REGS
+ bus_wr_data = data;
+ #10; // one full clk period (clk toggles every #5)
+ bus_cs = 0;
+ bus_we = 0;
+ bus_addr = 'bX; // address is left unknown (X) while idle; bus_wr_data keeps its last value
+ end
+ endtask
endmodule