aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-13 21:38:53 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-13 21:38:53 +0300
commit72a67f04a21ba4006c7b5bf38e01a3aa6592740f (patch)
tree6b5a90a9dc683bbe7c91e09a2f20ea8e462e5995
parent71b75290bf2ade9a4022bad93dc80bfb77f87f40 (diff)
Systolic multiplier simplified a bit:
* passes testbench tests again
* this time synthesizes fine (without major issues)

List of things that need polishing in the future:

* The parallelized operand loader can be reduced by a factor of 3 to store only one operand at a time: it currently stores B, N_COEFF and N. After B is consumed, it can be overwritten with AB, and N_COEFF can be loaded sequentially the same way A is loaded. After that, the loader can be filled with Q while N is loaded sequentially.
* It turns out the QN block memory is not needed at all. After we obtain the next word of QN, we immediately calculate SN. After that QN can be discarded; there is no need to store it.
* Currently there are two wide memories, T and PE_C_OUT. XST throws weird warnings about multi-port RAM before finally deciding to implement them using flip-flops. Those memories should be turned into FIFOs to simplify the design and not confuse XST.
-rw-r--r--src/rtl/modexpa7_systolic_multiplier.v239
-rw-r--r--src/tb/tb_systolic_multiplier.v1
2 files changed, 234 insertions, 6 deletions
diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v
index 56e7be3..513b5aa 100644
--- a/src/rtl/modexpa7_systolic_multiplier.v
+++ b/src/rtl/modexpa7_systolic_multiplier.v
@@ -118,6 +118,10 @@ module modexpa7_systolic_multiplier #
localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;
+ localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71;
+ localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72;
+ localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73;
+
localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
//
@@ -271,7 +275,7 @@ module modexpa7_systolic_multiplier #
//
// Loader currently stores B, N_COEFF and N, it can be coded another way
- // to initially stire B, then AB, then Q. Some memory can be saved thay way.
+ // to initially store B, then AB, then Q. Some memory can be saved that way.
// Maybe later...
//
@@ -324,6 +328,9 @@ module modexpa7_systolic_multiplier #
reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext;
reg [OPERAND_ADDR_WIDTH-1:0] q_addr;
reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext;
+ reg [OPERAND_ADDR_WIDTH-1:0] s_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] sn_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
/* handy increment values */
wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
@@ -333,6 +340,9 @@ module modexpa7_systolic_multiplier #
wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
/* handy stop flags */
wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0;
@@ -342,6 +352,9 @@ module modexpa7_systolic_multiplier #
wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0;
wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0;
+ wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
/* delayed B address */
reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
@@ -358,9 +371,16 @@ module modexpa7_systolic_multiplier #
assign b_bram_addr = b_addr;
assign n_coeff_bram_addr = n_coeff_addr;
assign n_bram_addr = n_addr;
+ assign r_bram_addr = r_addr;
//
+ // Flag
+ //
+ reg flag_select_s;
+
+
+ //
// Memory Address Control Logic
//
always @(posedge clk) begin
@@ -375,6 +395,20 @@ module modexpa7_systolic_multiplier #
FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
endcase
//
+ case (fsm_state)
+ FSM_STATE_MULT_Q_N_RELOAD:
+ if (qn_addr_ext == {1'b0, bram_addr_last})
+ n_addr <= bram_addr_zero;
+ else if (qn_addr_ext > {1'b0, bram_addr_last})
+ n_addr <= n_addr_next;
+
+ endcase
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
+ FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
+ endcase
+ //
case (fsm_next_state)
FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
@@ -391,16 +425,28 @@ module modexpa7_systolic_multiplier #
reg [31: 0] ab_data_in;
reg [31: 0] q_data_in;
reg [31: 0] qn_data_in;
+ wire [31: 0] s_data_in;
+ wire [31: 0] sn_data_in;
+ reg [31: 0] r_data_in;
/* memory outputs */
wire [31: 0] ab_data_out;
wire [31: 0] q_data_out;
wire [31: 0] qn_data_out;
+ wire [31: 0] s_data_out;
+ wire [31: 0] sn_data_out;
/* write enables */
reg ab_wren;
reg q_wren;
reg qn_wren;
+ reg s_wren;
+ reg sn_wren;
+ reg r_wren;
+
+ /* map */
+ assign r_bram_in = r_data_in;
+ assign r_bram_wr = r_wren;
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
@@ -411,6 +457,12 @@ module modexpa7_systolic_multiplier #
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
+
//
// Wide Operand Loader
@@ -646,13 +698,46 @@ module modexpa7_systolic_multiplier #
case (fsm_state)
FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
- FSM_STATE_MULT_Q_N_START: qn_addr_ext <= bram_addr_ext_zero;
+ FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero;
+ ab_addr_ext <= bram_addr_ext_zero;
+ end
FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
- FSM_STATE_MULT_Q_N_RELOAD: qn_addr_ext <= qn_addr_ext_next;
+ FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next;
+ ab_addr_ext <= ab_addr_ext_next;
+ end
+ endcase
+ //
+ case (fsm_state)
+
+ FSM_STATE_MULT_Q_N_RELOAD: begin
+ if (qn_addr_ext == {1'b0, bram_addr_last}) begin
+ s_addr <= bram_addr_zero;
+ sn_addr <= bram_addr_zero;
+ end
+
+ if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
+ s_addr <= s_addr_next;
+ sn_addr <= sn_addr_next;
+ end
+
+ if (qn_addr_ext == bram_addr_ext_last) begin
+ s_addr <= bram_addr_zero;
+ sn_addr <= bram_addr_zero;
+ end
+
+ end
+
+ FSM_STATE_MULT_Q_N_FINAL,
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE: begin
+ s_addr <= !s_addr_done ? s_addr_next : s_addr;
+ sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
+ end
endcase
+
//
case (fsm_next_state)
FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
@@ -692,7 +777,12 @@ module modexpa7_systolic_multiplier #
qn_wren <= 1'b0;
qn_data_in <= 32'hXXXXXXXX;
end
-
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START: r_wren <= 1'b1;
+ FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done;
+ default: r_wren <= 1'b0;
+ endcase
//
end
@@ -816,6 +906,140 @@ module modexpa7_systolic_multiplier #
//
end
+
+ //
+ // Adder
+ //
+ /*
+ * This adder is used to calculate S = AB + QN.
+ *
+ */
+ reg add1_ce; // clock enable
+ reg [31: 0] add1_s; // sum output
+ wire add1_c_in; // carry input
+ wire [31: 0] add1_a; // A-input
+ reg [31: 0] add1_b; // B-input
+ reg add1_c_in_mask; // flag to not carry anything into the very first word
+ reg add1_c_out; // carry output
+
+ /* add masking into carry feedback chain */
+ assign add1_c_in = add1_c_out & ~add1_c_in_mask;
+
+ /* mask carry for the very first word of N */
+ //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+
+ always @(posedge clk)
+ //
+ if (add1_ce)
+ //
+ {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
+
+ assign add1_a = qn_data_in;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
+ else
+ add1_b <= 32'hXXXXXXXX;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
+ else
+ add1_c_in_mask <= 1'b0;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ add1_ce <= shreg_done_latency_dly;
+ else
+ add1_ce <= 1'b0;
+
+
+ //
+ // Subtractor
+ //
+ /*
+ * This subtractor is used to calculate SN = S - N.
+ *
+ * It is declared ahead of the S/SN write logic below, so that sub1_d and
+ * sub1_ce are known before their first use.
+ */
+ reg sub1_ce; // clock enable
+ reg [31: 0] sub1_d; // difference output
+ wire sub1_b_in; // borrow input
+ wire [31: 0] sub1_a; // A-input
+ reg [31: 0] sub1_b; // B-input
+ reg sub1_b_in_mask; // flag to not borrow anything from the very first word
+ reg sub1_b_out; // borrow output
+
+ /* add masking into borrow feedback chain */
+ assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
+
+ /* 33-bit subtraction, the top bit of the result is the borrow output */
+ always @(posedge clk)
+ //
+ if (sub1_ce)
+ //
+ {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
+
+ /* A-input is the current sum output of the adder */
+ assign sub1_a = add1_s;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
+ else
+ sub1_b <= 32'hXXXXXXXX;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
+ else
+ sub1_b_in_mask <= 1'b0;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
+ sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
+ else
+ sub1_ce <= 1'b0;
+
+ //
+ // S and SN Write Logic
+ //
+ /* s_data_in, sn_data_in, s_wren and sn_wren each have exactly one driver */
+ assign s_data_in = add1_s;
+ assign sn_data_in = sub1_d;
+
+ always @(posedge clk) begin
+ //
+ s_wren <= add1_ce;
+ sn_wren <= sub1_ce;
+ end
+
+
+
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
+ flag_select_s <= sub1_b_out & ~add1_c_out;
+
+
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_SAVE_START,
+ FSM_STATE_SAVE_WRITE:
+ r_data_in <= flag_select_s ? s_data_out : sn_data_out;
+ endcase
+
+
//
// FSM Process
@@ -878,7 +1102,12 @@ module modexpa7_systolic_multiplier #
else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
- FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_STOP;
+ FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
+ //
+ FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
+ FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
+ else fsm_next_state = FSM_STATE_SAVE_WRITE;
+ FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
//
FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
diff --git a/src/tb/tb_systolic_multiplier.v b/src/tb/tb_systolic_multiplier.v
index 9df492e..33d1e01 100644
--- a/src/tb/tb_systolic_multiplier.v
+++ b/src/tb/tb_systolic_multiplier.v
@@ -273,7 +273,6 @@ module tb_systolic_multiplier;
b = ab_modulo; // prepare for next round
- #1000000;
end
// final step, display results