aboutsummaryrefslogtreecommitdiff
path: root/src/rtl
diff options
context:
space:
mode:
Diffstat (limited to 'src/rtl')
-rw-r--r-- src/rtl/modexpa7_factor.v | 57
-rw-r--r-- src/rtl/modexpa7_n_coeff.v | 745
-rw-r--r-- src/rtl/pe/modexpa7_pe_mul.v | 41
3 files changed, 553 insertions, 290 deletions
diff --git a/src/rtl/modexpa7_factor.v b/src/rtl/modexpa7_factor.v
index 17d4785..510f7af 100644
--- a/src/rtl/modexpa7_factor.v
+++ b/src/rtl/modexpa7_factor.v
@@ -118,63 +118,6 @@ module modexpa7_factor #
localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;
-
- //
- // BRAM Addresses
- //
- /*
- reg [OPERAND_ADDR_WIDTH-1:0] f_bram_addr_reg;
-
- wire [OPERAND_ADDR_WIDTH-1:0] f_bram_addr_next = f_bram_addr + 1'b1;
-
- wire f_bram_addr_done = (f_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
-
- assign f_bram_addr = f_bram_addr_reg;
-
-
- always @(posedge clk)
- //
- case (fsm_next_state)
-
- FSM_STATE_INIT_ZERO_ADDR: f_bram_addr_reg <= bram_addr_zero;
- FSM_STATE_INIT_NEXT_ADDR: f_bram_addr_reg <= f_bram_addr_next;
-
- endcase
-
- reg f_bram_en;
-
- assign f_bram_wr = f_bram_en;
-
- always @(posedge clk)
- //
- case (fsm_next_state)
-
- FSM_STATE_INIT_ZERO_ADDR,
- FSM_STATE_INIT_NEXT_ADDR,
- FSM_STATE_INIT_LAST_ADDR: f_bram_en <= 1'b1;
- default: f_bram_en <= 1'b0;
-
- endcase
-
-
- reg [31: 0] f_bram_data;
-
- assign f_bram_in = f_bram_data;
-
- always @(posedge clk)
- //
- case (fsm_next_state)
- FSM_STATE_INIT_ZERO_ADDR: f_bram_data <= 32'd1;
- FSM_STATE_INIT_NEXT_ADDR,
- FSM_STATE_INIT_LAST_ADDR: f_bram_data <= 32'd0;
- default: f_bram_data <= {32{1'bX}};
-
- endcase
- */
-
-
-
-
//
// Cycle Counters
//
diff --git a/src/rtl/modexpa7_n_coeff.v b/src/rtl/modexpa7_n_coeff.v
index 1e763ba..cba59e2 100644
--- a/src/rtl/modexpa7_n_coeff.v
+++ b/src/rtl/modexpa7_n_coeff.v
@@ -40,28 +40,28 @@ module modexpa7_n_coeff #
(
//
// This sets the address widths of memory buffers. Internal data
- // width is 32 bits, so for e.g. 1024-bit operands buffers must store
- // 1024 / 32 = 32 words, and these need 5-bit address bus, because
- // 2 ** 5 = 32.
+ // width is 32 bits, so for e.g. 2048-bit operands buffers must store
+ // 2048 / 32 = 64 words, and these need 6-bit address bus, because
+ // 2 ** 6 = 64.
//
- parameter OPERAND_ADDR_WIDTH = 5
+ parameter OPERAND_ADDR_WIDTH = 6
)
(
- input clk,
- input rst_n,
+ input clk, // clock
+ input rst_n, // active-low reset
- input ena,
- output rdy,
+ input ena, // enable input
+ output rdy, // ready output
- output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
- output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
+ output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, // modulus memory address
+ output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, // modulus coefficient memory address
- input [ 32-1:0] n_bram_out,
+ input [ 32-1:0] n_bram_out, // modulus memory output
- output [ 32-1:0] n_coeff_bram_in,
- output n_coeff_bram_wr,
+ output [ 32-1:0] n_coeff_bram_in, // modulus coefficient memory input
+ output n_coeff_bram_wr, // modulus coefficient memory write enable
- input [OPERAND_ADDR_WIDTH-1:0] n_num_words
+ input [OPERAND_ADDR_WIDTH-1:0] n_num_words // number of words in modulus
);
//
@@ -79,191 +79,286 @@ module modexpa7_n_coeff #
localparam [ 7: 0] FSM_STATE_CALC_2 = 8'hB2;
localparam [ 7: 0] FSM_STATE_CALC_3 = 8'hB3;
localparam [ 7: 0] FSM_STATE_CALC_4 = 8'hB4;
- /*
localparam [ 7: 0] FSM_STATE_CALC_5 = 8'hB5;
- localparam [ 7: 0] FSM_STATE_CALC_6 = 8'hB6;
- localparam [ 7: 0] FSM_STATE_CALC_7 = 8'hB7;
- localparam [ 7: 0] FSM_STATE_CALC_8 = 8'hB8;
localparam [ 7: 0] FSM_STATE_SAVE_1 = 8'hC1;
localparam [ 7: 0] FSM_STATE_SAVE_2 = 8'hC2;
localparam [ 7: 0] FSM_STATE_SAVE_3 = 8'hC3;
localparam [ 7: 0] FSM_STATE_SAVE_4 = 8'hC4;
localparam [ 7: 0] FSM_STATE_SAVE_5 = 8'hC5;
- */
+
localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
+
+ //
+ // FSM State / Next State
+ //
reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
reg [ 7: 0] fsm_next_state;
- //
- // Enable Delay (Trigger)
- //
+ //
+ // Enable Delay and Trigger
+ //
reg ena_dly = 1'b0;
- wire ena_trig = ena && !ena_dly;
+
+ /* delay enable by one clock cycle */
always @(posedge clk) ena_dly <= ena;
+
+ /* trigger new operation when enable goes high */
+ wire ena_trig = ena && !ena_dly;
+
+ //
+ // Ready Flag Logic
+ //
+ reg rdy_reg = 1'b1;
+ assign rdy = rdy_reg;
+
+ always @(posedge clk or negedge rst_n)
+
+ /* reset flag */
+ if (rst_n == 1'b0) rdy_reg <= 1'b1;
+ else begin
+
+ /* clear flag when operation is started */
+ if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
+
+ /* set flag after operation is finished */
+ if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
+
+ end
+
//
// Parameters Latch
//
reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
+ /* save number of words in modulus when new operation starts */
always @(posedge clk)
//
if (fsm_next_state == FSM_STATE_INIT_1)
n_num_words_latch <= n_num_words;
-
- //
- // Addresses
- //
- localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
- wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;
-
- /*
//
// Cycle Counters
//
- reg [OPERAND_ADDR_WIDTH+5:0] cyc_cnt; // cycle counter
+ reg [OPERAND_ADDR_WIDTH+4:0] cyc_cnt;
- wire [OPERAND_ADDR_WIDTH+5:0] cyc_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};
- wire [OPERAND_ADDR_WIDTH+5:0] cyc_cnt_last = {n_num_words, 1'b1, {5{1'b1}}};
- wire [OPERAND_ADDR_WIDTH+5:0] cyc_cnt_next = cyc_cnt + 1'b1;
+ /* initial value of the cycle counter (all zeroes) */
+ wire [OPERAND_ADDR_WIDTH+4:0] cyc_cnt_zero = {{OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};
+ /*
+  * terminal value of the cycle counter: word index in the upper bits,
+  * 5'b11110 in the lower five bits. Built from the latched word count
+  * (n_num_words_latch) rather than the raw n_num_words input, so the
+  * terminal value cannot move if the input changes while an operation is
+  * in progress; bram_addr_last is derived from the same latch.
+  */
+ wire [OPERAND_ADDR_WIDTH+4:0] cyc_cnt_last = {n_num_words_latch, 5'b11110};
+ /* cycle counter plus one */
+ wire [OPERAND_ADDR_WIDTH+4:0] cyc_cnt_next = cyc_cnt + 1'b1;
+ /* handy flag */
wire cyc_cnt_done = (cyc_cnt == cyc_cnt_last) ? 1'b1 : 1'b0;
-
always @(posedge clk)
//
if (fsm_next_state == FSM_STATE_CALC_1)
//
case (fsm_state)
- FSM_STATE_INIT_2: cyc_cnt <= cyc_cnt_zero;
- FSM_STATE_SAVE_5: cyc_cnt <= cyc_cnt_done ? cyc_cnt : cyc_cnt_next;
+ FSM_STATE_INIT_5: cyc_cnt <= cyc_cnt_zero;
+ FSM_STATE_SAVE_5: cyc_cnt <= !cyc_cnt_done ? cyc_cnt_next : cyc_cnt;
endcase
- */
-
-
-
-
- //
- // Ready Flag Logic
+
+
//
- reg rdy_reg = 1'b1;
- assign rdy = rdy_reg;
-
- always @(posedge clk or negedge rst_n)
+ // Handy Address Values
//
- if (rst_n == 1'b0) rdy_reg <= 1'b1;
- else begin
- if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;
- if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;
- end
+
+ /* the very first address */
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
+
+ /* the very last address */
+ wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;
//
// Block Memories
//
+
+ /*
+ * This module uses 8 block memories:
+ *
+ * N - external input, stores modulus
+ * R - internal, stores intermediate result
+ * B - internal, stores current bit mask (see high-level algorithm)
+ * T - internal, stores the product R * NN (see high-level algorithm)
+ * NN - internal, stores the quantity ~N + 1 (see high-level algorithm)
+ * RR - internal, stores a copy of R (see high-level algorithm)
+ * RB - internal, stores the sum R + B (see high-level algorithm)
+ * N_COEFF - external output, stores the calculated modulus-dependent coefficient
+ *
+ */
+
reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] t_addr;
reg [OPERAND_ADDR_WIDTH-1:0] nn_addr;
- reg [OPERAND_ADDR_WIDTH-1:0] t_addr_wr;
- reg [OPERAND_ADDR_WIDTH-1:0] t_addr_rd;
+ reg [OPERAND_ADDR_WIDTH-1:0] rr_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] rb_addr;
+ reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
reg [31: 0] r_data_in;
reg [31: 0] b_data_in;
- reg [31: 0] nn_data_in;
reg [31: 0] t_data_in;
+ reg [31: 0] nn_data_in;
+ reg [31: 0] rr_data_in;
+ reg [31: 0] rb_data_in;
+ reg [31: 0] n_coeff_data_in;
wire [31: 0] r_data_out;
wire [31: 0] b_data_out;
- wire [31: 0] nn_data_out;
wire [31: 0] t_data_out;
+ wire [31: 0] nn_data_out;
+ wire [31: 0] rr_data_out;
+ wire [31: 0] rb_data_out;
- reg r_wren;
- reg b_wren;
- reg nn_wren;
- reg t_wren;
+ reg r_wren;
+ reg b_wren;
+ reg t_wren;
+ reg nn_wren;
+ reg rr_wren;
+ reg rb_wren;
+ reg n_coeff_wren;
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out));
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out));
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_b (.clk(clk), .a_addr(b_addr), .a_wr(b_wren), .a_in(b_data_in), .a_out(b_data_out));
- bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_nn (.clk(clk), .a_addr(nn_addr), .a_wr(nn_wren), .a_in(nn_data_in), .a_out(nn_data_out));
- bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
- bram_t (.clk(clk), .a_addr(t_addr_wr), .a_wr(t_wren), .a_in(t_data_in), .a_out(), .b_addr(t_addr_rd), .b_out(t_data_out));
-
- assign n_bram_addr = n_addr;
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_t (.clk(clk), .a_addr(t_addr), .a_wr(t_wren), .a_in(t_data_in), .a_out(t_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_rb (.clk(clk), .a_addr(rb_addr), .a_wr(rb_wren), .a_in(rb_data_in), .a_out(rb_data_out));
+
+ bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_rr (.clk(clk), .a_addr(rr_addr), .a_wr(rr_wren), .a_in(rr_data_in), .a_out(rr_data_out));
+
+ /* handy values */
+ wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] t_addr_next = t_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] nn_addr_next = nn_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] rr_addr_next = rr_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] rb_addr_next = rb_addr + 1'b1;
+ wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] nn_addr_next = nn_addr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] t_addr_wr_next = t_addr_wr + 1'b1;
- wire [OPERAND_ADDR_WIDTH-1:0] t_addr_rd_next = t_addr_rd + 1'b1;
+ /* handy flags */
+ wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire t_addr_done = (t_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire nn_addr_done = (nn_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire rr_addr_done = (rr_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire rb_addr_done = (rb_addr == bram_addr_last) ? 1'b1 : 1'b0;
+ wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire nn_addr_done = (nn_addr == bram_addr_last) ? 1'b1 : 1'b0;
- wire t_addr_wr_done = (t_addr_wr == bram_addr_last) ? 1'b1 : 1'b0;
- wire t_addr_rd_done = (t_addr_rd == bram_addr_last) ? 1'b1 : 1'b0;
+ /* map top-level ports to internal registers */
+ assign n_bram_addr = n_addr;
+ assign n_coeff_bram_addr = n_coeff_addr;
+ assign n_coeff_bram_in = n_coeff_data_in;
+ assign n_coeff_bram_wr = n_coeff_wren;
+
+
+ //
+ // Delayed Flags
+ //
+ reg rb_addr_done_dly;
+
+ /* delay rb_addr_done flag by one clock cycle (used later) */
+ always @(posedge clk) rb_addr_done_dly <= rb_addr_done;
//
- // Subtractor
+ // Adder1
//
- wire [31: 0] add_s;
- wire add_c_in;
- reg add_b_lsb;
- reg add_c_in_mask;
- reg add_c_in_mask_dly;
- wire add_c_out;
+
+ /*
+ * This adder is used to calculate NN = ~N + 1.
+ *
+ */
+ wire [31: 0] add1_s; // sum output
+ wire add1_c_in; // carry input
+ reg add1_b_lsb; // B-input
+ reg add1_c_in_mask; // flag to not carry anything into the very first word
+ reg add1_c_in_mask_dly; // delayed carry masking flag
+ wire add1_c_out; // carry output
- assign add_c_in = add_c_out & ~add_c_in_mask;
+ /* add masking into carry feedback chain */
+ assign add1_c_in = add1_c_out & ~add1_c_in_mask;
- always @(posedge clk)
- //
- add_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
-
- always @(posedge clk)
- //
- add_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+ /* feed 1 into port B of adder */
+ always @(posedge clk) add1_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+
+ /* mask carry for the very first word of N */
+ always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
+
+ /* delay carry masking flag by one clock cycle (used later) */
+ always @(posedge clk) add1_c_in_mask_dly <= add1_c_in_mask;
+
+ modexpa7_pe_add add1_inst
+ (
+ .clk (clk), //
+ .ce (1'b1),
+ .a (~n_bram_out), // ~N
+ .b ({{31{1'b0}}, add1_b_lsb}), // 1
+ .c_in (add1_c_in), //
+ .s (add1_s), //
+ .c_out (add1_c_out) //
+ );
+
- always @(posedge clk)
//
- add_c_in_mask_dly <= add_c_in_mask;
+ // Adder2
+ //
+
+ /*
+ * This adder is used to calculate RB = R + B.
+ *
+ */
+ wire [31: 0] add2_s; // sum output
+ reg add2_c_in; // carry input
+ wire add2_c_out; // carry output
- ip_add32 add_inst
+ modexpa7_pe_add add2_inst
(
.clk (clk),
- .a (~n_bram_out),
- .b ({{31{1'b0}}, add_b_lsb}),
- .c_in (add_c_in),
- .s (add_s),
- .c_out (add_c_out)
+ .ce (1'b1),
+ .a (r_data_out),
+ .b (b_data_in),
+ .c_in (add2_c_in),
+ .s (add2_s),
+ .c_out (add2_c_out)
);
//
// Multiplier
//
+
+ /*
+ * This multiplier is used to calculate T = R * NN.
+ *
+ */
+
reg [31: 0] pe_a;
reg [31: 0] pe_b;
reg [31: 0] pe_t;
reg [31: 0] pe_c_in;
wire [31: 0] pe_p;
wire [31: 0] pe_c_out;
-
- modexpa7_pe_mul pe2
+
+ modexpa7_pe_mul pe_mul_inst
(
.clk (clk),
.a (pe_a),
@@ -274,161 +369,413 @@ module modexpa7_n_coeff #
.c_out (pe_c_out)
);
+
+ //
+ // Multiplier Latency Compensation Logic
+ //
- /*
+ localparam SYSTOLIC_PE_LATENCY = 4;
+
+ /* shift register to match data propagation delay */
+ reg [SYSTOLIC_PE_LATENCY:0] pe_latency;
+ wire pe_latency_done = pe_latency[SYSTOLIC_PE_LATENCY];
+
+ /* gradually fill the shift register with ones */
always @(posedge clk)
//
- case (fsm_next_state)
- FSM_STATE_CALC_2: f0_data_out_carry <= 1'b0;
- FSM_STATE_CALC_3,
- FSM_STATE_CALC_4,
- FSM_STATE_CALC_5,
- FSM_STATE_CALC_6: f0_data_out_carry <= f0_data_out[31];
- default: f0_data_out_carry <= 1'bX;
- endcase
- */
+ if (fsm_state == FSM_STATE_CALC_1)
+ pe_latency <= {1'b0, {SYSTOLIC_PE_LATENCY{1'b0}}};
+ else pe_latency <= {pe_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b1};
- /*
- reg sub_b_out_dly1;
- reg f0_data_out_carry_dly1;
- reg f0_data_out_carry_dly2;
+
+ //
+ // Adder2 Output Delay
+ //
+ reg [31: 0] add2_s_dly[1:SYSTOLIC_PE_LATENCY-1];
+ reg add2_c_out_dly[1:SYSTOLIC_PE_LATENCY+2];
+
+ /* delay sum */
+ integer i;
+ always @(posedge clk)
+ //
+ for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1)
+ add2_s_dly[i] <= (i == 1) ? add2_s : add2_s_dly[i-1];
+
+ /* delay adder carry */
+ always @(posedge clk)
+ //
+ for (i=1; i<=(SYSTOLIC_PE_LATENCY+2); i=i+1)
+ add2_c_out_dly[i] <= (i == 1) ? add2_c_out : add2_c_out_dly[i-1];
+
+ /* adder carry feedback */
+ always @(posedge clk)
+ //
+ if ((fsm_next_state == FSM_STATE_CALC_3) && (nn_addr == bram_addr_zero))
+ add2_c_in <= (r_addr == bram_addr_zero) ? 1'b0 : add2_c_out_dly[SYSTOLIC_PE_LATENCY+2];
+
+ //
+ // Multiplier Output Delay
+ //
+ reg [31: 0] pe_c_out_dly[1:3];
+
+ always @(posedge clk)
+ //
+ for (i=1; i<=3; i=i+1)
+ pe_c_out_dly[i] <= (i == 1) ? pe_c_out : pe_c_out_dly[i-1];
+
+
+ //
+ // Multiplier Operand Loader
+ //
+ always @(posedge clk)
+ //
+ if (fsm_next_state == FSM_STATE_CALC_3) begin
+ pe_a <= r_data_out;
+ pe_b <= nn_data_out;
+ pe_t <= (nn_addr == bram_addr_zero) ? {32{1'b0}} : t_data_out;
+ pe_c_in <= (r_addr == bram_addr_zero) ? {32{1'b0}} : pe_c_out_dly[3];
+ end else begin
+ pe_a <= {32{1'bX}};
+ pe_b <= {32{1'bX}};
+ pe_t <= {32{1'bX}};
+ pe_c_in <= {32{1'bX}};
+ end
+
- always @(posedge clk) sub_b_out_dly1 <= sub_b_out;
+ //
+ // B Shift Carry Logic
+ //
- always @(posedge clk) f0_data_out_carry_dly1 <= f0_data_out_carry;
- always @(posedge clk) f0_data_out_carry_dly2 <= f0_data_out_carry_dly1;
+ /*
+ * B value is repeatedly shifted to the left, so we need carry logic
+ * to save the MSB of the current output word and feed into the LSB
+ * of the next input word.
+ *
+ */
+
+ reg b_data_out_carry;
- reg flag_keep_f;
+ always @(posedge clk)
+ //
+ case (fsm_next_state)
+
+ /* mask carry into the very first word */
+ FSM_STATE_CALC_2:
+ if ((nn_addr == bram_addr_zero) && (b_addr == bram_addr_zero))
+ b_data_out_carry <= 1'b0;
+
+ /* carry feedback */
+ FSM_STATE_CALC_3:
+ if (nn_addr == bram_addr_zero)
+ b_data_out_carry <= b_data_out[31];
+
+ endcase
+
+
+ //
+ // R Update Flag
+ //
+ reg flag_update_r;
+ /* indices of the target bit of T */
+ wire [ 4:0] flag_addr_bit = cyc_cnt_next[4:0];
+ wire [OPERAND_ADDR_WIDTH-1:0] flag_addr_word = cyc_cnt_next[OPERAND_ADDR_WIDTH+4:5];
+
+ /* update flag when the target bit of T is available */
always @(posedge clk)
//
- if (fsm_next_state == FSM_STATE_SAVE_1)
- flag_keep_f <= sub_b_out_dly1 & ~f0_data_out_carry_dly2;
- */
+ if (t_wren && (t_addr == flag_addr_word))
+ flag_update_r <= t_data_in[flag_addr_bit];
+
- always @* t_addr_rd = r_addr + nn_addr;
+ //
+ // Block Memory Address Logic
+ //
+
+ reg [OPERAND_ADDR_WIDTH-1:0] r_addr_calc1;
+ reg [OPERAND_ADDR_WIDTH-1:0] b_addr_calc1;
+ reg [OPERAND_ADDR_WIDTH-1:0] t_addr_calc1;
+ reg [OPERAND_ADDR_WIDTH-1:0] nn_addr_calc1;
+ reg [OPERAND_ADDR_WIDTH-1:0] rr_addr_calc1;
+ reg [OPERAND_ADDR_WIDTH-1:0] rb_addr_calc1;
+ /*
+  * how to update R during CALC_1 state:
+  *   - current state is INIT_5: start from the first word;
+  *   - otherwise: advance while r_addr is below (n_num_words_latch - nn_addr),
+  *     then wrap back to the first word.
+  * This is purely combinational logic, so blocking assignments are used,
+  * the same way as in the other *_calc1 blocks of this module.
+  */
+ always @*
+     //
+     if (fsm_state == FSM_STATE_INIT_5) r_addr_calc1 = bram_addr_zero;
+     else begin
+         if (r_addr < (n_num_words_latch - nn_addr)) r_addr_calc1 = r_addr_next;
+         else r_addr_calc1 = bram_addr_zero;
+     end
+
+ /* how to update B, RR, RB during CALC_1 state */
+ always @* begin
+ // by default all three addresses keep their current values
+ b_addr_calc1 = b_addr;
+ rr_addr_calc1 = rr_addr;
+ rb_addr_calc1 = rb_addr;
+ //
+ if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5)) begin
+ // current state is INIT_5 or SAVE_5: rewind all three to the first word
+ b_addr_calc1 = bram_addr_zero;
+ rr_addr_calc1 = bram_addr_zero;
+ rb_addr_calc1 = bram_addr_zero;
+ //
+ end else if (nn_addr == bram_addr_zero) begin
+ // nn_addr is zero: step forward, holding once the last address is reached
+ b_addr_calc1 = !b_addr_done ? b_addr_next : b_addr;
+ rr_addr_calc1 = !rr_addr_done ? rr_addr_next : rr_addr;
+ rb_addr_calc1 = !rb_addr_done ? rb_addr_next : rb_addr;
+ //
+ end
+ //
+ end
+
+ /* how to update T during CALC_1 state */
+ always @*
+ // current state is INIT_5 or SAVE_5: start from the first word of T
+ if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5))
+ t_addr_calc1 = bram_addr_zero;
+ else begin
+ if (r_addr == (n_num_words_latch - nn_addr)) // same condition that advances nn_addr
+ t_addr_calc1 = nn_addr_next; // jump T to nn_addr + 1
+ else
+ t_addr_calc1 = t_addr_next; // otherwise move to the next word of T
+ end
+
+ /* how to update NN during CALC_1 state */
+ always @* begin
+ // by default the address keeps its current value
+ nn_addr_calc1 = nn_addr;
+ // current state is INIT_5 or SAVE_5: rewind to the first word
+ if ((fsm_state == FSM_STATE_INIT_5) || (fsm_state == FSM_STATE_SAVE_5))
+ nn_addr_calc1 = bram_addr_zero;
+ else if (r_addr == (n_num_words_latch - nn_addr)) // r_addr has reached its limit for this nn_addr
+ nn_addr_calc1 = nn_addr_next;
+ //
+ end
+
+
+ //
+ // Address Update Logic
+ //
always @(posedge clk) begin
//
+ // N
+ //
case (fsm_next_state)
-
FSM_STATE_INIT_1: n_addr <= bram_addr_zero;
-
+ //
FSM_STATE_INIT_2,
FSM_STATE_INIT_3,
FSM_STATE_INIT_4,
FSM_STATE_INIT_5: n_addr <= !n_addr_done ? n_addr_next : n_addr;
-
endcase
//
- case (fsm_next_state)
- FSM_STATE_INIT_4: nn_addr <= bram_addr_zero;
- FSM_STATE_INIT_5: nn_addr <= nn_addr_next;
- FSM_STATE_CALC_1:
- case (fsm_state)
- FSM_STATE_INIT_5: nn_addr <= bram_addr_zero;
- endcase
- endcase
+ // R
//
case (fsm_next_state)
FSM_STATE_INIT_4: r_addr <= bram_addr_zero;
FSM_STATE_INIT_5: r_addr <= r_addr_next;
- FSM_STATE_CALC_1: r_addr <= bram_addr_zero;
- FSM_STATE_CALC_2,
- FSM_STATE_CALC_3,
- FSM_STATE_CALC_4: r_addr <= r_addr_next;
-
+ FSM_STATE_CALC_1: r_addr <= r_addr_calc1;
+ FSM_STATE_SAVE_3: r_addr <= bram_addr_zero;
+ //
+ FSM_STATE_SAVE_4,
+ FSM_STATE_SAVE_5: r_addr <= r_addr_next;
endcase
//
+ // B
+ //
case (fsm_next_state)
-
FSM_STATE_INIT_4: b_addr <= bram_addr_zero;
-
FSM_STATE_INIT_5: b_addr <= b_addr_next;
-
+ FSM_STATE_CALC_1: b_addr <= b_addr_calc1;
+ endcase
+ //
+ // T
+ //
+ case (fsm_next_state)
+ FSM_STATE_CALC_1: t_addr <= t_addr_calc1;
+ endcase
+ //
+ // NN
+ //
+ case (fsm_next_state)
+ FSM_STATE_INIT_4: nn_addr <= bram_addr_zero;
+ FSM_STATE_INIT_5: nn_addr <= nn_addr_next;
+ FSM_STATE_CALC_1: nn_addr <= nn_addr_calc1;
+ endcase
+ //
+ // RR
+ //
+ case (fsm_next_state)
+ FSM_STATE_CALC_1: rr_addr <= rr_addr_calc1;
+ FSM_STATE_SAVE_1: rr_addr <= bram_addr_zero;
+ //
+ FSM_STATE_SAVE_2,
+ FSM_STATE_SAVE_3,
+ FSM_STATE_SAVE_4: rr_addr <= !rr_addr_done ? rr_addr_next : rr_addr;
+ endcase
+ //
+ // RB
+ //
+ case (fsm_next_state)
+ FSM_STATE_CALC_1: rb_addr <= rb_addr_calc1;
+ FSM_STATE_SAVE_1: rb_addr <= bram_addr_zero;
+ //
+ FSM_STATE_SAVE_2,
+ FSM_STATE_SAVE_3,
+ FSM_STATE_SAVE_4: rb_addr <= !rb_addr_done ? rb_addr_next : rb_addr;
+ endcase
+ //
+ // N_COEFF
+ //
+ /*
+  * N_COEFF write address: cleared when the next state is SAVE_3, then
+  * incremented on every clock edge whose next state is SAVE_4 or SAVE_5.
+  * The dedicated n_coeff_addr_next incrementer is used instead of
+  * r_addr_next, so this register no longer depends on r_addr being kept
+  * in lock-step with it.
+  */
+ case (fsm_next_state)
+     FSM_STATE_SAVE_3: n_coeff_addr <= bram_addr_zero;
+     //
+     FSM_STATE_SAVE_4,
+     FSM_STATE_SAVE_5: n_coeff_addr <= n_coeff_addr_next;
+ endcase
//
end
+ //
+ // Block Memory Write Enable Logic
+ //
always @(posedge clk) begin
//
- case (fsm_next_state)
- FSM_STATE_INIT_4,
- FSM_STATE_INIT_5: nn_wren <= 1'b1;
- default: nn_wren <= 1'b0;
- endcase
+ // R
//
- case (fsm_next_state)
+ case (fsm_next_state)
FSM_STATE_INIT_4,
- FSM_STATE_INIT_5: r_wren <= 1'b1;
+ FSM_STATE_INIT_5,
+ FSM_STATE_SAVE_3,
+ FSM_STATE_SAVE_4,
+ FSM_STATE_SAVE_5: r_wren <= 1'b1;
default: r_wren <= 1'b0;
endcase
//
+ // B
+ //
case (fsm_next_state)
FSM_STATE_INIT_4,
- FSM_STATE_INIT_5: b_wren <= 1'b1;
+ FSM_STATE_INIT_5: b_wren <= 1'b1;
+ FSM_STATE_CALC_3: b_wren <= (nn_addr == bram_addr_zero) ? 1'b1 : 1'b0;
default: b_wren <= 1'b0;
endcase
- /*
+ //
+ // T
+ //
case (fsm_next_state)
+ FSM_STATE_CALC_5: t_wren <= 1'b1;
+ default: t_wren <= 1'b0;
+ endcase
+ //
+ // NN
+ //
+ case (fsm_next_state)
+ FSM_STATE_INIT_4,
+ FSM_STATE_INIT_5: nn_wren <= 1'b1;
+ default: nn_wren <= 1'b0;
+ endcase
+ //
+ // RR
+ //
+ case (fsm_next_state)
+ FSM_STATE_CALC_5: rr_wren <= (nn_addr == bram_addr_zero) ? 1'b1 : 1'b0;
+ default: rr_wren <= 1'b0;
+ endcase
+ //
+ // RB
+ //
+ case (fsm_next_state)
+ FSM_STATE_CALC_5: rb_wren <= (nn_addr == bram_addr_zero) ? 1'b1 : 1'b0;
+ default: rb_wren <= 1'b0;
+ endcase
+ //
+ // N_COEFF
+ //
+ case (fsm_next_state)
FSM_STATE_SAVE_3,
FSM_STATE_SAVE_4,
- FSM_STATE_SAVE_5: f_wren <= cyc_cnt_done;
- default: f_wren <= 1'b0;
+ FSM_STATE_SAVE_5: n_coeff_wren <= cyc_cnt_done;
+ default: n_coeff_wren <= 1'b0;
endcase
- */
+ //
end
+
+ //
+ // Block Memory Input Logic
+ //
always @(posedge clk) begin
//
+ // R
+ //
case (fsm_next_state)
FSM_STATE_INIT_4,
- FSM_STATE_INIT_5: nn_data_in <= add_s;
- default: nn_data_in <= {32{1'bX}};
+ FSM_STATE_INIT_5: r_data_in <= {{31{1'b0}}, add1_c_in_mask_dly};
+ //
+ FSM_STATE_SAVE_3,
+ FSM_STATE_SAVE_4,
+ FSM_STATE_SAVE_5: r_data_in <= flag_update_r ? rb_data_out : rr_data_out;
+ default: r_data_in <= {32{1'bX}};
endcase
//
+ // B
+ //
case (fsm_next_state)
FSM_STATE_INIT_4,
- FSM_STATE_INIT_5: r_data_in <= {{31{1'b0}}, add_c_in_mask_dly};
- default: r_data_in <= {32{1'bX}};
+ FSM_STATE_INIT_5: b_data_in <= {{31{1'b0}}, add1_c_in_mask_dly};
+ FSM_STATE_CALC_3: b_data_in <= (nn_addr == bram_addr_zero) ?
+ {b_data_out[30:0], b_data_out_carry} : {32{1'bX}};
+ default: b_data_in <= {32{1'bX}};
endcase
//
+ // T
+ //
+ case (fsm_next_state)
+ FSM_STATE_CALC_5: t_data_in <= pe_p;
+ default: t_data_in <= {32{1'bX}};
+ endcase
+ //
+ // NN
+ //
case (fsm_next_state)
FSM_STATE_INIT_4,
- FSM_STATE_INIT_5: b_data_in <= {{31{1'b0}}, add_c_in_mask_dly};
- default: b_data_in <= {32{1'bX}};
+ FSM_STATE_INIT_5: nn_data_in <= add1_s;
+ default: nn_data_in <= {32{1'bX}};
endcase
- /*
+ //
+ // RR
+ //
case (fsm_next_state)
- FSM_STATE_CALC_3,
- FSM_STATE_CALC_4,
- FSM_STATE_CALC_5,
- FSM_STATE_CALC_6: f1_data_in <= f0_data_out_shifted;
- default: f1_data_in <= {32{1'bX}};
+ FSM_STATE_CALC_5: rr_data_in <= r_data_out;
+ default: rr_data_in <= {32{1'bX}};
endcase
//
+ // RB
+ //
case (fsm_next_state)
- FSM_STATE_CALC_5,
- FSM_STATE_CALC_6,
- FSM_STATE_CALC_7,
- FSM_STATE_CALC_8: f2_data_in <= sub_d;
- default: f2_data_in <= {32{1'bX}};
+ FSM_STATE_CALC_5: rb_data_in <= add2_s_dly[SYSTOLIC_PE_LATENCY-1];
+ default: rb_data_in <= {32{1'bX}};
endcase
//
+ // N_COEFF
+ //
case (fsm_next_state)
FSM_STATE_SAVE_3,
FSM_STATE_SAVE_4,
- FSM_STATE_SAVE_5: f_data_in <= flag_keep_f ? f1_data_out : f2_data_out;
- default: f_data_in <= {32{1'bX}};
+ FSM_STATE_SAVE_5: n_coeff_data_in <= flag_update_r ? rb_data_out : rr_data_out;
+ default: n_coeff_data_in <= {32{1'bX}};
endcase
- */
+ //
end
-
//
- // FSM Transition Logic
+ // FSM Process
//
always @(posedge clk or negedge rst_n)
//
@@ -436,6 +783,9 @@ module modexpa7_n_coeff #
else fsm_state <= fsm_next_state;
+ //
+ // FSM Transition Logic
+ //
always @* begin
//
fsm_next_state = FSM_STATE_STOP;
@@ -446,45 +796,28 @@ module modexpa7_n_coeff #
else fsm_next_state = FSM_STATE_IDLE;
FSM_STATE_INIT_1: fsm_next_state = FSM_STATE_INIT_2;
-
FSM_STATE_INIT_2: fsm_next_state = FSM_STATE_INIT_3;
-
FSM_STATE_INIT_3: fsm_next_state = FSM_STATE_INIT_4;
-
FSM_STATE_INIT_4: fsm_next_state = FSM_STATE_INIT_5;
-
FSM_STATE_INIT_5: if (nn_addr_done) fsm_next_state = FSM_STATE_CALC_1;
else fsm_next_state = FSM_STATE_INIT_5;
FSM_STATE_CALC_1: fsm_next_state = FSM_STATE_CALC_2;
-
FSM_STATE_CALC_2: fsm_next_state = FSM_STATE_CALC_3;
-
FSM_STATE_CALC_3: fsm_next_state = FSM_STATE_CALC_4;
-
- FSM_STATE_CALC_4: fsm_next_state = FSM_STATE_STOP;//FSM_STATE_CALC_5;
- /*
- FSM_STATE_CALC_5: fsm_next_state = FSM_STATE_CALC_6;
-
- FSM_STATE_CALC_6: if (f1_addr_done) fsm_next_state = FSM_STATE_CALC_7;
- else fsm_next_state = FSM_STATE_CALC_6;
-
- FSM_STATE_CALC_7: fsm_next_state = FSM_STATE_CALC_8;
-
- FSM_STATE_CALC_8: fsm_next_state = FSM_STATE_SAVE_1;
+ FSM_STATE_CALC_4: if (pe_latency_done) fsm_next_state = FSM_STATE_CALC_5;
+ else fsm_next_state = FSM_STATE_CALC_4;
+ FSM_STATE_CALC_5: if (nn_addr_done) fsm_next_state = FSM_STATE_SAVE_1;
+ else fsm_next_state = FSM_STATE_CALC_1;
FSM_STATE_SAVE_1: fsm_next_state = FSM_STATE_SAVE_2;
-
FSM_STATE_SAVE_2: fsm_next_state = FSM_STATE_SAVE_3;
-
FSM_STATE_SAVE_3: fsm_next_state = FSM_STATE_SAVE_4;
-
- FSM_STATE_SAVE_4: if (f12_addr_done_dly) fsm_next_state = FSM_STATE_SAVE_5;
+ FSM_STATE_SAVE_4: if (rb_addr_done_dly) fsm_next_state = FSM_STATE_SAVE_5;
else fsm_next_state = FSM_STATE_SAVE_4;
-
FSM_STATE_SAVE_5: if (cyc_cnt_done) fsm_next_state = FSM_STATE_STOP;
else fsm_next_state = FSM_STATE_CALC_1;
- */
+
FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
endcase
diff --git a/src/rtl/pe/modexpa7_pe_mul.v b/src/rtl/pe/modexpa7_pe_mul.v
index e56d152..ff15981 100644
--- a/src/rtl/pe/modexpa7_pe_mul.v
+++ b/src/rtl/pe/modexpa7_pe_mul.v
@@ -47,34 +47,21 @@ module modexpa7_pe_mul
output [31: 0] c_out
);
- reg [31: 0] a_reg1;
- reg [31: 0] b_reg1;
- reg [31: 0] t_reg1;
- reg [31: 0] t_reg2;
- reg [31: 0] t_reg3;
- reg [31: 0] c_reg1;
- reg [31: 0] c_reg2;
-
- reg [63: 0] ab_reg;
- reg [63: 0] abc_reg;
- reg [63: 0] abct_reg;
-
- assign p = abct_reg[31: 0];
- assign c_out = abct_reg[63:32];
-
- always @(posedge clk) begin
- a_reg1 <= a;
- b_reg1 <= b;
- c_reg1 <= c_in;
- c_reg2 <= c_reg1;
- t_reg1 <= t;
- t_reg2 <= t_reg1;
- t_reg3 <= t_reg2;
+ localparam LATENCY = 4;
- ab_reg <= {{32{1'b0}}, a_reg1} * {{32{1'b0}}, b_reg1};
- abc_reg <= ab_reg + {{32{1'b0}}, c_reg2};
- abct_reg <= abc_reg + {{32{1'b0}}, t_reg3};
- end
+ reg [63: 0] abct[1:LATENCY];
+
+ assign p = abct[LATENCY][31: 0];
+ assign c_out = abct[LATENCY][63:32];
+
+ wire [63: 0] ab = {{32{1'b0}}, a} * {{32{1'b0}}, b};
+ wire [63: 0] ct = {{32{1'b0}}, c_in} + {{32{1'b0}}, t};
+
+ integer i;
+ always @(posedge clk)
+ //
+ for (i=1; i<=LATENCY; i=i+1)
+ abct[i] <= (i == 1) ? ab + ct : abct[i-1];
endmodule