From 1fd8037d41be46d24b3610c89f781fe85def4317 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Sat, 1 Jul 2017 02:05:02 +0300 Subject: Finished modulus-dependent coefficient calculation module: * fixed bug with latency compensation * cleaned up Verilog source * added 512-bit testbench * works in simulator * synthesizes without warnings Changes: * made latency of generic processing element configurable --- src/rtl/modexpa7_factor.v | 57 --- src/rtl/modexpa7_n_coeff.v | 745 +++++++++++++++++++++++++++++----------- src/rtl/pe/modexpa7_pe_mul.v | 41 +-- src/tb/tb_factor.v | 2 +- src/tb/tb_n_coeff.v | 235 ++++++++++--- src/tb/tb_systolic_multiplier.v | 2 +- 6 files changed, 739 insertions(+), 343 deletions(-) diff --git a/src/rtl/modexpa7_factor.v b/src/rtl/modexpa7_factor.v index 17d4785..510f7af 100644 --- a/src/rtl/modexpa7_factor.v +++ b/src/rtl/modexpa7_factor.v @@ -118,63 +118,6 @@ module modexpa7_factor # localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}}; wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch; - - // - // BRAM Addresses - // - /* - reg [OPERAND_ADDR_WIDTH-1:0] f_bram_addr_reg; - - wire [OPERAND_ADDR_WIDTH-1:0] f_bram_addr_next = f_bram_addr + 1'b1; - - wire f_bram_addr_done = (f_bram_addr == bram_addr_last) ? 
1'b1 : 1'b0; - - assign f_bram_addr = f_bram_addr_reg; - - - always @(posedge clk) - // - case (fsm_next_state) - - FSM_STATE_INIT_ZERO_ADDR: f_bram_addr_reg <= bram_addr_zero; - FSM_STATE_INIT_NEXT_ADDR: f_bram_addr_reg <= f_bram_addr_next; - - endcase - - reg f_bram_en; - - assign f_bram_wr = f_bram_en; - - always @(posedge clk) - // - case (fsm_next_state) - - FSM_STATE_INIT_ZERO_ADDR, - FSM_STATE_INIT_NEXT_ADDR, - FSM_STATE_INIT_LAST_ADDR: f_bram_en <= 1'b1; - default: f_bram_en <= 1'b0; - - endcase - - - reg [31: 0] f_bram_data; - - assign f_bram_in = f_bram_data; - - always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_INIT_ZERO_ADDR: f_bram_data <= 32'd1; - FSM_STATE_INIT_NEXT_ADDR, - FSM_STATE_INIT_LAST_ADDR: f_bram_data <= 32'd0; - default: f_bram_data <= {32{1'bX}}; - - endcase - */ - - - - // // Cycle Counters // diff --git a/src/rtl/modexpa7_n_coeff.v b/src/rtl/modexpa7_n_coeff.v index 1e763ba..cba59e2 100644 --- a/src/rtl/modexpa7_n_coeff.v +++ b/src/rtl/modexpa7_n_coeff.v @@ -40,28 +40,28 @@ module modexpa7_n_coeff # ( // // This sets the address widths of memory buffers. Internal data - // width is 32 bits, so for e.g. 1024-bit operands buffers must store - // 1024 / 32 = 32 words, and these need 5-bit address bus, because - // 2 ** 5 = 32. + // width is 32 bits, so for e.g. 2048-bit operands buffers must store + // 2048 / 32 = 64 words, and these need 6-bit address bus, because + // 2 ** 6 = 64. 
// - parameter OPERAND_ADDR_WIDTH = 5 + parameter OPERAND_ADDR_WIDTH = 6 ) ( - input clk, - input rst_n, + input clk, // clock + input rst_n, // active-low reset - input ena, - output rdy, + input ena, // enable input + output rdy, // ready output - output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, - output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, + output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, // modulus memory address + output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, // modulus coefficient memory address - input [ 32-1:0] n_bram_out, + input [ 32-1:0] n_bram_out, // modulus memory output - output [ 32-1:0] n_coeff_bram_in, - output n_coeff_bram_wr, + output [ 32-1:0] n_coeff_bram_in, // modulus coefficient memory input + output n_coeff_bram_wr, // modulus coefficient memory write enable - input [OPERAND_ADDR_WIDTH-1:0] n_num_words + input [OPERAND_ADDR_WIDTH-1:0] n_num_words // number of words in modulus ); // @@ -79,191 +79,286 @@ module modexpa7_n_coeff # localparam [ 7: 0] FSM_STATE_CALC_2 = 8'hB2; localparam [ 7: 0] FSM_STATE_CALC_3 = 8'hB3; localparam [ 7: 0] FSM_STATE_CALC_4 = 8'hB4; - /* localparam [ 7: 0] FSM_STATE_CALC_5 = 8'hB5; - localparam [ 7: 0] FSM_STATE_CALC_6 = 8'hB6; - localparam [ 7: 0] FSM_STATE_CALC_7 = 8'hB7; - localparam [ 7: 0] FSM_STATE_CALC_8 = 8'hB8; localparam [ 7: 0] FSM_STATE_SAVE_1 = 8'hC1; localparam [ 7: 0] FSM_STATE_SAVE_2 = 8'hC2; localparam [ 7: 0] FSM_STATE_SAVE_3 = 8'hC3; localparam [ 7: 0] FSM_STATE_SAVE_4 = 8'hC4; localparam [ 7: 0] FSM_STATE_SAVE_5 = 8'hC5; - */ + localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; + + // + // FSM State / Next State + // reg [ 7: 0] fsm_state = FSM_STATE_IDLE; reg [ 7: 0] fsm_next_state; - // - // Enable Delay (Trigger) - // + // + // Enable Delay and Trigger + // reg ena_dly = 1'b0; - wire ena_trig = ena && !ena_dly; + + /* delay enable by one clock cycle */ always @(posedge clk) ena_dly <= ena; + + /* trigger new operation when enable goes high */ + wire ena_trig = ena && !ena_dly; + + // + // 
Ready Flag Logic + // + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; + + always @(posedge clk or negedge rst_n) + + /* reset flag */ + if (rst_n == 1'b0) rdy_reg <= 1'b1; + else begin + + /* clear flag when operation is started */ + if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; + + /* set flag after operation is finished */ + if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; + + end + // // Parameters Latch // reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch; + /* save number of words in modulus when new operation starts*/ always @(posedge clk) // if (fsm_next_state == FSM_STATE_INIT_1) n_num_words_latch <= n_num_words; - - // - // Addresses - // - localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}}; - wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch; - - /* // // Cycle Counters // - reg [OPERAND_ADDR_WIDTH+5:0] cyc_cnt; // cycle counter + reg [OPERAND_ADDR_WIDTH+4:0] cyc_cnt; - wire [OPERAND_ADDR_WIDTH+5:0] cyc_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}}; - wire [OPERAND_ADDR_WIDTH+5:0] cyc_cnt_last = {n_num_words, 1'b1, {5{1'b1}}}; - wire [OPERAND_ADDR_WIDTH+5:0] cyc_cnt_next = cyc_cnt + 1'b1; + wire [OPERAND_ADDR_WIDTH+4:0] cyc_cnt_zero = {{OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}}; + wire [OPERAND_ADDR_WIDTH+4:0] cyc_cnt_last = {n_num_words, 5'b11110}; + wire [OPERAND_ADDR_WIDTH+4:0] cyc_cnt_next = cyc_cnt + 1'b1; + /* handy flag */ wire cyc_cnt_done = (cyc_cnt == cyc_cnt_last) ? 1'b1 : 1'b0; - always @(posedge clk) // if (fsm_next_state == FSM_STATE_CALC_1) // case (fsm_state) - FSM_STATE_INIT_2: cyc_cnt <= cyc_cnt_zero; - FSM_STATE_SAVE_5: cyc_cnt <= cyc_cnt_done ? cyc_cnt : cyc_cnt_next; + FSM_STATE_INIT_5: cyc_cnt <= cyc_cnt_zero; + FSM_STATE_SAVE_5: cyc_cnt <= !cyc_cnt_done ? 
cyc_cnt_next : cyc_cnt; endcase - */ - - - - - // - // Ready Flag Logic + + // - reg rdy_reg = 1'b1; - assign rdy = rdy_reg; - - always @(posedge clk or negedge rst_n) + // Handy Address Values // - if (rst_n == 1'b0) rdy_reg <= 1'b1; - else begin - if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; - if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; - end + + /* the very first address */ + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}}; + + /* the very last address */ + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch; // // Block Memories // + + /* + * This module uses 8 block memories: + * + * N - external input, stores modulus + * R - internal, stores intermediate result + * B - internal, stores current bit mask (see high-level algorithm) + * T - internal, stores the product R * NN (see high-level algorithm) + * NN - internal, stores the quantity ~N + 1 (see high-level algorithm) + * RR - internal, stores a copy of R (see high-level algorithm) + * RB - internal, stores the sum R + B (see high-level algorithm) + * N_COEFF - external output, stores the calculated modulus-dependent coefficient + * + */ + reg [OPERAND_ADDR_WIDTH-1:0] n_addr; reg [OPERAND_ADDR_WIDTH-1:0] r_addr; - reg [OPERAND_ADDR_WIDTH-1:0] b_addr; + reg [OPERAND_ADDR_WIDTH-1:0] b_addr; + reg [OPERAND_ADDR_WIDTH-1:0] t_addr; reg [OPERAND_ADDR_WIDTH-1:0] nn_addr; - reg [OPERAND_ADDR_WIDTH-1:0] t_addr_wr; - reg [OPERAND_ADDR_WIDTH-1:0] t_addr_rd; + reg [OPERAND_ADDR_WIDTH-1:0] rr_addr; + reg [OPERAND_ADDR_WIDTH-1:0] rb_addr; + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr; reg [31: 0] r_data_in; reg [31: 0] b_data_in; - reg [31: 0] nn_data_in; reg [31: 0] t_data_in; + reg [31: 0] nn_data_in; + reg [31: 0] rr_data_in; + reg [31: 0] rb_data_in; + reg [31: 0] n_coeff_data_in; wire [31: 0] r_data_out; wire [31: 0] b_data_out; - wire [31: 0] nn_data_out; wire [31: 0] t_data_out; + wire [31: 0] nn_data_out; + wire [31: 0] rr_data_out; + wire [31: 0] rb_data_out; 
- reg r_wren; - reg b_wren; - reg nn_wren; - reg t_wren; + reg r_wren; + reg b_wren; + reg t_wren; + reg nn_wren; + reg rr_wren; + reg rb_wren; + reg n_coeff_wren; - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out)); + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out)); - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) bram_b (.clk(clk), .a_addr(b_addr), .a_wr(b_wren), .a_in(b_data_in), .a_out(b_data_out)); - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) bram_nn (.clk(clk), .a_addr(nn_addr), .a_wr(nn_wren), .a_in(nn_data_in), .a_out(nn_data_out)); - bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_t (.clk(clk), .a_addr(t_addr_wr), .a_wr(t_wren), .a_in(t_data_in), .a_out(), .b_addr(t_addr_rd), .b_out(t_data_out)); - - assign n_bram_addr = n_addr; + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_t (.clk(clk), .a_addr(t_addr), .a_wr(t_wren), .a_in(t_data_in), .a_out(t_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_rb (.clk(clk), .a_addr(rb_addr), .a_wr(rb_wren), .a_in(rb_data_in), .a_out(rb_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_rr (.clk(clk), .a_addr(rr_addr), .a_wr(rr_wren), .a_in(rr_data_in), .a_out(rr_data_out)); + + /* handy values */ + wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] t_addr_next = t_addr + 
1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] nn_addr_next = nn_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] rr_addr_next = rr_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] rb_addr_next = rb_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] nn_addr_next = nn_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] t_addr_wr_next = t_addr_wr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] t_addr_rd_next = t_addr_rd + 1'b1; + /* handy flags */ + wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire t_addr_done = (t_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire nn_addr_done = (nn_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire rr_addr_done = (rr_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire rb_addr_done = (rb_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0; - wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0; - wire r_addr_done = (r_addr == bram_addr_last) ? 1'b1 : 1'b0; - wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; - wire nn_addr_done = (nn_addr == bram_addr_last) ? 1'b1 : 1'b0; - wire t_addr_wr_done = (t_addr_wr == bram_addr_last) ? 1'b1 : 1'b0; - wire t_addr_rd_done = (t_addr_rd == bram_addr_last) ? 
1'b1 : 1'b0; + /* map top-level ports to internal register */ + assign n_bram_addr = n_addr; + assign n_coeff_bram_addr = n_coeff_addr; + assign n_coeff_bram_in = n_coeff_data_in; + assign n_coeff_bram_wr = n_coeff_wren; + + + // + // Delayed Flags + // + reg rb_addr_done_dly; + + /* delay rb_addr_done flag by one clock cycle (used later) */ + always @(posedge clk) rb_addr_done_dly <= rb_addr_done; // - // Subtractor + // Adder1 // - wire [31: 0] add_s; - wire add_c_in; - reg add_b_lsb; - reg add_c_in_mask; - reg add_c_in_mask_dly; - wire add_c_out; + + /* + * This adder is used to calculate NN = ~N + 1. + * + */ + wire [31: 0] add1_s; // sum output + wire add1_c_in; // carry input + reg add1_b_lsb; // B-input + reg add1_c_in_mask; // flag to not carry anything into the very first word + reg add1_c_in_mask_dly; // delayed carry masking flag + wire add1_c_out; // carry output - assign add_c_in = add_c_out & ~add_c_in_mask; + /* add masking into carry feedback chain */ + assign add1_c_in = add1_c_out & ~add1_c_in_mask; - always @(posedge clk) - // - add_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0; - - always @(posedge clk) - // - add_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0; + /* feed 1 into port B of adder */ + always @(posedge clk) add1_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0; + + /* mask carry for the very first word of N */ + always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0; + + /* delay carry masking flag by one clock cycle (used later) */ + always @(posedge clk) add1_c_in_mask_dly <= add1_c_in_mask; + + modexpa7_pe_add add1_inst + ( + .clk (clk), // + .ce (1'b1), + .a (~n_bram_out), // ~N + .b ({{31{1'b0}}, add1_b_lsb}), // 1 + .c_in (add1_c_in), // + .s (add1_s), // + .c_out (add1_c_out) // + ); + - always @(posedge clk) // - add_c_in_mask_dly <= add_c_in_mask; + // Adder2 + // + + /* + * This adder is used to calculate RB = R + B. 
+ * + */ + wire [31: 0] add2_s; // sum output + reg add2_c_in; // carry input + wire add2_c_out; // carry output - ip_add32 add_inst + modexpa7_pe_add add2_inst ( .clk (clk), - .a (~n_bram_out), - .b ({{31{1'b0}}, add_b_lsb}), - .c_in (add_c_in), - .s (add_s), - .c_out (add_c_out) + .ce (1'b1), + .a (r_data_out), + .b (b_data_in), + .c_in (add2_c_in), + .s (add2_s), + .c_out (add2_c_out) ); // // Multiplier // + + /* + * This multiplier is used to calculate T = R * NN. + * + */ + reg [31: 0] pe_a; reg [31: 0] pe_b; reg [31: 0] pe_t; reg [31: 0] pe_c_in; wire [31: 0] pe_p; wire [31: 0] pe_c_out; - - modexpa7_pe_mul pe2 + + modexpa7_pe_mul pe_mul_inst ( .clk (clk), .a (pe_a), @@ -274,161 +369,413 @@ module modexpa7_n_coeff # .c_out (pe_c_out) ); + + // + // Multiplier Latency Compensation Logic + // - /* + localparam SYSTOLIC_PE_LATENCY = 4; + + /* shift register to match data propagation delay */ + reg [SYSTOLIC_PE_LATENCY:0] pe_latency; + wire pe_latency_done = pe_latency[SYSTOLIC_PE_LATENCY]; + + /* gradually fill the shift register with ones */ always @(posedge clk) // - case (fsm_next_state) - FSM_STATE_CALC_2: f0_data_out_carry <= 1'b0; - FSM_STATE_CALC_3, - FSM_STATE_CALC_4, - FSM_STATE_CALC_5, - FSM_STATE_CALC_6: f0_data_out_carry <= f0_data_out[31]; - default: f0_data_out_carry <= 1'bX; - endcase - */ + if (fsm_state == FSM_STATE_CALC_1) + pe_latency <= {1'b0, {SYSTOLIC_PE_LATENCY{1'b0}}}; + else pe_latency <= {pe_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b1}; - /* - reg sub_b_out_dly1; - reg f0_data_out_carry_dly1; - reg f0_data_out_carry_dly2; + + // + // Adder2 Output Delay + // + reg [31: 0] add2_s_dly[1:SYSTOLIC_PE_LATENCY-1]; + reg add2_c_out_dly[1:SYSTOLIC_PE_LATENCY+2]; + + /* delay sum */ + integer i; + always @(posedge clk) + // + for (i=1; i