From f96ad01980fc4d0ed40f6ffb0fbb7c2006421c18 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Sun, 6 Aug 2017 21:46:35 +0300 Subject: * Moved systolic processing element array into a separate module. * Finished top-level wrapper module. --- src/rtl/modexpa7_exponentiator.v | 4 +- src/rtl/modexpa7_systolic_multiplier.v | 424 ++++++++- src/rtl/modexpa7_systolic_multiplier_array.v | 72 +- src/rtl/modexpa7_systolic_multiplier_fix.v | 1202 ------------------------ src/rtl/modexpa7_systolic_multiplier_old.v | 1260 -------------------------- src/rtl/modexpa7_wrapper.v | 130 ++- src/tb/tb_exponentiator.v | 4 +- src/tb/tb_systolic_multiplier.v | 4 +- src/tb/tb_wrapper.v | 123 ++- 9 files changed, 638 insertions(+), 2585 deletions(-) delete mode 100644 src/rtl/modexpa7_systolic_multiplier_fix.v delete mode 100644 src/rtl/modexpa7_systolic_multiplier_old.v diff --git a/src/rtl/modexpa7_exponentiator.v b/src/rtl/modexpa7_exponentiator.v index cda6882..b33360a 100644 --- a/src/rtl/modexpa7_exponentiator.v +++ b/src/rtl/modexpa7_exponentiator.v @@ -665,7 +665,7 @@ module modexpa7_exponentiator # .r_bram_in (pp_data_in), .r_bram_wr (pp_wren), - .ab_num_words (m_num_words_latch) + .n_num_words (m_num_words_latch) ); modexpa7_systolic_multiplier # @@ -695,7 +695,7 @@ module modexpa7_exponentiator # .r_bram_in (tp_data_in), .r_bram_wr (tp_wren), - .ab_num_words (m_num_words_latch) + .n_num_words (m_num_words_latch) ); diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v index 32ed543..7293998 100644 --- a/src/rtl/modexpa7_systolic_multiplier.v +++ b/src/rtl/modexpa7_systolic_multiplier.v @@ -96,14 +96,26 @@ module modexpa7_systolic_multiplier # localparam [ 7: 0] FSM_STATE_MULT_START = 8'h21; localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h22; localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h23; + + localparam [ 7: 0] FSM_STATE_ADD_START = 8'h31; + localparam [ 7: 0] FSM_STATE_ADD_CRUNCH = 8'h32; + localparam [ 7: 0] 
FSM_STATE_ADD_UNLOAD = 8'h33; + localparam [ 7: 0] FSM_STATE_SUB_UNLOAD = 8'h34; + localparam [ 7: 0] FSM_STATE_ADD_FINAL = 8'h35; + + localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h41; + localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h42; + localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h43; localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; + /* - * FSM State / Next State + * FSM State / Next State / Previous State */ reg [ 7: 0] fsm_state = FSM_STATE_IDLE; reg [ 7: 0] fsm_next_state; + reg [ 7: 0] fsm_prev_state; /* @@ -152,6 +164,31 @@ module modexpa7_systolic_multiplier # n_num_words_latch <= n_num_words; + /* + * Multiplication Phase + */ + localparam [ 1: 0] MULT_PHASE_A_B = 2'd1; + localparam [ 1: 0] MULT_PHASE_AB_N_COEFF = 2'd2; + localparam [ 1: 0] MULT_PHASE_Q_N = 2'd3; + localparam [ 1: 0] MULT_PHASE_STALL = 2'd0; + + reg [ 1: 0] mult_phase; + + wire mult_phase_done = (mult_phase == MULT_PHASE_STALL) ? 1'b1 : 1'b0; + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_LOAD_START: if (ena_trig) mult_phase <= MULT_PHASE_A_B; + FSM_STATE_MULT_FINAL: + case (mult_phase) + MULT_PHASE_A_B: mult_phase <= MULT_PHASE_AB_N_COEFF; + MULT_PHASE_AB_N_COEFF: mult_phase <= MULT_PHASE_Q_N; + MULT_PHASE_Q_N: mult_phase <= MULT_PHASE_STALL; + endcase + endcase + + /* * Counters */ @@ -258,41 +295,130 @@ module modexpa7_systolic_multiplier # wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {n_num_words_latch}; wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {n_num_words_latch, 1'b1}; - // address registers + // address registers reg [OPERAND_ADDR_WIDTH-1:0] b_addr; + reg [OPERAND_ADDR_WIDTH-1:0] n_addr; wire [OPERAND_ADDR_WIDTH :0] p_addr_ext_wr; - reg [OPERAND_ADDR_WIDTH :0] p_addr_ext_rd; + wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_wr; + reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext_rd; + wire [OPERAND_ADDR_WIDTH-1:0] q_addr_wr; + wire [OPERAND_ADDR_WIDTH-1:0] q_addr_rd; + wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_wr; + reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd; + reg 
[OPERAND_ADDR_WIDTH-1:0] s_addr; + reg [OPERAND_ADDR_WIDTH-1:0] sn_addr; + reg [OPERAND_ADDR_WIDTH-1:0] r_addr; // handy increment values - wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH :0] p_addr_ext_rd_next = b_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH :0] ab_addr_ext_rd_next = ab_addr_ext_rd + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] q_addr_rd_next = q_addr_rd + 1'b1; + wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd_next = qn_addr_ext_rd + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1; // write enables wire p_wren; + wire ab_wren; + wire q_wren; + wire qn_wren; + reg s_wren; + reg sn_wren; + reg r_wren; // data buses wire [31: 0] p_data_in; - wire [31: 0] p_data_out; + wire [31: 0] ab_data_in; + wire [31: 0] ab_data_out; + wire [31: 0] q_data_in; + wire [31: 0] q_data_out; + wire [31: 0] qn_data_in; + wire [31: 0] qn_data_out; + wire [31: 0] s_data_in; + wire [31: 0] s_data_out; + wire [31: 0] sn_data_in; + wire [31: 0] sn_data_out; + wire [31: 0] r_data_in; // handy stop flags - wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; - wire p_addr_ext_rd_done = (p_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0; + wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire ab_addr_ext_rd_done = (ab_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0; + wire q_addr_rd_done = (q_addr_rd == bram_addr_last) ? 1'b1 : 1'b0; + wire qn_addr_ext_rd_done = (qn_addr_ext_rd == bram_addr_ext_last) ? 1'b1 : 1'b0; + wire s_addr_done = (s_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire sn_addr_done = (sn_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire r_addr_done = (r_addr == bram_addr_last) ? 
1'b1 : 1'b0; // delayed addresses reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly; + reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly; + reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext_rd_dly; + reg [OPERAND_ADDR_WIDTH : 0] qn_addr_ext_rd_dly1; + reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd_dly2; + reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext_rd_dly3; - always @(posedge clk) b_addr_dly <= b_addr; - + always @(posedge clk) b_addr_dly <= b_addr; + always @(posedge clk) n_addr_dly <= n_addr; + always @(posedge clk) ab_addr_ext_rd_dly <= ab_addr_ext_rd; + always @(posedge clk) qn_addr_ext_rd_dly1 <= qn_addr_ext_rd; + always @(posedge clk) qn_addr_ext_rd_dly2 <= qn_addr_ext_rd_dly1; + always @(posedge clk) qn_addr_ext_rd_dly3 <= qn_addr_ext_rd_dly2; // map registers to top-level ports assign b_bram_addr = b_addr; + assign n_bram_addr = n_addr; + assign r_bram_addr = r_addr; + + // map + assign ab_addr_ext_wr = p_addr_ext_wr[OPERAND_ADDR_WIDTH :0]; + assign q_addr_wr = p_addr_ext_wr[OPERAND_ADDR_WIDTH-1:0]; + assign qn_addr_ext_wr = p_addr_ext_wr[OPERAND_ADDR_WIDTH :0]; + assign r_bram_wr = r_wren; + + assign ab_data_in = p_data_in; + assign q_data_in = p_data_in; + assign qn_data_in = p_data_in; + assign r_bram_in = r_data_in; + + assign ab_wren = p_wren && (mult_phase == MULT_PHASE_A_B); + assign q_wren = p_wren && (mult_phase == MULT_PHASE_AB_N_COEFF); + assign qn_wren = p_wren && (mult_phase == MULT_PHASE_Q_N); + + + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_ab + ( .clk(clk), + .a_addr(ab_addr_ext_wr), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(), + .b_addr(ab_addr_ext_rd), .b_out(ab_data_out) + ); + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_q + ( .clk(clk), + .a_addr(q_addr_wr), .a_wr(q_wren), .a_in(q_data_in), .a_out(), + .b_addr(q_addr_rd), .b_out(q_data_out) + ); bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) - bram_p + bram_qn ( .clk(clk), - .a_addr(p_addr_ext_wr), 
.a_wr(p_wren), .a_in(p_data_in), .a_out(), - .b_addr(p_addr_ext_rd), .b_out(p_data_out) + .a_addr(qn_addr_ext_wr), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(), + .b_addr(qn_addr_ext_rd), .b_out(qn_data_out) + ); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_s + ( .clk(clk), + .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out) + ); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_sn + ( .clk(clk), + .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out) ); @@ -308,10 +434,24 @@ module modexpa7_systolic_multiplier # // FSM_STATE_LOAD_SHIFT: begin - // update the rightmost part of loader buffer - loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}}; + // update the rightmost part of loader buffer + case (mult_phase) - // shift the loader buffer to the left + MULT_PHASE_A_B: + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= + (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}}; + + MULT_PHASE_AB_N_COEFF: + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= + (ab_addr_ext_rd_dly <= {1'b0, bram_addr_last}) ? ab_data_out : {32{1'b0}}; + + MULT_PHASE_Q_N: + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= + (n_addr_dly <= bram_addr_last) ? 
n_bram_out : {32{1'b0}}; + + endcase + + // shift the loader buffer to the left for (j=1; j {1'b0, bram_addr_last}) n_addr <= n_addr_next; + end + endcase // end @@ -378,10 +562,27 @@ module modexpa7_systolic_multiplier # always @(posedge clk) // - case (fsm_next_state) - FSM_STATE_MULT_START: p_num_words_latch <= {n_num_words_latch, 1'b1}; - endcase + if (fsm_next_state == FSM_STATE_MULT_START) + // + case (mult_phase) + MULT_PHASE_A_B: p_num_words_latch <= {n_num_words_latch, 1'b1}; + MULT_PHASE_AB_N_COEFF: p_num_words_latch <= {1'b0, n_num_words_latch}; + MULT_PHASE_Q_N: p_num_words_latch <= {n_num_words_latch, 1'b1}; + endcase + assign n_coeff_bram_addr = a_bram_addr; + assign q_addr_rd = a_bram_addr; + + reg [31: 0] a_data_out; + + always @* + // + case (mult_phase) + MULT_PHASE_A_B: a_data_out = a_bram_out; + MULT_PHASE_AB_N_COEFF: a_data_out = n_coeff_bram_out; + MULT_PHASE_Q_N: a_data_out = q_data_out; + default: a_data_out = {32{1'bX}}; + endcase modexpa7_systolic_multiplier_array # ( @@ -398,7 +599,7 @@ module modexpa7_systolic_multiplier # .loader_addr_rd (loader_addr_rd), - .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}), + .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_data_out}}), .pe_b_wide (pe_b_wide), .a_bram_addr (a_bram_addr), @@ -411,24 +612,174 @@ module modexpa7_systolic_multiplier # .p_num_words (p_num_words_latch) ); + /* + * Adder + */ + + reg add1_ce; // clock enable + wire [31: 0] add1_s; // sum output + wire add1_c_in; // carry input + wire [31: 0] add1_a; // A-input + wire [31: 0] add1_b; // B-input + reg add1_c_in_mask; // flag to not carry anything into the very first word + wire add1_c_out; // carry output + modexpa7_adder32 add1_inst + ( + .clk (clk), + .ce (add1_ce), + .a (add1_a), + .b (add1_b), + .c_in (add1_c_in), + .s (add1_s), + .c_out (add1_c_out) + ); + /* + * Subtractor + */ + reg sub1_ce; // clock enable + wire [31: 0] sub1_d; // difference output + wire sub1_b_in; // borrow input + wire [31: 0] sub1_a; // A-input + wire [31: 0] 
sub1_b; // B-input + reg sub1_b_in_mask; // flag to not borrow anything from the very first word + wire sub1_b_out; // borrow output + modexpa7_subtractor32 sub1_inst + ( + .clk (clk), + .ce (sub1_ce), + .a (sub1_a), + .b (sub1_b), + .b_in (sub1_b_in), + .d (sub1_d), + .b_out (sub1_b_out) + ); + + // add masking into carry feedback chain + assign add1_c_in = add1_c_out & ~add1_c_in_mask; + // add masking into borrow feedback chain + assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; + + // mask carry for the very first words of AB and QN + always @(posedge clk) + // + add1_c_in_mask <= (fsm_state == FSM_STATE_ADD_START) ? 1'b1 : 1'b0; + // mask borrow for the very first words of S and N + always @(posedge clk) + // + sub1_b_in_mask <= add1_c_in_mask; + + + // map adder inputs + assign add1_a = ab_data_out; + assign add1_b = qn_data_out; + + // map subtractor inputs + assign sub1_a = add1_s; + assign sub1_b = (qn_addr_ext_rd_dly2 <= {1'b0, bram_addr_last}) ? 32'd0 : n_bram_out; + + // clock enable + always @(posedge clk) begin + // + case (fsm_state) + FSM_STATE_ADD_START, + FSM_STATE_ADD_CRUNCH: add1_ce <= 1'b1; + default: add1_ce <= 1'b0; + endcase + // + sub1_ce <= add1_ce; + // + end + + // map outputs + assign s_data_in = add1_s; + assign sn_data_in = sub1_d; + + // write enabled + always @(posedge clk) begin + // + case (fsm_state) + FSM_STATE_ADD_CRUNCH, + FSM_STATE_ADD_UNLOAD: s_wren <= 1'b1; + default: s_wren <= 1'b0; + endcase + // + case (fsm_state) + FSM_STATE_ADD_CRUNCH, + FSM_STATE_ADD_UNLOAD, + FSM_STATE_SUB_UNLOAD, + FSM_STATE_ADD_FINAL: sn_wren <= s_wren; + default: sn_wren <= 1'b0; + endcase + // + case (fsm_state) + FSM_STATE_SAVE_START, + FSM_STATE_SAVE_WRITE: r_wren <= 1'b1; + default: r_wren <= 1'b0; + endcase + // + end + + // ... 
+ always @(posedge clk) begin + // + case (fsm_state) + FSM_STATE_ADD_CRUNCH, + FSM_STATE_ADD_UNLOAD: begin + if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_zero}) s_addr <= bram_addr_zero; + else if (qn_addr_ext_rd_dly2 > {1'b0, bram_addr_last}) s_addr <= s_addr_next; + end + FSM_STATE_ADD_FINAL: s_addr <= bram_addr_zero; + FSM_STATE_SAVE_START, + FSM_STATE_SAVE_WRITE: s_addr <= s_addr_next; + endcase + // + case (fsm_state) + FSM_STATE_ADD_CRUNCH, + FSM_STATE_ADD_UNLOAD, + FSM_STATE_SUB_UNLOAD: begin + if (qn_addr_ext_rd_dly2 == {1'b0, bram_addr_zero}) sn_addr <= bram_addr_zero; + else if (qn_addr_ext_rd_dly3 > {1'b0, bram_addr_last}) sn_addr <= sn_addr_next; + end + FSM_STATE_ADD_FINAL: sn_addr <= bram_addr_zero; + FSM_STATE_SAVE_START, + FSM_STATE_SAVE_WRITE: sn_addr <= sn_addr_next; + endcase + // + case (fsm_state) + FSM_STATE_SAVE_START: r_addr <= bram_addr_zero; + FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next; + endcase + // + end + + + /* + * Flag Update Logic + */ + always @(posedge clk) + // + if (fsm_state == FSM_STATE_ADD_FINAL) + flag_select_s <= sub1_b_out & ~add1_c_out; - - /* * FSM Process - - */ + */ always @(posedge clk or negedge rst_n) - // + // if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; - else fsm_state <= fsm_next_state; + else fsm_state <= fsm_next_state; + + always @(posedge clk) + // + fsm_prev_state <= fsm_state; /* @@ -453,7 +804,20 @@ module modexpa7_systolic_multiplier # FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH; FSM_STATE_MULT_CRUNCH: if (pe_array_rdy) fsm_next_state = FSM_STATE_MULT_FINAL; else fsm_next_state = FSM_STATE_MULT_CRUNCH; - FSM_STATE_MULT_FINAL: fsm_next_state = FSM_STATE_STOP; + FSM_STATE_MULT_FINAL: if (mult_phase_done) fsm_next_state = FSM_STATE_ADD_START; + else fsm_next_state = FSM_STATE_LOAD_START; + // + FSM_STATE_ADD_START: fsm_next_state = FSM_STATE_ADD_CRUNCH; + FSM_STATE_ADD_CRUNCH: if (ab_addr_ext_rd_done) fsm_next_state = FSM_STATE_ADD_UNLOAD; + else fsm_next_state = FSM_STATE_ADD_CRUNCH; + 
FSM_STATE_ADD_UNLOAD: fsm_next_state = FSM_STATE_SUB_UNLOAD; + FSM_STATE_SUB_UNLOAD: fsm_next_state = FSM_STATE_ADD_FINAL; + FSM_STATE_ADD_FINAL: fsm_next_state = FSM_STATE_SAVE_START; + // + FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE; + FSM_STATE_SAVE_WRITE: if (s_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL; + else fsm_next_state = FSM_STATE_SAVE_WRITE; + FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP; // FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; // diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v index 22d5aaf..754203d 100644 --- a/src/rtl/modexpa7_systolic_multiplier_array.v +++ b/src/rtl/modexpa7_systolic_multiplier_array.v @@ -195,11 +195,15 @@ module modexpa7_systolic_multiplier_array # wire shreg_done_load = shreg_load[syst_cnt_last]; wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; - wire shreg_done_unload = shreg_unload[syst_cnt_last]; - + wire shreg_done_unload = shreg_unload[syst_cnt_last]; + reg shreg_now_loading; reg shreg_now_latency; reg shreg_now_unloading; + + reg shreg_done_latency_dly; + always @(posedge clk) + shreg_done_latency_dly <= shreg_done_latency; always @(posedge clk) // @@ -257,17 +261,22 @@ module modexpa7_systolic_multiplier_array # reg fifo_c_rst; reg fifo_t_rst; - wire fifo_c_wren; + reg fifo_c_wren; wire fifo_c_rden; - wire fifo_t_wren; + reg fifo_t_wren; wire fifo_t_rden; - wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din; + reg [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_din; wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_c_dout; wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_din; wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] fifo_t_dout; + + wire [32 * 1 - 1 : 0] fifo_t_din_msb; + reg [32 * (SYSTOLIC_ARRAY_LENGTH - 1) - 1 : 0] fifo_t_din_lsb; + + assign fifo_t_din = {fifo_t_din_msb, fifo_t_din_lsb}; modexpa7_simple_fifo # ( @@ -317,10 +326,26 @@ module modexpa7_systolic_multiplier_array # // assign pe_c_in[i] = 
fifo_c_dout[32 * (i + 1) - 1 -: 32]; assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32]; + // + always @(posedge clk) + fifo_c_din[32 * (i + 1) - 1 -: 32] <= pe_c_out[i]; + // + end + // + endgenerate + + generate for (i=1; i {1'b0, bram_addr_last}) - n_addr <= n_addr_next; - - endcase - // - case (fsm_state) - FSM_STATE_SAVE_START: r_addr <= bram_addr_zero; - FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next; - endcase - // - case (fsm_next_state) - FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero; - FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr; - endcase - // - end - - - // - // Internal Memories - // - - /* memory inputs */ - reg [31: 0] ab_data_in; - reg [31: 0] q_data_in; - reg [31: 0] qn_data_in; - wire [31: 0] s_data_in; - wire [31: 0] sn_data_in; - reg [31: 0] r_data_in; - - /* memory outputs */ - wire [31: 0] ab_data_out; - wire [31: 0] q_data_out; - wire [31: 0] qn_data_out; - wire [31: 0] s_data_out; - wire [31: 0] sn_data_out; - - /* write enables */ - reg ab_wren; - reg q_wren; - reg qn_wren; - reg s_wren; - reg sn_wren; - reg r_wren; - - /* map */ - assign r_bram_in = r_data_in; - assign r_bram_wr = r_wren; - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) - bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) - bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), 
.a_in(sn_data_in), .a_out(sn_data_out)); - - - // - // Wide Operand Loader - // - integer j; - - /* shift logic */ - always @(posedge clk) - // - case (fsm_state) - // - FSM_STATE_LOAD_B_SHIFT: begin - - /* update the rightmost part of loader buffer */ - loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}}; - - /* shift the loader buffer to the left */ - for (j=1; j {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin - s_addr <= s_addr_next; - sn_addr <= sn_addr_next; - end - - if (qn_addr_ext == bram_addr_ext_last) begin - s_addr <= bram_addr_zero; - sn_addr <= bram_addr_zero; - end - - end - - FSM_STATE_MULT_Q_N_FINAL, - FSM_STATE_SAVE_START, - FSM_STATE_SAVE_WRITE: begin - s_addr <= !s_addr_done ? s_addr_next : s_addr; - sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr; - end - - endcase - - // - case (fsm_next_state) - FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero; - FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next; - endcase - // - case (fsm_next_state) - FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero; - FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr; - endcase - - // - end - - always @(posedge clk) begin - // - if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin - ab_wren <= shreg_done_latency_dly; - ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; - end else begin - ab_wren <= 1'b0; - ab_data_in <= 32'hXXXXXXXX; - end - // - if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin - q_wren <= shreg_done_latency_dly; - q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; - end else begin - q_wren <= 1'b0; - q_data_in <= 32'hXXXXXXXX; - end - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin - qn_wren <= shreg_done_latency_dly; - qn_data_in <= shreg_done_latency_dly ? 
pe_p[0] : 32'hXXXXXXXX; - end else begin - qn_wren <= 1'b0; - qn_data_in <= 32'hXXXXXXXX; - end - // - case (fsm_state) - FSM_STATE_SAVE_START: r_wren <= 1'b1; - FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done; - default: r_wren <= 1'b0; - endcase - // - end - - - always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_MULT_A_B_START, - FSM_STATE_MULT_AB_N_COEFF_START, - FSM_STATE_MULT_Q_N_START, - FSM_STATE_MULT_A_B_RELOAD, - FSM_STATE_MULT_AB_N_COEFF_RELOAD, - FSM_STATE_MULT_Q_N_RELOAD: - // - syst_cnt_load <= syst_cnt_zero; - - FSM_STATE_MULT_A_B_CRUNCH, - FSM_STATE_MULT_AB_N_COEFF_CRUNCH, - FSM_STATE_MULT_Q_N_CRUNCH: - // - syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; - - endcase - - - - always @(posedge clk) - // - case (fsm_state) - FSM_STATE_MULT_A_B_CRUNCH, - FSM_STATE_MULT_AB_N_COEFF_CRUNCH, - FSM_STATE_MULT_Q_N_CRUNCH: begin - - if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; - else if (shreg_now_unloading) - syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; - - end - endcase - - - // - // T and C_IN can be moved to a separate code block - // - always @(posedge clk) begin - // - if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) - // - for (j=0; j {1'b0, a_addr}) ? 32'd0 : a_bram_out; - pe_b[j] <= loader_dout[j]; - //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; - //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; - end else begin - pe_a[j] <= 32'hXXXXXXXX; - pe_b[j] <= 32'hXXXXXXXX; - //pe_t[j] <= 32'hXXXXXXXX; - //pe_c_in[j] <= 32'hXXXXXXXX; - end - // - if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) - // - for (j=0; j {1'b0, q_addr}) ? 32'd0 : q_data_out; - pe_b[j] <= loader_dout[j]; - //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; - //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 
32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; - end else begin - pe_a[j] <= 32'hXXXXXXXX; - pe_b[j] <= 32'hXXXXXXXX; - //pe_t[j] <= 32'hXXXXXXXX; - //pe_c_in[j] <= 32'hXXXXXXXX; - end - // - - // - end - - - // - // Adder - // - /* - * This adder is used to calculate S = AB + QN. - * - */ - reg add1_ce; // clock enable - reg [31: 0] add1_s; // sum output - wire add1_c_in; // carry input - wire [31: 0] add1_a; // A-input - reg [31: 0] add1_b; // B-input - reg add1_c_in_mask; // flag to not carry anything into the very first word - reg add1_c_out; // carry output - - /* add masking into carry feedback chain */ - assign add1_c_in = add1_c_out & ~add1_c_in_mask; - - /* mask carry for the very first word of N */ - //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0; - - always @(posedge clk) - // - if (add1_ce) - // - {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in}; - - assign add1_a = qn_data_in; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX; - else - add1_b <= 32'hXXXXXXXX; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0; - else - add1_c_in_mask <= 1'b0; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - add1_ce <= shreg_done_latency_dly; - else - add1_ce <= 1'b0; - - - assign s_data_in = add1_s; - assign sn_data_in = sub1_d; - - always @(posedge clk) begin - // - s_wren <= add1_ce; - sn_wren <= sub1_ce; - end - - - - // - // Subtractor - // - /* - * This subtractor is used to calculate SN = S - N. 
- * - */ - reg sub1_ce; // clock enable - reg [31: 0] sub1_d; // difference output - wire sub1_b_in; // borrow input - wire [31: 0] sub1_a; // A-input - reg [31: 0] sub1_b; // B-input - reg sub1_b_in_mask; // flag to not borrow anything from the very first word - reg sub1_b_out; // borrow output - - /* add masking into borrow feedback chain */ - assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; - - always @(posedge clk) - // - if (sub1_ce) - // - {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in}; - - assign sub1_a = add1_s; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX; - else - sub1_b <= 32'hXXXXXXXX; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0; - else - sub1_b_in_mask <= 1'b0; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr}); - else - sub1_ce <= 1'b0; - - - assign s_data_in = add1_s; - - always @(posedge clk) - // - s_wren <= add1_ce; - - - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_FINAL) - flag_select_s <= sub1_b_out & ~add1_c_out; - - - always @(posedge clk) - // - case (fsm_state) - FSM_STATE_SAVE_START, - FSM_STATE_SAVE_WRITE: - r_data_in <= flag_select_s ? 
s_data_out : sn_data_out; - endcase - - - - // - // FSM Process - // - always @(posedge clk or negedge rst_n) - // - if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; - else fsm_state <= fsm_next_state; - - - // - // FSM Transition Logic - // - always @* begin - // - fsm_next_state = FSM_STATE_STOP; - // - case (fsm_state) - - FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START; - else fsm_next_state = FSM_STATE_IDLE; - // - FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT; - FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE; - else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; - FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL; - else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; - FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START; - // - FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; - FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE; - else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; - FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL; - else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; - FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START; - // - FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT; - FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE; - else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; - FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL; - else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; - FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START; - // - FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; - FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD; - else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; - FSM_STATE_MULT_A_B_RELOAD: if 
(ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL; - else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; - FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START; - // - FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; - FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD; - else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; - FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL; - else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; - FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START; - // - FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; - FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD; - else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; - FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL; - else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; - FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START; - // - FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE; - FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL; - else fsm_next_state = FSM_STATE_SAVE_WRITE; - FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP; - // - FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; - - endcase - // - end - - -endmodule - -//====================================================================== -// End of file -//====================================================================== diff --git a/src/rtl/modexpa7_systolic_multiplier_old.v b/src/rtl/modexpa7_systolic_multiplier_old.v deleted file mode 100644 index 8b00370..0000000 --- a/src/rtl/modexpa7_systolic_multiplier_old.v +++ /dev/null @@ -1,1260 +0,0 @@ -//====================================================================== -// -// modexpa7_systolic_multiplier.v -// 
----------------------------------------------------------------------------- -// Systolic Montgomery multiplier. -// -// Authors: Pavel Shatov -// -// Copyright (c) 2017, NORDUnet A/S All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// - Neither the name of the NORDUnet nor the names of its contributors may -// be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -//====================================================================== - -module modexpa7_systolic_multiplier # - ( - // - // This sets the address widths of memory buffers. Internal data - // width is 32 bits, so for e.g. 
2048-bit operands buffers must store - // 2048 / 32 = 64 words, and these need 6-bit address bus, because - // 2 ** 6 = 64. - // - parameter OPERAND_ADDR_WIDTH = 4, - - // - // Explain. - // - parameter SYSTOLIC_ARRAY_POWER = 1 - ) - ( - input clk, - input rst_n, - - input ena, - output rdy, - - output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, - output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, - output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, - output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, - output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, - - input [ 32-1:0] a_bram_out, - input [ 32-1:0] b_bram_out, - input [ 32-1:0] n_bram_out, - input [ 32-1:0] n_coeff_bram_out, - - output [ 32-1:0] r_bram_in, - output r_bram_wr, - - input [OPERAND_ADDR_WIDTH-1:0] ab_num_words - ); - - - // - // Include Settings - // - `include "pe/modexpa7_primitive_switch.v" - `include "modexpa7_settings.v" - - - // - // FSM Declaration - // - localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; - - localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11; - localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12; - localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13; - localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14; - - localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21; - localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22; - localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23; - localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24; - - localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31; - localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32; - localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33; - localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34; - - localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41; - localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42; - localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43; - localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44; - - localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51; - localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52; - localparam [ 7: 0] 
FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53; - localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54; - - localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61; - localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62; - localparam [ 7: 0] FSM_STATE_MULT_Q_N_ADD_S = 8'h63; - localparam [ 7: 0] FSM_STATE_MULT_Q_N_SUB_SN = 8'h64; - localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h65; - localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h66; - - localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71; - localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72; - localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73; - - localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; - - // - // FSM State / Next State - // - reg [ 7: 0] fsm_state = FSM_STATE_IDLE; - reg [ 7: 0] fsm_next_state; - - - // - // Enable Delay and Trigger - // - reg ena_dly = 1'b0; - - /* delay enable by one clock cycle */ - always @(posedge clk) ena_dly <= ena; - - /* trigger new operation when enable goes high */ - wire ena_trig = ena && !ena_dly; - - - // - // Ready Flag Logic - // - reg rdy_reg = 1'b1; - assign rdy = rdy_reg; - - always @(posedge clk or negedge rst_n) - - /* reset flag */ - if (rst_n == 1'b0) rdy_reg <= 1'b1; - else begin - - /* clear flag when operation is started */ - if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; - - /* set flag after operation is finished */ - if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; - - end - - - // - // Parameters Latch - // - reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch; - - /* save number of words in a and b when new operation starts */ - always @(posedge clk) - // - if (fsm_next_state == FSM_STATE_LOAD_B_START) - ab_num_words_latch <= ab_num_words; - - - // - // Systolic Cycle Counters - // - - /* handy values */ - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; - - /* counters */ - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init; - reg 
[SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; - - /* handy increment values */ - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; - - /* handy stop flags */ - wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0; - wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; - wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; - - /* delayed load counter */ - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly; - always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load; - - - // - // Multiplier Iteration Counter - // - - /* handy values */ - wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}}; - wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}}; - - /* counter */ - reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt; - - /* handy increment value and stop flag */ - wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1; - wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0; - - - // - // Initialization Counter Control Logic - // - always @(posedge clk) begin - // - case (fsm_state) - FSM_STATE_LOAD_B_START, - FSM_STATE_LOAD_N_COEFF_START, - FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero; - - FSM_STATE_LOAD_B_SHIFT, - FSM_STATE_LOAD_N_COEFF_SHIFT, - FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next; - endcase - // - case (fsm_state) - FSM_STATE_LOAD_B_START, - FSM_STATE_LOAD_N_COEFF_START, - FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero; - - FSM_STATE_LOAD_B_WRITE, - FSM_STATE_LOAD_N_COEFF_WRITE, - FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init; - endcase - // - end - - - // - // Operand Loader - // - - /* - * Explain how parallelized loader works here... 
- * - */ - - /* loader banks */ - localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0; - localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1; - localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2; - - /* loader input */ - reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1]; - reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1]; - reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1]; - reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1]; - - /* loader output */ - wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1]; - - /* generate parallelized loader */ - - // - // Loader currently stores B, N_COEFF and N, it can be coded another way - // to initially store B, then AB, then Q. Some memory can be saved thay way. - // Maybe later... - // - - genvar i; - generate for (i=0; i {1'b0, bram_addr_last}) - n_addr <= n_addr_next; - - endcase - // - case (fsm_state) - FSM_STATE_SAVE_START: r_addr <= bram_addr_zero; - FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next; - endcase - // - case (fsm_next_state) - FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero; - FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? 
a_addr_next : a_addr; - endcase - // - end - - - // - // Internal Memories - // - - /* memory inputs */ - reg [31: 0] ab_data_in; - reg [31: 0] q_data_in; - reg [31: 0] qn_data_in; - wire [31: 0] s_data_in; - wire [31: 0] sn_data_in; - reg [31: 0] r_data_in; - - /* memory outputs */ - wire [31: 0] ab_data_out; - wire [31: 0] q_data_out; - wire [31: 0] qn_data_out; - wire [31: 0] s_data_out; - wire [31: 0] sn_data_out; - - /* write enables */ - reg ab_wren; - reg q_wren; - reg qn_wren; - reg s_wren; - reg sn_wren; - reg r_wren; - - /* map */ - assign r_bram_in = r_data_in; - assign r_bram_wr = r_wren; - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) - bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) - bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)); - - bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out)); - - - // - // Wide Operand Loader - // - integer j; - - /* shift logic */ - always @(posedge clk) - // - case (fsm_state) - // - FSM_STATE_LOAD_B_SHIFT: begin - - /* update the rightmost part of loader buffer */ - loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? 
b_bram_out : {32{1'b0}}; - - /* shift the loader buffer to the left */ - for (j=1; j {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin - s_addr <= s_addr_next; - sn_addr <= sn_addr_next; - end - // - if (qn_addr_ext == bram_addr_ext_last) begin - s_addr <= bram_addr_zero; - sn_addr <= bram_addr_zero; - end - // - end - // - /* - case (fsm_state) - - FSM_STATE_MULT_Q_N_RELOAD: begin - if (qn_addr_ext == {1'b0, bram_addr_last}) begin - s_addr <= bram_addr_zero; - sn_addr <= bram_addr_zero; - end - - if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin - s_addr <= s_addr_next; - sn_addr <= sn_addr_next; - end - - if (qn_addr_ext == bram_addr_ext_last) begin - s_addr <= bram_addr_zero; - sn_addr <= bram_addr_zero; - end - - end - - FSM_STATE_MULT_Q_N_FINAL, - FSM_STATE_SAVE_START, - FSM_STATE_SAVE_WRITE: begin - s_addr <= !s_addr_done ? s_addr_next : s_addr; - sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr; - end - */ - endcase - - // - case (fsm_next_state) - FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero; - FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next; - endcase - // - case (fsm_next_state) - FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero; - FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr; - endcase - - // - end - - always @(posedge clk) begin - // - if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin - ab_wren <= shreg_done_latency_dly; - ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; - end else begin - ab_wren <= 1'b0; - ab_data_in <= 32'hXXXXXXXX; - end - // - if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin - q_wren <= shreg_done_latency_dly; - q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; - end else begin - q_wren <= 1'b0; - q_data_in <= 32'hXXXXXXXX; - end - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin - qn_wren <= shreg_done_latency_dly; - qn_data_in <= shreg_done_latency_dly ? 
pe_p[0] : 32'hXXXXXXXX; - end else begin - qn_wren <= 1'b0; - qn_data_in <= 32'hXXXXXXXX; - end - // - case (fsm_state) - FSM_STATE_SAVE_START: r_wren <= 1'b1; - FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done; - default: r_wren <= 1'b0; - endcase - // - end - - - always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_MULT_A_B_START, - FSM_STATE_MULT_AB_N_COEFF_START, - FSM_STATE_MULT_Q_N_START, - FSM_STATE_MULT_A_B_RELOAD, - FSM_STATE_MULT_AB_N_COEFF_RELOAD, - FSM_STATE_MULT_Q_N_RELOAD: - // - syst_cnt_load <= syst_cnt_zero; - - FSM_STATE_MULT_A_B_CRUNCH, - FSM_STATE_MULT_AB_N_COEFF_CRUNCH, - FSM_STATE_MULT_Q_N_CRUNCH: - // - syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; - - endcase - - - - always @(posedge clk) - // - case (fsm_state) - FSM_STATE_MULT_A_B_CRUNCH, - FSM_STATE_MULT_AB_N_COEFF_CRUNCH, - FSM_STATE_MULT_Q_N_CRUNCH: begin - - if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; - else if (shreg_now_unloading) - syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; - - end - endcase - - - // - // T and C_IN can be moved to a separate code block - // - always @(posedge clk) begin - // - if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) - // - for (j=0; j {1'b0, a_addr}) ? 32'd0 : a_bram_out; - pe_b[j] <= loader_dout[j]; - //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; - //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; - end else begin - pe_a[j] <= 32'hXXXXXXXX; - pe_b[j] <= 32'hXXXXXXXX; - //pe_t[j] <= 32'hXXXXXXXX; - //pe_c_in[j] <= 32'hXXXXXXXX; - end - // - if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) - // - for (j=0; j {1'b0, q_addr}) ? 32'd0 : q_data_out; - pe_b[j] <= loader_dout[j]; - //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; - //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 
32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; - end else begin - pe_a[j] <= 32'hXXXXXXXX; - pe_b[j] <= 32'hXXXXXXXX; - //pe_t[j] <= 32'hXXXXXXXX; - //pe_c_in[j] <= 32'hXXXXXXXX; - end - // - - // - end - - - // - // Adder - // - - reg add1_ce; // clock enable - wire [31: 0] add1_s; // sum output - wire add1_c_in; // carry input - reg [31: 0] add1_a; // A-input - reg [31: 0] add1_b; // B-input - reg add1_c_in_mask; // flag to not carry anything into the very first word - wire add1_c_out; // carry output - - // add masking into carry feedback chain - assign add1_c_in = add1_c_out & ~add1_c_in_mask; - - // mask carry for the very first word of N - always @(posedge clk) - // - if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) - add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 1'b1 : 1'b0; - - modexpa7_adder32 add1_inst - ( - .clk (clk), - .ce (add1_ce), - .a (add1_a), - .b (add1_b), - .c_in (add1_c_in), - .s (add1_s), - .c_out (add1_c_out) - ); - - always @(posedge clk) - // - add1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_ADD_S) ? 1'b1 : 1'b0; - - always @(posedge clk) - // - if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin - add1_a <= pe_p[0]; - add1_b <= ab_data_out; - end - - - // - // Subtractor - // - /* - * This subtractor is used to calculate SN = S - N. - * - */ - - reg sub1_ce; // clock enable - wire [31: 0] sub1_d; // difference output - wire sub1_b_in; // borrow input - reg [31: 0] sub1_a; // A-input - reg [31: 0] sub1_b; // B-input - reg sub1_b_in_mask; // flag to not borrow anything from the very first word - wire sub1_b_out; // borrow output - - // add masking into borrow feedback chain - assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; - - // mask carry for the very first word of N TODO! - //always @(posedge clk) - // - //if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) - //add1_c_in_mask <= (ab_addr_ext == bram_addr_ext_zero) ? 
1'b1 : 1'b0; - - modexpa7_subtractor32 sub1_inst - ( - .clk (clk), - .ce (sub1_ce), - .a (sub1_a), - .b (sub1_b), - .b_in (sub1_b_in), - .d (sub1_d), - .b_out (sub1_b_out) - ); - - always @(posedge clk) - // - sub1_ce <= (fsm_next_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr}) ? 1'b1 : 1'b0; - - always @* - sub1_a = add1_s; - - always @(posedge clk) - // - //if ((fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) && shreg_done_latency_dly) begin - //add1_a <= pe_p[0]; - //add1_b <= ab_data_out; - //end - - - /* - reg sub1_ce; // clock enable - reg [31: 0] sub1_d; // difference output - wire sub1_b_in; // borrow input - wire [31: 0] sub1_a; // A-input - reg [31: 0] sub1_b; // B-input - reg sub1_b_in_mask; // flag to not borrow anything from the very first word*/ -// wire sub1_b_out; // borrow output - /* - - // add masking into borrow feedback chain - assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; - - always @(posedge clk) - // - if (sub1_ce) - // - {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in}; - - assign sub1_a = add1_s; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX; - else - sub1_b <= 32'hXXXXXXXX; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0; - else - sub1_b_in_mask <= 1'b0; - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) - sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr}); - else - sub1_ce <= 1'b0; - */ - - - assign s_data_in = add1_s; - assign sn_data_in = sub1_d; - - always @(posedge clk) begin - // - s_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_ADD_S) && (qn_addr_ext > {1'b0, q_addr})) ? 1'b1 : 1'b0; - sn_wren <= ((fsm_state == FSM_STATE_MULT_Q_N_SUB_SN) && (qn_addr_ext > {1'b0, q_addr})) ? 
1'b1 : 1'b0; - // - end - - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_Q_N_FINAL) - flag_select_s <= sub1_b_out & ~add1_c_out; - - - always @(posedge clk) - // - case (fsm_state) - FSM_STATE_SAVE_START, - FSM_STATE_SAVE_WRITE: - r_data_in <= flag_select_s ? s_data_out : sn_data_out; - endcase - - - - // - // FSM Process - // - always @(posedge clk or negedge rst_n) - // - if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; - else fsm_state <= fsm_next_state; - - - // - // FSM Transition Logic - // - always @* begin - // - fsm_next_state = FSM_STATE_STOP; - // - case (fsm_state) - - FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START; - else fsm_next_state = FSM_STATE_IDLE; - // - FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT; - FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE; - else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; - FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL; - else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; - FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START; - // - FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; - FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE; - else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; - FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL; - else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; - FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START; - // - FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT; - FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE; - else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; - FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL; - else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; - FSM_STATE_LOAD_N_FINAL: fsm_next_state = 
FSM_STATE_MULT_A_B_START; - // - FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; - FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD; - else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; - FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL; - else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; - FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START; - // - FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; - FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD; - else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; - FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL; - else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; - FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START; - // - FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; - FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_ADD_S; - else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; - FSM_STATE_MULT_Q_N_ADD_S: fsm_next_state = FSM_STATE_MULT_Q_N_SUB_SN; - FSM_STATE_MULT_Q_N_SUB_SN: fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD; - FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL; - else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; - FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START; - // - FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE; - FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL; - else fsm_next_state = FSM_STATE_SAVE_WRITE; - FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP; - // - FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; - - endcase - // - end - - -endmodule - -//====================================================================== -// End of file 
-//====================================================================== diff --git a/src/rtl/modexpa7_wrapper.v b/src/rtl/modexpa7_wrapper.v index 3b749be..090ea8d 100644 --- a/src/rtl/modexpa7_wrapper.v +++ b/src/rtl/modexpa7_wrapper.v @@ -35,7 +35,6 @@ module modexpa7_wrapper # parameter OPERAND_ADDR_WIDTH = 5, parameter SYSTOLIC_ARRAY_POWER = 2 ) - ( input clk, input rst_n, @@ -62,7 +61,7 @@ module modexpa7_wrapper # /* * Output Mux */ - wire [31: 0] read_data_regs; + reg [31: 0] read_data_regs; wire [31: 0] read_data_core; @@ -75,27 +74,31 @@ module modexpa7_wrapper # localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_CONTROL = 'h08; // {next, init} localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_STATUS = 'h09; // {valid, ready} -// localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODE // NOT USED ANYMORE + localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODE = 'h10; // {crt, dummy} localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODULUS_BITS = 'h11; // number of bits in modulus localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_EXPONENT_BITS = 'h12; // number of bits in exponent localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_BUFFER_BITS = 'h13; // largest supported number of bits - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_ARRAY_BITS = 'h15; // number of bits in systolic array + localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_ARRAY_BITS = 'h14; // number of bits in systolic array localparam CONTROL_INIT_BIT = 0; localparam CONTROL_NEXT_BIT = 1; localparam STATUS_READY_BIT = 0; - localparam STATUS_VALID_BIT = 1; + localparam STATUS_VALID_BIT = 1; + + localparam MODE_DUMMY_BIT = 0; + localparam MODE_CRT_BIT = 1; localparam CORE_NAME0 = 32'h6D6F6465; // "mode" localparam CORE_NAME1 = 32'h78706137; // "xpa7" - localparam CORE_VERSION = 32'h302E3230; // "0.10" + localparam CORE_VERSION = 32'h302E3230; // "0.20" /* * Registers */ - reg [ 1:0] reg_control; + reg [ 1:0] reg_control; + reg [ 1:1] reg_mode; reg [OPERAND_ADDR_WIDTH+5:0] reg_modulus_bits; reg [OPERAND_ADDR_WIDTH+5:0] reg_exponent_bits; @@ -142,34 +145,53 @@ module 
modexpa7_wrapper # .bus_data_wr (write_data), .bus_data_rd (read_data_core) ); - - - /* - * Read Latch - */ - - reg [31: 0] read_data_regs; /* * Write Checker */ - - // largest supported operand width - localparam [OPERAND_ADDR_WIDTH+5:0] BUFFER_BITS = {1'b1, {OPERAND_ADDR_WIDTH+4{1'b0}}}; + + // largest supported operand width + localparam [OPERAND_ADDR_WIDTH+5:0] EXPONENT_MIN_BITS = {{OPERAND_ADDR_WIDTH+4{1'b0}}, 2'b10}; + localparam [OPERAND_ADDR_WIDTH+5:0] EXPONENT_MAX_BITS = {1'b1, {OPERAND_ADDR_WIDTH+5{1'b0}}}; + + localparam [OPERAND_ADDR_WIDTH+5:0] MODULUS_MIN_BITS = {{OPERAND_ADDR_WIDTH-1{1'b0}}, 7'b1000000}; + localparam [OPERAND_ADDR_WIDTH+5:0] MODULUS_MAX_BITS = {1'b1, {OPERAND_ADDR_WIDTH+5{1'b0}}}; - // check_modulus_bits + // + // Limits on modulus_bits: + // + // Must be 64 .. BUFFER_BITS in steps of 32 + // function [OPERAND_ADDR_WIDTH+5:0] check_modulus_bits; input [OPERAND_ADDR_WIDTH+5:0] num_bits; begin - // - //t = num_bits[] - //if (num_bits > MAX_BITS) write_check_bits = MAX_BITS; - //else write_check_bits = num_bits; - // + + // store input value + check_modulus_bits = num_bits; + + // must be multiple of 32 + check_modulus_bits[4:0] = {5{1'b0}}; + if (check_modulus_bits < num_bits) + check_modulus_bits = check_modulus_bits + 6'd32; + + // too large? + if (check_modulus_bits > MODULUS_MAX_BITS) + check_modulus_bits = MODULUS_MAX_BITS; + + // too small? + if (check_modulus_bits < MODULUS_MIN_BITS) + check_modulus_bits = MODULUS_MIN_BITS; + end endfunction + // + // Limits on exponent_bits: + // + // Must be 2 .. BUFFER_BITS; + // + // function [OPERAND_ADDR_WIDTH+5:0] check_exponent_bits; input [OPERAND_ADDR_WIDTH+5:0] num_bits; begin @@ -178,12 +200,12 @@ module modexpa7_wrapper # check_exponent_bits = num_bits; // too large? - if (num_bits > BUFFER_BITS) - check_exponent_bits = BUFFER_BITS; + if (check_exponent_bits > EXPONENT_MAX_BITS) + check_exponent_bits = EXPONENT_MAX_BITS; // too small? 
- if (num_bits == {OPERAND_ADDR_WIDTH+5{1'b0}}) - num_bits = {{OPERAND_ADDR_WIDTH+4{1'b0}}, 1'b1}; + if (check_exponent_bits < EXPONENT_MIN_BITS) + check_exponent_bits = EXPONENT_MIN_BITS; // end @@ -194,9 +216,24 @@ module modexpa7_wrapper # * Internal Quantities Generator */ - function [OPERAND_ADDR_WIDTH-1:0] modulus_num_words_core; - input [OPERAND_ADDR_WIDTH+5:0] num_bits; + + function [OPERAND_ADDR_WIDTH-1:0] get_modulus_num_words_core; + input [OPERAND_ADDR_WIDTH+5:0] num_bits; + reg [OPERAND_ADDR_WIDTH+5:0] num_words_checked; begin + + // check number of bits + num_words_checked = check_modulus_bits(num_bits); + + // convert bits to 32-bit words + num_words_checked = {{5{1'b0}}, num_words_checked[OPERAND_ADDR_WIDTH+5:5]}; + + // reduce by 1 + num_words_checked = num_words_checked - 1'b1; + + // return + get_modulus_num_words_core = num_words_checked[OPERAND_ADDR_WIDTH-1:0]; + end endfunction @@ -205,14 +242,19 @@ module modexpa7_wrapper # reg [OPERAND_ADDR_WIDTH+5:0] num_bits_checked; begin - // check number of bits (not too large, not too small) + // check number of bits num_bits_checked = check_exponent_bits(num_bits); - // de + // reduce by 1 + num_bits_checked = num_bits_checked - 1'b1; + + // return + get_exponent_num_bits_core = num_bits_checked[OPERAND_ADDR_WIDTH+4:0]; + end endfunction - + /* * Write Interface (External Registers) */ @@ -229,7 +271,8 @@ module modexpa7_wrapper # // case (address_lsb) // - ADDR_CONTROL: reg_control <= write_data[ 1: 0]; + ADDR_CONTROL: reg_control <= write_data[ 1: 0]; + ADDR_MODE: reg_mode <= write_data[MODE_CRT_BIT]; ADDR_MODULUS_BITS: reg_modulus_bits <= check_modulus_bits(write_data[OPERAND_ADDR_WIDTH+5:0]); ADDR_EXPONENT_BITS: reg_exponent_bits <= check_exponent_bits(write_data[OPERAND_ADDR_WIDTH+5:0]); // @@ -265,17 +308,20 @@ module modexpa7_wrapper # // case (address_lsb) // - ADDR_NAME0: tmp_read_data <= CORE_NAME0; - ADDR_NAME1: tmp_read_data <= CORE_NAME1; - ADDR_VERSION: tmp_read_data <= CORE_VERSION; + ADDR_NAME0: 
read_data_regs <= CORE_NAME0; + ADDR_NAME1: read_data_regs <= CORE_NAME1; + ADDR_VERSION: read_data_regs <= CORE_VERSION; - ADDR_CONTROL: tmp_read_data <= {{30{1'b0}}, reg_control}; - ADDR_STATUS: tmp_read_data <= {{30{1'b0}}, reg_status}; + ADDR_CONTROL: read_data_regs <= {{30{1'b0}}, reg_control}; + ADDR_MODE: read_data_regs <= {{30{1'b0}}, reg_mode, 1'b0}; + ADDR_STATUS: read_data_regs <= {{30{1'b0}}, reg_status}; - ADDR_MODULUS_BITS: tmp_read_data <= {{19{1'b0}}, reg_modulus_bits}; - ADDR_EXPONENT_BITS: tmp_read_data <= {{19{1'b0}}, reg_exponent_bits}; + ADDR_MODULUS_BITS: read_data_regs <= {{19{1'b0}}, reg_modulus_bits}; + ADDR_EXPONENT_BITS: read_data_regs <= {{19{1'b0}}, reg_exponent_bits}; + ADDR_BUFFER_BITS: read_data_regs <= {{26-OPERAND_ADDR_WIDTH {1'b0}}, 1'b1, { OPERAND_ADDR_WIDTH+5{1'b0}}}; + ADDR_ARRAY_BITS: read_data_regs <= {{26-SYSTOLIC_ARRAY_POWER{1'b0}}, 1'b1, {SYSTOLIC_ARRAY_POWER+5{1'b0}}}; // - default: tmp_read_data <= {32{1'b0}}; + default: read_data_regs <= {32{1'b0}}; // endcase @@ -294,7 +340,7 @@ module modexpa7_wrapper # always @(*) // - case (address_msb_last) + case (address_msb_dly) ADDR_MSB_REGS: read_data_mux = read_data_regs; ADDR_MSB_CORE: read_data_mux = read_data_core; endcase diff --git a/src/tb/tb_exponentiator.v b/src/tb/tb_exponentiator.v index c9a9f7e..16be0a5 100644 --- a/src/tb/tb_exponentiator.v +++ b/src/tb/tb_exponentiator.v @@ -160,7 +160,7 @@ module tb_exponentiator; modexpa7_exponentiator # ( .OPERAND_ADDR_WIDTH (4), // 32 * (2**4) = 512-bit operands - .SYSTOLIC_ARRAY_POWER (2) // 2 ** 2 = 4-tap systolic array + .SYSTOLIC_ARRAY_POWER (3) // 2 ** 3 = 8-tap systolic array ) uut ( @@ -207,7 +207,7 @@ module tb_exponentiator; rst_n = 1'b1; #100; - //test_exponent_384(M_384, D_384, FACTOR_384, N_384, N_COEFF_384, S_384); + test_exponent_384(M_384, D_384, FACTOR_384, N_384, N_COEFF_384, S_384); test_exponent_512(M_512, D_512, FACTOR_512, N_512, N_COEFF_512, S_512); end diff --git a/src/tb/tb_systolic_multiplier.v 
b/src/tb/tb_systolic_multiplier.v index e9d532e..96e76d5 100644 --- a/src/tb/tb_systolic_multiplier.v +++ b/src/tb/tb_systolic_multiplier.v @@ -57,7 +57,7 @@ module tb_systolic_multiplier; // // Model Settings // - localparam NUM_ROUNDS = 43; + localparam NUM_ROUNDS = 1000; // @@ -193,7 +193,7 @@ module tb_systolic_multiplier; #100; test_systolic_multiplier_384(M_384, N_384, N_COEFF_384, FACTOR_384, COEFF_384); - //test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512); + test_systolic_multiplier_512(M_512, N_512, N_COEFF_512, FACTOR_512, COEFF_512); end diff --git a/src/tb/tb_wrapper.v b/src/tb/tb_wrapper.v index bd8dbf1..fae0934 100644 --- a/src/tb/tb_wrapper.v +++ b/src/tb/tb_wrapper.v @@ -2,43 +2,108 @@ module tb_wrapper; - // Inputs + /* + * Settings + */ + localparam USE_OPERAND_ADDR_WIDTH = 7; + localparam USE_SYSTOLIC_ARRAY_POWER = 1; + + /* + * Clock (100 MHz) + */ reg clk; + initial clk = 1'b0; + always #5 clk = ~clk; + + /* + * Reset + */ reg rst_n; - reg cs; - reg we; - reg [7:0] address; - reg [31:0] write_data; - - // Outputs - wire [31:0] read_data; + + /* + * Access Bus + */ + reg bus_cs; + reg bus_we; + reg [USE_OPERAND_ADDR_WIDTH+2:0] bus_addr; + reg [ 32-1:0] bus_wr_data; + wire [ 32-1:0] bus_rd_data; - // Instantiate the Unit Under Test (UUT) - modexpa7_wrapper uut ( - .clk(clk), - .rst_n(rst_n), - .cs(cs), - .we(we), - .address(address), - .write_data(write_data), - .read_data(read_data) + modexpa7_wrapper # + ( + .OPERAND_ADDR_WIDTH (USE_OPERAND_ADDR_WIDTH), + .SYSTOLIC_ARRAY_POWER (USE_SYSTOLIC_ARRAY_POWER) + ) + uut + ( + .clk (clk), + + .rst_n (rst_n), + + .cs (bus_cs), + .we (bus_we), + .address (bus_addr), + .write_data (bus_wr_data), + .read_data (bus_rd_data) ); + reg [31: 0] tmp; initial begin - // Initialize Inputs - clk = 0; + // rst_n = 0; - cs = 0; - we = 0; - address = 0; - write_data = 0; - - // Wait 100 ns for global reset to finish - #100; - - // Add stimulus here - + // + bus_cs = 0; + bus_we = 0; + 
bus_addr = 'bX; + bus_wr_data = 'bX; + // + #200; + // + rst_n = 1; + // + read_reg('h00, tmp); // NAME0 + read_reg('h01, tmp); // NAME1 + read_reg('h02, tmp); // VERSION + // + read_reg('h13, tmp); // BUFFER_BITS + read_reg('h14, tmp); // ARRAY_BITS + // + write_reg('h12, 32'd384); // EXPONENT_BITS + read_reg ('h12, tmp); + // + write_reg('h11, 32'd384); // MODULUS_BITS + read_reg ('h11, tmp); + // + // end + + task read_reg; + input [USE_OPERAND_ADDR_WIDTH+1:0] addr; + output [ 32-1:0] data; + begin + bus_cs = 1; + bus_addr = {1'b0, addr}; + #10; + bus_cs = 0; + bus_addr = 'bX; + data = bus_rd_data; + end + endtask + + task write_reg; + input [USE_OPERAND_ADDR_WIDTH+1:0] addr; + input [ 32-1:0] data; + begin + bus_cs = 1; + bus_we = 1; + bus_addr = {1'b0, addr}; + bus_wr_data = data; + #10; + bus_cs = 0; + bus_we = 0; + bus_addr = 'bX; + end + endtask endmodule -- cgit v1.2.3