From 9f77c4f559daf20e8b495e26003178c57da93fe2 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Thu, 27 Jul 2017 10:43:49 +0300 Subject: Work in progress. --- src/rtl/modexpa7_systolic_multiplier.v | 8 +- src/rtl/modexpa7_systolic_multiplier_array.v | 335 +++---- src/rtl/modexpa7_systolic_multiplier_fix.v | 1202 ++++++++++++++++++++++++++ 3 files changed, 1391 insertions(+), 154 deletions(-) create mode 100644 src/rtl/modexpa7_systolic_multiplier_fix.v diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v index 9d96f98..32ed543 100644 --- a/src/rtl/modexpa7_systolic_multiplier.v +++ b/src/rtl/modexpa7_systolic_multiplier.v @@ -292,7 +292,8 @@ module modexpa7_systolic_multiplier # bram_p ( .clk(clk), .a_addr(p_addr_ext_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(), - .b_addr(p_addr_ext_rd), .b_out(p_data_out)); + .b_addr(p_addr_ext_rd), .b_out(p_data_out) + ); /* @@ -397,13 +398,14 @@ module modexpa7_systolic_multiplier # .loader_addr_rd (loader_addr_rd), - .pe_a_wide (), + .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_bram_out}}), .pe_b_wide (pe_b_wide), + .a_bram_addr (a_bram_addr), + .p_bram_addr (p_addr_ext_wr), .p_bram_in (p_data_in), .p_bram_wr (p_wren), - .n_num_words (n_num_words_latch), .p_num_words (p_num_words_latch) diff --git a/src/rtl/modexpa7_systolic_multiplier_array.v b/src/rtl/modexpa7_systolic_multiplier_array.v index 029d9d6..22d5aaf 100644 --- a/src/rtl/modexpa7_systolic_multiplier_array.v +++ b/src/rtl/modexpa7_systolic_multiplier_array.v @@ -42,23 +42,25 @@ module modexpa7_systolic_multiplier_array # parameter SYSTOLIC_ARRAY_POWER = 2 ) ( - input clk, - input rst_n, + input clk, + input rst_n, - input ena, - output rdy, + input ena, + output rdy, output [OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER - 1 : 0] loader_addr_rd, - input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide, - input [32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide, + input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_a_wide, + input [ 32 * (2 ** SYSTOLIC_ARRAY_POWER) - 1 : 0] pe_b_wide, + + output [ OPERAND_ADDR_WIDTH - 1 : 0] a_bram_addr, - output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr, - output [ 32 - 1 : 0] p_bram_in, - output p_bram_wr, + output [ OPERAND_ADDR_WIDTH : 0] p_bram_addr, + output [ 32 - 1 : 0] p_bram_in, + output p_bram_wr, - input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words, - input [ OPERAND_ADDR_WIDTH : 0] p_num_words + input [ OPERAND_ADDR_WIDTH - 1 : 0] n_num_words, + input [ OPERAND_ADDR_WIDTH : 0] p_num_words ); @@ -75,7 +77,7 @@ module modexpa7_systolic_multiplier_array # localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; localparam [ 7: 0] FSM_STATE_MULT_START = 8'h11; - localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12; + localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h12; localparam [ 7: 0] FSM_STATE_MULT_RELOAD = 8'h13; localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h14; @@ -137,6 +139,107 @@ module modexpa7_systolic_multiplier_array # end + /* + * Systolic Cycle Counters + */ + + // handy values + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; + + // counters + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; + + // handy increment values + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; + + // handy stop flags + wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; + wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_MULT_START, + FSM_STATE_MULT_RELOAD: + // + syst_cnt_load <= syst_cnt_zero; + + FSM_STATE_MULT_CRUNCH: + // + syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; + + endcase + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_CRUNCH) begin + // + if (shreg_done_latency) + syst_cnt_unload <= syst_cnt_zero; + else if (shreg_now_unloading) + syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; + // + end + + + /* + * Timing Shift Registers + */ + + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; + reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; + + wire shreg_done_load = shreg_load[syst_cnt_last]; + wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; + wire shreg_done_unload = shreg_unload[syst_cnt_last]; + + reg shreg_now_loading; + reg shreg_now_latency; + reg shreg_now_unloading; + + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_MULT_START, + FSM_STATE_MULT_RELOAD: begin + // + shreg_now_loading <= 1'b1; + shreg_now_latency <= 1'b1; + shreg_now_unloading <= 1'b0; + // + shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; + shreg_latency <= {{SYSTOLIC_PE_LATENCY {1'b0}}, 1'b1}; + shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; + // + end + // + FSM_STATE_MULT_CRUNCH: begin + // + shreg_load <= {shreg_load [SYSTOLIC_NUM_CYCLES-2:0], 1'b0}; + shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0}; + shreg_unload <= {shreg_unload [SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; + // + if (shreg_done_load) shreg_now_loading <= 1'b0; + if (shreg_done_latency) shreg_now_latency <= 1'b0; + if (shreg_done_latency) shreg_now_unloading <= 1'b1; + else if (shreg_done_unload) shreg_now_unloading <= 1'b0; + + end + // + default: begin + shreg_now_loading <= 1'b0; + shreg_now_latency <= 1'b0; + shreg_now_unloading <= 1'b0; + end + // + endcase + + /* * Systolic Array of Processing Elements */ @@ -215,195 +318,125 @@ module modexpa7_systolic_multiplier_array # assign pe_c_in[i] = fifo_c_dout[32 * (i + 1) - 1 -: 32]; assign pe_t[i] = fifo_t_dout[32 * (i + 1) - 1 -: 32]; // - //assign fifo_c_din[32 * (i + 1) - 1 -: 32] = pe_c_out_dly[i]; - // - //always @(posedge clk) pe_c_out_dly[i] <= pe_c_out[i]; - // end // endgenerate - + + /* + * FIFO Reset Logic + */ + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_START: fifo_c_rst <= 1'b1; + FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_c_rst <= 1'b0; + endcase + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_START: fifo_t_rst <= 1'b1; + FSM_STATE_MULT_CRUNCH: if (shreg_done_load) fifo_t_rst <= 1'b0; + endcase + + /* * Block Memory Interface */ // the very first address - wire [OPERAND_ADDR_WIDTH:0] bram_addr_zero = {OPERAND_ADDR_WIDTH+1{1'b0}}; + wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_zero = {OPERAND_ADDR_WIDTH {1'b0}}; + wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_zero = {OPERAND_ADDR_WIDTH+1{1'b0}}; // the very last address - wire [OPERAND_ADDR_WIDTH:0] bram_addr_last = p_num_words_latch; + wire [OPERAND_ADDR_WIDTH - 1 : 0] bram_addr_last = n_num_words_latch; + wire [OPERAND_ADDR_WIDTH : 0] bram_addr_ext_last = p_num_words_latch; // registers - reg [OPERAND_ADDR_WIDTH:0] p_addr; - reg [ 31:0] p_data_in; - reg p_wren; + reg [OPERAND_ADDR_WIDTH - 1 : 0] a_addr; + reg [OPERAND_ADDR_WIDTH : 0] p_addr; + reg [ 32 - 1 : 0] p_data_in; + reg p_wren; // handy values - wire [OPERAND_ADDR_WIDTH:0] p_addr_next = p_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH - 1 : 0] a_addr_next = a_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH : 0] p_addr_next = p_addr + 1'b1; // handy flags - wire p_addr_done = (p_addr == bram_addr_last) ? 1'b1 : 1'b0; - + wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire p_addr_done = (p_addr == bram_addr_ext_last) ? 1'b1 : 1'b0; // map top-level ports to internal registers + assign a_bram_addr = a_addr; assign p_bram_addr = p_addr; assign p_bram_in = p_data_in; assign p_bram_wr = p_wren; - - /* - * Systolic Cycle Counters - */ - - // handy values - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; - - // counters - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; - - // handy increment values - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; - - // handy stop flags - wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; - wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; - - always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_MULT_START, - FSM_STATE_MULT_RELOAD: - // - syst_cnt_load <= syst_cnt_zero; - - FSM_STATE_MULT_CRUNCH, - // - syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; - - endcase - - always @(posedge clk) - // - if (fsm_state == FSM_STATE_MULT_CRUNCH) begin - // - if (shreg_done_latency) - syst_cnt_unload <= syst_cnt_zero; - else if (shreg_now_unloading) - syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; - // - end - - - - /* - * Shift Registers - */ - reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; - reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; - reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; - - wire shreg_done_load = shreg_load[syst_cnt_last]; - wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; - wire shreg_done_unload = shreg_unload[syst_cnt_last]; - - reg shreg_now_loading; - reg shreg_now_latency; - reg shreg_now_unloading; - + integer j; always @(posedge clk) // - case (fsm_state) + if (fsm_state == FSM_STATE_MULT_CRUNCH) // - //FSM_STATE_IDLE: begin - //shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; - //shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b0}; - //shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; - //end - // - FSM_STATE_MULT_START, - FSM_STATE_MULT_RELOAD: begin + for (j=0; j {1'b0, a_addr}) ? 32'd0 : pe_a_wide[32 * (j + 1) - 1 -: 32]; + pe_b[j] <= pe_b_wide[32 * (j + 1) - 1 -: 32]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + end +// /* +// * +// */ +// always @(posedge clk) +// // +// case (fsm_next_state) +// FSM_STATE_MULT_RELOAD: p_wren <= 1'b1; +// default: p_wren <= 1'b0; +// endcase +// /* - * - */ - always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_MULT_RELOAD: p_wren <= 1'b1; - default: p_wren <= 1'b0; - endcase - - /* - * + * Block Memory Address Control */ - always @(posedge clk) + always @(posedge clk) begin // case (fsm_state) FSM_STATE_MULT_START: p_addr <= bram_addr_zero; FSM_STATE_MULT_RELOAD: p_addr <= p_addr_next; + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_START: a_addr <= bram_addr_zero; + FSM_STATE_MULT_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr; endcase + // + end /* - * Loader Control + * Loader Address Control */ reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr; assign loader_addr_rd = loader_addr; - integer j; always @(posedge clk) // case (fsm_next_state) - - FSM_STATE_MULT_START, + // + FSM_STATE_MULT_START, FSM_STATE_MULT_RELOAD: - // - for (j=0; j {1'b0, bram_addr_last}) + n_addr <= n_addr_next; + + endcase + // + case (fsm_state) + FSM_STATE_SAVE_START: r_addr <= bram_addr_zero; + FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next; + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero; + FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr; + endcase + // + end + + + // + // Internal Memories + // + + /* memory inputs */ + reg [31: 0] ab_data_in; + reg [31: 0] q_data_in; + reg [31: 0] qn_data_in; + wire [31: 0] s_data_in; + wire [31: 0] sn_data_in; + reg [31: 0] r_data_in; + + /* memory outputs */ + wire [31: 0] ab_data_out; + wire [31: 0] q_data_out; + wire [31: 0] qn_data_out; + wire [31: 0] s_data_out; + wire [31: 0] sn_data_out; + + /* write enables */ + reg ab_wren; + reg q_wren; + reg qn_wren; + reg s_wren; + reg sn_wren; + reg r_wren; + + /* map */ + assign r_bram_in = r_data_in; + assign r_bram_wr = r_wren; + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out)); + + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out)); + + + // + // Wide Operand Loader + // + integer j; + + /* shift logic */ + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_B_SHIFT: begin + + /* update the rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}}; + + /* shift the loader buffer to the left */ + for (j=1; j {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin + s_addr <= s_addr_next; + sn_addr <= sn_addr_next; + end + + if (qn_addr_ext == bram_addr_ext_last) begin + s_addr <= bram_addr_zero; + sn_addr <= bram_addr_zero; + end + + end + + FSM_STATE_MULT_Q_N_FINAL, + FSM_STATE_SAVE_START, + FSM_STATE_SAVE_WRITE: begin + s_addr <= !s_addr_done ? s_addr_next : s_addr; + sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr; + end + + endcase + + // + case (fsm_next_state) + FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next; + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero; + FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr; + endcase + + // + end + + always @(posedge clk) begin + // + if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin + ab_wren <= shreg_done_latency_dly; + ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + ab_wren <= 1'b0; + ab_data_in <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin + q_wren <= shreg_done_latency_dly; + q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + q_wren <= 1'b0; + q_data_in <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin + qn_wren <= shreg_done_latency_dly; + qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + qn_wren <= 1'b0; + qn_data_in <= 32'hXXXXXXXX; + end + // + case (fsm_state) + FSM_STATE_SAVE_START: r_wren <= 1'b1; + FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done; + default: r_wren <= 1'b0; + endcase + // + end + + + always @(posedge clk) + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: + // + syst_cnt_load <= syst_cnt_zero; + + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: + // + syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load; + + endcase + + + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: begin + + if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; + else if (shreg_now_unloading) + syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; + + end + endcase + + + // + // T and C_IN can be moved to a separate code block + // + always @(posedge clk) begin + // + if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) + // + for (j=0; j {1'b0, a_addr}) ? 32'd0 : a_bram_out; + pe_b[j] <= loader_dout[j]; + //pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; + //pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + //pe_t[j] <= 32'hXXXXXXXX; + //pe_c_in[j] <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) + // + for (j=0; j {1'b0, q_addr}) ? 32'd0 : q_data_out; + pe_b[j] <= loader_dout[j]; + //pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[j][syst_cnt_load_dly]; + //pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[j][syst_cnt_load_dly]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + //pe_t[j] <= 32'hXXXXXXXX; + //pe_c_in[j] <= 32'hXXXXXXXX; + end + // + + // + end + + + // + // Adder + // + /* + * This adder is used to calculate S = AB + QN. + * + */ + reg add1_ce; // clock enable + reg [31: 0] add1_s; // sum output + wire add1_c_in; // carry input + wire [31: 0] add1_a; // A-input + reg [31: 0] add1_b; // B-input + reg add1_c_in_mask; // flag to not carry anything into the very first word + reg add1_c_out; // carry output + + /* add masking into carry feedback chain */ + assign add1_c_in = add1_c_out & ~add1_c_in_mask; + + /* mask carry for the very first word of N */ + //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0; + + always @(posedge clk) + // + if (add1_ce) + // + {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in}; + + assign add1_a = qn_data_in; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX; + else + add1_b <= 32'hXXXXXXXX; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0; + else + add1_c_in_mask <= 1'b0; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + add1_ce <= shreg_done_latency_dly; + else + add1_ce <= 1'b0; + + + assign s_data_in = add1_s; + assign sn_data_in = sub1_d; + + always @(posedge clk) begin + // + s_wren <= add1_ce; + sn_wren <= sub1_ce; + end + + + + // + // Subtractor + // + /* + * This subtractor is used to calculate SN = S - N. + * + */ + reg sub1_ce; // clock enable + reg [31: 0] sub1_d; // difference output + wire sub1_b_in; // borrow input + wire [31: 0] sub1_a; // A-input + reg [31: 0] sub1_b; // B-input + reg sub1_b_in_mask; // flag to not borrow anything from the very first word + reg sub1_b_out; // borrow output + + /* add masking into borrow feedback chain */ + assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask; + + always @(posedge clk) + // + if (sub1_ce) + // + {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in}; + + assign sub1_a = add1_s; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX; + else + sub1_b <= 32'hXXXXXXXX; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0; + else + sub1_b_in_mask <= 1'b0; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr}); + else + sub1_ce <= 1'b0; + + + assign s_data_in = add1_s; + + always @(posedge clk) + // + s_wren <= add1_ce; + + + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_MULT_Q_N_FINAL) + flag_select_s <= sub1_b_out & ~add1_c_out; + + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_SAVE_START, + FSM_STATE_SAVE_WRITE: + r_data_in <= flag_select_s ? s_data_out : sn_data_out; + endcase + + + + // + // FSM Process + // + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; + else fsm_state <= fsm_next_state; + + + // + // FSM Transition Logic + // + always @* begin + // + fsm_next_state = FSM_STATE_STOP; + // + case (fsm_state) + + FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START; + else fsm_next_state = FSM_STATE_IDLE; + // + FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT; + FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE; + else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; + FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL; + else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; + FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START; + // + FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; + FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE; + else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; + FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL; + else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; + FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START; + // + FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT; + FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE; + else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; + FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL; + else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; + FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START; + // + FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; + FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD; + else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; + FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL; + else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; + FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START; + // + FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; + FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD; + else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL; + else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; + FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START; + // + FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; + FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD; + else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; + FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL; + else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; + FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START; + // + FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE; + FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL; + else fsm_next_state = FSM_STATE_SAVE_WRITE; + FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP; + // + FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; + + endcase + // + end + + +endmodule + +//====================================================================== +// End of file +//====================================================================== -- cgit v1.2.3