diff options
author | Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> | 2019-10-01 15:05:11 +0300 |
---|---|---|
committer | Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> | 2019-10-01 15:05:11 +0300 |
commit | 9e9689d7b00ecdcc1c651f5e369e00a53d62df3c (patch) | |
tree | f7bdddda835e26aff3642b99e1ee8b2f1a64434d | |
parent | 29fb6afd018c601a2e0c7376656d5e37beb565d6 (diff) |
Further work on the Montgomery modular multiplier. Can now do the "triangular"
part of multiplication, i.e. compute the "magic" reduction coefficient
Q = LSB(AB) * N_COEFF.
-rw-r--r-- | bench/tb_square.v | 391 | ||||
-rw-r--r-- | rtl/dsp/dsp_array.v | 42 | ||||
-rw-r--r-- | rtl/modexpng_mmm_fsm.vh | 10 | ||||
-rw-r--r-- | rtl/modexpng_part_recombinator.v | 455 |
4 files changed, 691 insertions, 207 deletions
diff --git a/bench/tb_square.v b/bench/tb_square.v index 61e5d8a..23831db 100644 --- a/bench/tb_square.v +++ b/bench/tb_square.v @@ -39,6 +39,8 @@ module tb_square; reg [17:0] T1[0:31]; reg [17:0] T2[0:31]; reg [17:0] AB[0:63]; + reg [17:0] N_COEFF[0:32]; + reg [17:0] Q[0:32]; // @@ -81,6 +83,26 @@ module tb_square; AB[56] = 18'h0e1c0; AB[57] = 18'h00989; AB[58] = 18'h01201; AB[59] = 18'h0e194; AB[60] = 18'h07f93; AB[61] = 18'h0e739; AB[62] = 18'h07cf6; AB[63] = 18'h019df; // + N_COEFF[ 0] = 18'h05a97; N_COEFF[ 1] = 18'h0ac69; N_COEFF[ 2] = 18'h0d51e; N_COEFF[ 3] = 18'h07326; + N_COEFF[ 4] = 18'h01053; N_COEFF[ 5] = 18'h0f68a; N_COEFF[ 6] = 18'h09c70; N_COEFF[ 7] = 18'h064f7; + N_COEFF[ 8] = 18'h01041; N_COEFF[ 9] = 18'h0c2bf; N_COEFF[10] = 18'h0f01f; N_COEFF[11] = 18'h01842; + N_COEFF[12] = 18'h0e69a; N_COEFF[13] = 18'h037ea; N_COEFF[14] = 18'h0b4a0; N_COEFF[15] = 18'h0c1ab; + N_COEFF[16] = 18'h0bd5b; N_COEFF[17] = 18'h09e5e; N_COEFF[18] = 18'h039bd; N_COEFF[19] = 18'h06430; + N_COEFF[20] = 18'h0b460; N_COEFF[21] = 18'h08bd4; N_COEFF[22] = 18'h09fcd; N_COEFF[23] = 18'h05391; + N_COEFF[24] = 18'h0fa45; N_COEFF[25] = 18'h08892; N_COEFF[26] = 18'h0732c; N_COEFF[27] = 18'h0baf6; + N_COEFF[28] = 18'h067a9; N_COEFF[29] = 18'h0b184; N_COEFF[30] = 18'h02089; N_COEFF[31] = 18'h0297b; + N_COEFF[32] = 18'h01810; + // + Q[ 0] = 18'h0ac02; Q[ 1] = 18'h0a026; Q[ 2] = 18'h06825; Q[ 3] = 18'h08f06; + Q[ 4] = 18'h03783; Q[ 5] = 18'h04cb5; Q[ 6] = 18'h0e8ea; Q[ 7] = 18'h083d2; + Q[ 8] = 18'h0fec9; Q[ 9] = 18'h066d9; Q[10] = 18'h0edad; Q[11] = 18'h06c12; + Q[12] = 18'h0a5fb; Q[13] = 18'h07295; Q[14] = 18'h06a0c; Q[15] = 18'h081a5; + Q[16] = 18'h03493; Q[17] = 18'h0a393; Q[18] = 18'h03da6; Q[19] = 18'h0beb1; + Q[20] = 18'h0d138; Q[21] = 18'h02815; Q[22] = 18'h0f191; Q[23] = 18'h03617; + Q[24] = 18'h08d4f; Q[25] = 18'h0f641; Q[26] = 18'h00e82; Q[27] = 18'h01774; + Q[28] = 18'h0bf39; Q[29] = 18'h0929d; Q[30] = 18'h05273; Q[31] = 18'h0c30a; + Q[32] = 18'h0eef3; + // end @@ -102,9 
+124,10 @@ module tb_square; reg mac_fat_bram_xy_ena = 1'b0; reg mac_fat_bram_xy_reg_ena = 1'b0; reg [ 2:0] mac_fat_bram_xy_bank; - reg [ 7:0] mac_fat_bram_xy_addr[0:3]; - wire [17:0] mac_fat_bram_x_dout[0:3]; - wire [17:0] mac_fat_bram_y_dout[0:3]; + reg [ 2:0] mac_fat_bram_xy_bank_aux; + reg [ 7:0] mac_fat_bram_xy_addr[0:4]; + wire [17:0] mac_fat_bram_x_dout[0:4]; + wire [17:0] mac_fat_bram_y_dout[0:4]; reg tb_slim_bram_xy_ena = 1'b0; reg [ 1:0] tb_slim_bram_xy_bank; @@ -124,14 +147,14 @@ module tb_square; // mac_slim_bram_xy_addr_dly <= mac_slim_bram_xy_addr; - reg mac_slim_bram_xy_reg_ena_dly = 1'b0; + reg mac_slim_bram_xy_reg_ena_dly = 1'b0; always @(posedge clk) mac_slim_bram_xy_reg_ena_dly <= mac_slim_bram_xy_reg_ena; genvar z; - generate for (z=0; z<(NUM_MULTS/2); z=z+1) + generate for (z=0; z<((NUM_MULTS/2)+1); z=z+1) begin : gen_fat_bram // ip_bram_36k fat_bram_x @@ -145,7 +168,8 @@ module tb_square; .clkb (clk), .enb (mac_fat_bram_xy_ena), .regceb (mac_fat_bram_xy_reg_ena), - .addrb ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}), + .addrb ({(z < (NUM_MULTS/2) ? + mac_fat_bram_xy_bank : mac_fat_bram_xy_bank_aux), mac_fat_bram_xy_addr[z]}), .doutb (mac_fat_bram_x_dout[z]) ); // @@ -160,7 +184,8 @@ module tb_square; .clkb (clk), .enb (mac_fat_bram_xy_ena), .regceb (mac_fat_bram_xy_reg_ena), - .addrb ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}), + .addrb ({z < (NUM_MULTS/2) ? 
+ mac_fat_bram_xy_bank : mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_addr[z]}), .doutb (mac_fat_bram_y_dout[z]) ); // @@ -232,6 +257,23 @@ module tb_square; wait_clock_tick; end + for (i=0; i<32; i=i+1) begin + tb_slim_bram_xy_bank = BANK_SLIM_N_COEFF; + tb_slim_bram_xy_addr = i[7:0]; + tb_slim_bram_x_din = N_COEFF[i]; + tb_slim_bram_y_din = N_COEFF[i]; + + wait_clock_tick; + end + for (i=32; i<33; i=i+1) begin + tb_slim_bram_xy_bank = BANK_SLIM_N_COEFF_EXT; + tb_slim_bram_xy_addr = 0; + tb_slim_bram_x_din = N_COEFF[i]; + tb_slim_bram_y_din = N_COEFF[i]; + + wait_clock_tick; + end + tb_fat_bram_xy_ena = 1'b0; tb_slim_bram_xy_ena = 1'b0; @@ -256,6 +298,7 @@ module tb_square; wait_clock_tick; verify_ab; + verify_q; end @@ -270,11 +313,11 @@ module tb_square; reg dsp_x_ce_p; reg dsp_x_ce_mode; - reg [8 -1:0] dsp_x_mode_z = {8{1'b1}}; + reg [9 -1:0] dsp_x_mode_z = {9{1'b1}}; - wire [4*18-1:0] dsp_x_a; + wire [5*18-1:0] dsp_x_a; reg [1*17-1:0] dsp_x_b; - wire [8*47-1:0] dsp_x_p; + wire [9*47-1:0] dsp_x_p; reg dsp_y_ce_a; reg dsp_y_ce_b; @@ -283,13 +326,13 @@ module tb_square; reg dsp_y_ce_p; reg dsp_y_ce_mode; - reg [8 -1:0] dsp_y_mode_z = {8{1'b1}}; + reg [9 -1:0] dsp_y_mode_z = {9{1'b1}}; - wire [4*18-1:0] dsp_y_a; + wire [5*18-1:0] dsp_y_a; reg [1*17-1:0] dsp_y_b; - wire [8*47-1:0] dsp_y_p; + wire [9*47-1:0] dsp_y_p; - generate for (z=0; z<(NUM_MULTS/2); z=z+1) + generate for (z=0; z<((NUM_MULTS/2)+1); z=z+1) begin : gen_dsp_xy_a_split assign dsp_x_a[18*z+:18] = mac_fat_bram_x_dout[z]; assign dsp_y_a[18*z+:18] = mac_fat_bram_y_dout[z]; @@ -301,10 +344,10 @@ module tb_square; {dsp_y_ce_b_dly, dsp_x_ce_b_dly} <= {dsp_y_ce_b, dsp_x_ce_b}; - reg [8 -1:0] dsp_xy_mode_z_adv1 = {8{1'b1}}; - reg [8 -1:0] dsp_xy_mode_z_adv2 = {8{1'b1}}; - reg [8 -1:0] dsp_xy_mode_z_adv3 = {8{1'b1}}; - reg [8 -1:0] dsp_xy_mode_z_adv4 = {8{1'b1}}; + reg [9 -1:0] dsp_xy_mode_z_adv1 = {9{1'b1}}; + reg [9 -1:0] dsp_xy_mode_z_adv2 = {9{1'b1}}; + reg [9 -1:0] dsp_xy_mode_z_adv3 = {9{1'b1}}; + reg [9 
-1:0] dsp_xy_mode_z_adv4 = {9{1'b1}}; dsp_array dsp_x ( @@ -355,16 +398,47 @@ module tb_square; localparam [7:0] index_last = 8'd31; + localparam [7:0] index_last_minus1 = index_last - 1'b1; + + + // + // Column + // + reg [4:0] col_index; // current column index + reg [4:0] col_index_prev; // delayed column index value + reg [4:0] col_index_last; // index of the very last column + reg [4:0] col_index_next1; // precomputed next column index + //reg [4:0] col_index_next2; // precomputed next column index after next column index + reg col_is_last; // flag set during the very last column + + always @(posedge clk) + // + col_index_prev <= col_index; wire mult_square_addr_almost_done_comb; reg mult_square_addr_almost_done_flop; + + //wire mult_square_addr_surely_done_comb; + reg mult_square_addr_surely_done_flop; + + reg mult_triangle_addr_almost_done_comb; + reg mult_triangle_addr_almost_done_flop; + + //wire mult_triangle_addr_surely_done_comb; + reg mult_triangle_addr_surely_done_flop; + reg mult_triangle_addr_tardy_done_flop; - wire mult_square_addr_surely_done_comb; - reg mult_square_addr_surely_done_flop; + assign mult_square_addr_almost_done_comb = mac_slim_bram_xy_addr == index_last_minus1; + + always @* + // + //if (!col_is_last) + mult_triangle_addr_almost_done_comb = (mac_slim_bram_xy_addr[2:0] == index_last_minus1[2:0]) && (mac_slim_bram_xy_addr[7:3] == col_index); + //else + //mult_triangle_addr_almost_done_comb = (mac_slim_bram_xy_addr[2:0] == index_last[2:0]) && (mac_slim_bram_xy_addr[7:3] == col_index); + - assign mult_square_addr_almost_done_comb = mac_slim_bram_xy_addr == (index_last - 8'd1); - assign mult_square_addr_surely_done_comb = mac_slim_bram_xy_addr == index_last; always @(posedge clk) // @@ -372,60 +446,130 @@ module tb_square; FSM_STATE_MULT_SQUARE_COL_0_BUSY, FSM_STATE_MULT_SQUARE_COL_N_BUSY: - {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= - {mult_square_addr_surely_done_comb, mult_square_addr_almost_done_comb}; 
- + mult_square_addr_almost_done_flop <= mult_square_addr_almost_done_comb; + //{mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= + //{mult_square_addr_surely_done_comb, mult_square_addr_almost_done_comb}; default: - {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 2'b00; + mult_square_addr_almost_done_flop <= 1'b0; + //{mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 2'b00; endcase - - // - // Column - // - reg [4:0] col_index; - reg [4:0] col_index_prev; - reg [4:0] col_index_last; + always @(posedge clk) + // + mult_square_addr_surely_done_flop <= mult_square_addr_almost_done_flop; always @(posedge clk) // - col_index_prev <= col_index; + case (fsm_state) + + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: + mult_triangle_addr_almost_done_flop <= mult_triangle_addr_almost_done_comb; + //{mult_triangle_addr_surely_done_flop, mult_triangle_addr_almost_done_flop} <= + //{mult_triangle_addr_surely_done_comb, mult_triangle_addr_almost_done_comb}; + + default: + mult_triangle_addr_almost_done_flop <= 1'b0; + //{mult_triangle_addr_surely_done_flop, mult_triangle_addr_almost_done_flop} <= 2'b00; + + endcase + + always @(posedge clk) begin + // + mult_triangle_addr_surely_done_flop <= mult_triangle_addr_almost_done_flop; + mult_triangle_addr_tardy_done_flop <= mult_triangle_addr_surely_done_flop; + // + end + + // // FSM Transition Logic // wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square; + wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_triangle; - always @(posedge clk) // case (fsm_state_next) + // FSM_STATE_MULT_SQUARE_COL_0_INIT, FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_slim_bram_xy_addr <= 8'd0; FSM_STATE_MULT_SQUARE_COL_0_TRIG, FSM_STATE_MULT_SQUARE_COL_N_TRIG, FSM_STATE_MULT_SQUARE_COL_0_BUSY, FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_addr <= !mult_square_addr_almost_done_flop ? 
mac_slim_bram_xy_addr + 1'b1 : 8'd0; + // + FSM_STATE_MULT_TRIANGLE_COL_0_INIT, + FSM_STATE_MULT_TRIANGLE_COL_N_INIT: mac_slim_bram_xy_addr <= 8'd0; + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_slim_bram_xy_addr <= mult_triangle_addr_almost_done_flop || (col_is_last && mult_triangle_addr_surely_done_flop) ? + 8'd0 : mac_slim_bram_xy_addr + 1'b1; + // default: mac_slim_bram_xy_addr <= 8'dX; endcase + + wire [2:0] fat_bram_offset_rom[0:3]; + + generate for (z=1; z<NUM_MULTS; z=z+2) + begin : gen_fat_bram_offset + assign fat_bram_offset_rom[(z-1)/2] = z[2:0]; + end + endgenerate + integer j; - always @(posedge clk) + always @(posedge clk) begin // for (j=0; j<(NUM_MULTS/2); j=j+1) + // case (fsm_state_next) - FSM_STATE_MULT_SQUARE_COL_0_INIT: mac_fat_bram_xy_addr[j] <= 1 + 2 * j; - FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_fat_bram_xy_addr[j] <= 8 * (col_index + 1) + 1 + 2 * j; + // + // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions! 
+ // + FSM_STATE_MULT_SQUARE_COL_0_INIT: mac_fat_bram_xy_addr[j] <= {5'd0, fat_bram_offset_rom[j]}; + FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_fat_bram_xy_addr[j] <= {col_index_next1, fat_bram_offset_rom[j]}; FSM_STATE_MULT_SQUARE_COL_0_TRIG, FSM_STATE_MULT_SQUARE_COL_N_TRIG, FSM_STATE_MULT_SQUARE_COL_0_BUSY, FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last); + // + FSM_STATE_MULT_TRIANGLE_COL_0_INIT: mac_fat_bram_xy_addr[j] <= {5'd0, fat_bram_offset_rom[j]}; + FSM_STATE_MULT_TRIANGLE_COL_N_INIT: mac_fat_bram_xy_addr[j] <= {col_index_next1, fat_bram_offset_rom[j]}; + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last); + // default: mac_fat_bram_xy_addr[j] <= 8'dX; endcase - - + // + case (fsm_state_next) + // + // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions! 
+ // + FSM_STATE_MULT_SQUARE_COL_0_INIT: mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1}; + FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1}; + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_addr[4] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[4], index_last); + // + FSM_STATE_MULT_TRIANGLE_COL_0_INIT: mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1}; + FSM_STATE_MULT_TRIANGLE_COL_N_INIT: mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1}; + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_fat_bram_xy_addr[4] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[4], index_last); + // + default: mac_fat_bram_xy_addr[4] <= 8'dX; + endcase +// + end always @(posedge clk) // @@ -436,6 +580,13 @@ module tb_square; FSM_STATE_MULT_SQUARE_COL_N_TRIG, FSM_STATE_MULT_SQUARE_COL_0_BUSY, FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_bank <= BANK_SLIM_T1T2; + FSM_STATE_MULT_TRIANGLE_COL_0_INIT, + FSM_STATE_MULT_TRIANGLE_COL_N_INIT, + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_slim_bram_xy_bank <= col_is_last && (mult_triangle_addr_almost_done_flop || mult_triangle_addr_surely_done_flop) ? 
+ BANK_SLIM_N_COEFF_EXT : BANK_SLIM_N_COEFF; default: mac_slim_bram_xy_bank <= 2'bXX; endcase @@ -447,8 +598,14 @@ module tb_square; FSM_STATE_MULT_SQUARE_COL_0_TRIG, FSM_STATE_MULT_SQUARE_COL_N_TRIG, FSM_STATE_MULT_SQUARE_COL_0_BUSY, - FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_bank <= BANK_FAT_T1T2; - default: mac_fat_bram_xy_bank <= 3'bXXX; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{BANK_FAT_T1T2}}; + FSM_STATE_MULT_TRIANGLE_COL_0_INIT, + FSM_STATE_MULT_TRIANGLE_COL_N_INIT, + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {BANK_FAT_ABH, BANK_FAT_ABL}; + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{BANK_FAT_ABL}}; + default: {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{3'bXXX}}; endcase @@ -462,7 +619,13 @@ module tb_square; FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_slim_bram_xy_ena <= 1'b1; FSM_STATE_MULT_SQUARE_COL_0_BUSY, FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_ena <= ~mult_square_addr_almost_done_flop; - default: mac_slim_bram_xy_ena <= 1'b0; + FSM_STATE_MULT_TRIANGLE_COL_0_INIT, + FSM_STATE_MULT_TRIANGLE_COL_N_INIT, + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: mac_slim_bram_xy_ena <= 1'b1; + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_slim_bram_xy_ena <= !col_is_last ? 
~mult_triangle_addr_almost_done_flop : ~mult_triangle_addr_surely_done_flop; + default: mac_slim_bram_xy_ena <= 1'b0; endcase always @(posedge clk) @@ -473,7 +636,13 @@ module tb_square; FSM_STATE_MULT_SQUARE_COL_0_TRIG, FSM_STATE_MULT_SQUARE_COL_N_TRIG, FSM_STATE_MULT_SQUARE_COL_0_BUSY, - FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_ena <= 1'b1; + FSM_STATE_MULT_SQUARE_COL_N_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_0_INIT, + FSM_STATE_MULT_TRIANGLE_COL_N_INIT, + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: mac_fat_bram_xy_ena <= 1'b1; default: mac_fat_bram_xy_ena <= 1'b0; endcase @@ -486,7 +655,7 @@ module tb_square; // mac_fat_bram_xy_reg_ena <= mac_fat_bram_xy_ena; - + always @(posedge clk) // if (mac_slim_bram_xy_reg_ena_dly) @@ -499,7 +668,7 @@ module tb_square; input [7:0] mac_fat_bram_xy_addr_current; input [7:0] mac_fat_bram_xy_addr_last; begin - if (mac_fat_bram_xy_addr_current > 0) + if (mac_fat_bram_xy_addr_current > 8'd0) mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1; else mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last; @@ -541,26 +710,41 @@ module tb_square; // case (fsm_state_next) // - FSM_STATE_MULT_SQUARE_COL_0_INIT: begin - col_index <= 5'd0; - col_index_last <= index_last[7:3]; + FSM_STATE_MULT_SQUARE_COL_0_INIT, + FSM_STATE_MULT_TRIANGLE_COL_0_INIT: begin + col_index <= 5'd0; + col_index_last <= index_last[7:3]; + col_index_next1 <= 5'd1; + //col_index_next2 <= 5'd2; + col_is_last <= 1'b0; + end // - FSM_STATE_MULT_SQUARE_COL_N_INIT: - col_index <= col_index + 1'b1; + FSM_STATE_MULT_SQUARE_COL_N_INIT, + FSM_STATE_MULT_TRIANGLE_COL_N_INIT: begin + col_index <= col_index_next1; + col_is_last <= col_index_next1 == col_index_last; + col_index_next1 <= col_index_next1 == col_index_last ? 
5'd0 : col_index_next1 + 5'd1; + //col_index_next2 <= col_index_next2 + 1'b1; + end // endcase - assign fsm_state_after_mult_square = (col_index == col_index_last) ? FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT; + assign fsm_state_after_mult_square = col_is_last ? FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT; + assign fsm_state_after_mult_triangle = col_is_last ? FSM_STATE_MULT_TRIANGLE_HOLDOFF : FSM_STATE_MULT_TRIANGLE_COL_N_INIT; always @(posedge clk) // case (fsm_state_next) FSM_STATE_MULT_SQUARE_COL_0_TRIG, - FSM_STATE_MULT_SQUARE_COL_N_TRIG: dsp_xy_mode_z_adv4 <= {8{1'b0}}; + FSM_STATE_MULT_SQUARE_COL_N_TRIG: dsp_xy_mode_z_adv4 <= {9{1'b0}}; FSM_STATE_MULT_SQUARE_COL_0_BUSY, - FSM_STATE_MULT_SQUARE_COL_N_BUSY: dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly); - default: dsp_xy_mode_z_adv4 <= {8{1'b1}}; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly); + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: dsp_xy_mode_z_adv4 <= {9{1'b0}}; + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: dsp_xy_mode_z_adv4 <= {9{1'b1}}; + default: dsp_xy_mode_z_adv4 <= {9{1'b1}}; endcase always @(posedge clk) begin @@ -571,25 +755,45 @@ module tb_square; dsp_xy_mode_z_adv3 <= {dsp_xy_mode_z_adv4}; end - function [NUM_MULTS-1:0] calc_mac_mode_z_square; - input [ 4:0] col_index_value; - input [ 7:0] mac_slim_bram_xy_addr_value; + function [NUM_MULTS:0] calc_mac_mode_z_square; + input [ 4:0] col_index_value; + input [ 7:0] mac_slim_bram_xy_addr_value; begin if (mac_slim_bram_xy_addr_value[7:3] == col_index_value) case (mac_slim_bram_xy_addr_value[2:0]) - 3'b000: calc_mac_mode_z_square = 8'b11111110; - 3'b001: calc_mac_mode_z_square = 8'b11111101; - 3'b010: calc_mac_mode_z_square = 8'b11111011; - 3'b011: calc_mac_mode_z_square = 8'b11110111; - 3'b100: calc_mac_mode_z_square = 
8'b11101111; - 3'b101: calc_mac_mode_z_square = 8'b11011111; - 3'b110: calc_mac_mode_z_square = 8'b10111111; - 3'b111: calc_mac_mode_z_square = 8'b01111111; + 3'b000: calc_mac_mode_z_square = {1'b1, 8'b11111110}; + 3'b001: calc_mac_mode_z_square = {1'b1, 8'b11111101}; + 3'b010: calc_mac_mode_z_square = {1'b1, 8'b11111011}; + 3'b011: calc_mac_mode_z_square = {1'b1, 8'b11110111}; + 3'b100: calc_mac_mode_z_square = {1'b1, 8'b11101111}; + 3'b101: calc_mac_mode_z_square = {1'b1, 8'b11011111}; + 3'b110: calc_mac_mode_z_square = {1'b1, 8'b10111111}; + 3'b111: calc_mac_mode_z_square = {1'b1, 8'b01111111}; endcase else - calc_mac_mode_z_square = {NUM_MULTS{1'b1}}; + calc_mac_mode_z_square = {1'b1, {NUM_MULTS{1'b1}}}; end endfunction + /* + function [NUM_MULTS:0] calc_mac_mode_z_triangle; + input [ 4:0] col_index_value; + input [ 7:0] mac_slim_bram_xy_addr_value; + begin + if (mac_slim_bram_xy_addr_value[7:3] == col_index_value) + case (mac_slim_bram_xy_addr_value[2:0]) + 3'b000: calc_mac_mode_z_square = {1'b1, 8'b11111110}; + 3'b001: calc_mac_mode_z_square = {1'b1, 8'b11111101}; + 3'b010: calc_mac_mode_z_square = {1'b1, 8'b11111011}; + 3'b011: calc_mac_mode_z_square = {1'b1, 8'b11110111}; + 3'b100: calc_mac_mode_z_square = {1'b1, 8'b11101111}; + 3'b101: calc_mac_mode_z_square = {1'b1, 8'b11011111}; + 3'b110: calc_mac_mode_z_square = {1'b1, 8'b10111111}; + 3'b111: calc_mac_mode_z_square = {1'b1, 8'b01111111}; + endcase + else + calc_mac_mode_z_square = {1'b1, {NUM_MULTS{1'b1}}}; + end + endfunction*/ reg recomb_x_ena = 1'b0; reg recomb_y_ena = 1'b0; @@ -623,6 +827,7 @@ module tb_square; .col_index (col_index), .col_index_last (col_index_last), .slim_bram_xy_addr (mac_slim_bram_xy_addr), + .slim_bram_xy_bank (mac_slim_bram_xy_bank), .fat_bram_xy_bank (recomb_fat_bram_xy_bank), .fat_bram_xy_addr (recomb_fat_bram_xy_addr), .fat_bram_x_dout (recomb_fat_bram_x_dout), @@ -631,14 +836,17 @@ module tb_square; ); reg [17:0] AB_READ[0:63]; + reg [17:0] Q_READ[0:32]; always @(posedge 
clk) // if (recomb_fat_bram_xy_dout_valid) // case (recomb_fat_bram_xy_bank) - 3'd1: AB_READ[recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout; - 3'd2: AB_READ[32 + recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout; + 3'd1: AB_READ[ (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout; + 3'd2: AB_READ[32 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout; + 3'd3: Q_READ [ (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout; + 3'd4: Q_READ [32 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout; endcase @@ -663,11 +871,8 @@ module tb_square; mgr_fat_bram_x_din <= {18{1'bX}}; mgr_fat_bram_y_din <= {18{1'bX}}; end - - - - - + + task verify_ab; reg verify_ab_ok; begin @@ -687,6 +892,28 @@ module tb_square; endtask + task verify_q; + reg verify_q_ok; + begin + verify_q_ok = 1; + for (i=0; i<33; i=i+1) + if (Q_READ[i] === Q[i]) + $display("Q / Q_READ [%02d] = 0x%05x / 0x%05x", i, Q[i], Q_READ[i]); + else begin + $display("Q / Q_READ [%02d] = 0x%05x / 0x%05x <???>", i, Q[i], Q_READ[i]); + verify_q_ok = 0; + end + if (verify_q_ok) + $display("Q is OK."); + else + $display("Q is WRONG!"); + end + endtask + + + wire mult_square_addr_done = mult_square_addr_surely_done_flop; + + wire mult_triangle_addr_done = !col_is_last ? mult_triangle_addr_surely_done_flop : mult_triangle_addr_tardy_done_flop; always @* begin // @@ -697,13 +924,23 @@ module tb_square; FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_TRIG ; FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ; - FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY; + FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done ? 
FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY; FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_TRIG ; FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ; - FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY; + + FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = recomb_rdy ? FSM_STATE_MULT_TRIANGLE_COL_0_INIT : FSM_STATE_MULT_SQUARE_HOLDOFF; + + FSM_STATE_MULT_TRIANGLE_COL_0_INIT: fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_0_TRIG ; + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_0_BUSY ; + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY: fsm_state_next = mult_triangle_addr_done ? FSM_STATE_MULT_TRIANGLE_COL_N_INIT : FSM_STATE_MULT_TRIANGLE_COL_0_BUSY; + + FSM_STATE_MULT_TRIANGLE_COL_N_INIT: fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_N_TRIG ; + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_N_BUSY ; + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: fsm_state_next = mult_triangle_addr_done ? fsm_state_after_mult_triangle : FSM_STATE_MULT_TRIANGLE_COL_N_BUSY; - FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = recomb_rdy ? FSM_STATE_IDLE : FSM_STATE_MULT_SQUARE_HOLDOFF; + FSM_STATE_MULT_TRIANGLE_HOLDOFF: fsm_state_next = FSM_STATE_MULT_TRIANGLE_HOLDOFF;//recomb_rdy ? 
FSM_STATE_IDLE : FSM_STATE_MULT_SQUARE_HOLDOFF; default: fsm_state_next = FSM_STATE_IDLE ; diff --git a/rtl/dsp/dsp_array.v b/rtl/dsp/dsp_array.v index 178f87f..2a050d4 100644 --- a/rtl/dsp/dsp_array.v +++ b/rtl/dsp/dsp_array.v @@ -8,11 +8,11 @@ module dsp_array input ce_p, input ce_mode, - input [8 -1:0] mode_z, + input [9 -1:0] mode_z, - input [4*18-1:0] a, + input [5*18-1:0] a, input [1*17-1:0] b, - output [8*47-1:0] p + output [9*47-1:0] p ); `include "../modexpng_parameters_x8.vh" @@ -37,7 +37,7 @@ module dsp_array genvar z; generate for (z=0; z<(NUM_MULTS/2); z=z+1) // - begin : DSP48E1 + begin : gen_DSP48E1 // dsp_slice # ( @@ -64,7 +64,7 @@ module dsp_array .opmode ({1'b0, mode_z[2*z], 1'b0, 2'b01, 2'b01}), .alumode (4'b0000), - .casc_a_in ({17{1'b0}}), + .casc_a_in ({18{1'b0}}), .casc_b_in ({17{1'b0}}), .casc_a_out (casc_a[z]), @@ -107,5 +107,37 @@ module dsp_array // endgenerate + dsp_slice # + ( + .AB_INPUT("DIRECT"), + .B_REG(2) + ) + dsp_aux + ( + .clk (clk), + + .ce_a1 (ce_a0), + .ce_b1 (ce_b0), + .ce_a2 (ce_a1), + .ce_b2 (ce_b1), + .ce_m (ce_m), + .ce_p (ce_p), + .ce_mode (ce_mode), + + .a (a[4*18+:18]), + .b (b), + .p (p[47*2*4+:47]), + + .inmode (5'b00000), + .opmode ({1'b0, mode_z[2*4], 1'b0, 2'b01, 2'b01}), + .alumode (4'b0000), + + .casc_a_in ({18{1'b0}}), + .casc_b_in ({17{1'b0}}), + + .casc_a_out (), + .casc_b_out () + ); + endmodule diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm.vh index c237a0b..2700a42 100644 --- a/rtl/modexpng_mmm_fsm.vh +++ b/rtl/modexpng_mmm_fsm.vh @@ -19,6 +19,16 @@ localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_TRIG = 15; localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_BUSY = 16; localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_HOLDOFF = 17; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_COL_0_INIT = 21; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_COL_0_TRIG = 22; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_COL_0_BUSY = 23; + 
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_COL_N_INIT = 24; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_COL_N_TRIG = 25; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_COL_N_BUSY = 26; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_TRIANGLE_HOLDOFF = 27; + localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_STOP = 999;
\ No newline at end of file diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v index db4774b..c51e7ef 100644 --- a/rtl/modexpng_part_recombinator.v +++ b/rtl/modexpng_part_recombinator.v @@ -7,7 +7,8 @@ module modexpng_part_recombinator dsp_x_ce_p, dsp_y_ce_p, ena_x, ena_y, dsp_x_p, dsp_y_p, - col_index, col_index_last, slim_bram_xy_addr, + col_index, col_index_last, + slim_bram_xy_addr, slim_bram_xy_bank, fat_bram_xy_bank, fat_bram_xy_addr, fat_bram_x_dout, fat_bram_y_dout, fat_bram_xy_dout_valid ); @@ -28,11 +29,12 @@ module modexpng_part_recombinator input dsp_y_ce_p; input ena_x; input ena_y; - input [8*47-1:0] dsp_x_p; - input [8*47-1:0] dsp_y_p; + input [9*47-1:0] dsp_x_p; + input [9*47-1:0] dsp_y_p; input [ 4:0] col_index; input [ 4:0] col_index_last; input [ 7:0] slim_bram_xy_addr; + input [ 1:0] slim_bram_xy_bank; output [ 2:0] fat_bram_xy_bank; output [ 7:0] fat_bram_xy_addr; @@ -44,18 +46,18 @@ module modexpng_part_recombinator // // Latches // - reg [1*47-1:0] dsp_x_p_latch[0:7]; - reg [1*47-1:0] dsp_y_p_latch[0:7]; + reg [1*47-1:0] dsp_x_p_latch[0:8]; + reg [1*47-1:0] dsp_y_p_latch[0:8]; // // Mapping // - wire [46:0] dsp_x_p_split[0:7]; - wire [46:0] dsp_y_p_split[0:7]; + wire [46:0] dsp_x_p_split[0:8]; + wire [46:0] dsp_y_p_split[0:8]; genvar z; - generate for (z=0; z<NUM_MULTS; z=z+1) + generate for (z=0; z<(NUM_MULTS+1); z=z+1) begin : gen_dsp_xy_p_split assign dsp_x_p_split[z] = dsp_x_p[47*z+:47]; assign dsp_y_p_split[z] = dsp_y_p[47*z+:47]; @@ -83,6 +85,8 @@ module modexpng_part_recombinator // valid reg x_valid_lsb = 1'b0; reg y_valid_lsb = 1'b0; + reg x_aux_lsb = 1'b0; + reg y_aux_lsb = 1'b0; reg x_valid_msb = 1'b0; reg y_valid_msb = 1'b0; @@ -106,6 +110,10 @@ module modexpng_part_recombinator reg x_valid_latch_lsb = 1'b0; reg y_valid_latch_lsb = 1'b0; + // aux - latch + reg x_aux_latch_lsb = 1'b0; + reg y_aux_latch_lsb = 1'b0; + // bitmap - latch reg [7:0] x_bitmap_latch_lsb = {8{1'b0}}; reg [7:0] 
y_bitmap_latch_lsb = {8{1'b0}}; @@ -125,6 +133,7 @@ module modexpng_part_recombinator // reg xy_valid_lsb_adv[1:6]; reg xy_valid_msb_adv[1:6]; + reg xy_aux_lsb_adv[1:6]; reg [7:0] xy_bitmap_lsb_adv[1:6]; reg [7:0] xy_bitmap_msb_adv[1:6]; reg [2:0] xy_index_lsb_adv[1:6]; @@ -132,11 +141,25 @@ module modexpng_part_recombinator reg xy_purge_lsb_adv[1:6]; reg xy_purge_msb_adv[1:6]; - + reg [1:0] rcmb_mode; + + always @(posedge clk) + // + if (ena_x && ena_y) + // + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_BUSY: rcmb_mode <= 2'd1; + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY: rcmb_mode <= 2'd2; + //FSM_STATE_MULT_RECTANGLE_COL_0_BUSY: rcmb_mode <= 2'd3; + default: rcmb_mode <= 2'd0; + endcase + + integer i; initial for (i=1; i<6; i=i+1) begin xy_valid_lsb_adv[i] = 1'b0; xy_valid_msb_adv[i] = 1'b0; + xy_aux_lsb_adv[i] = 1'b0; xy_bitmap_lsb_adv[i] = {8{1'b0}}; xy_bitmap_msb_adv[i] = {8{1'b0}}; xy_index_lsb_adv[i] = 3'dX; @@ -145,7 +168,7 @@ module modexpng_part_recombinator xy_purge_msb_adv[i] = 1'b0; end - function [0:0] calc_square_valid_lsb; + function calc_square_valid_lsb; input [4:0] col_index_value; input [4:0] col_index_last_value; input [7:0] slim_bram_xy_addr_value; @@ -159,6 +182,40 @@ module modexpng_part_recombinator end endfunction + function calc_triangle_valid_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + calc_triangle_valid_lsb = 1'b1; + else + calc_triangle_valid_lsb = 1'b0; + // + end + endfunction + + function calc_triangle_aux_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + input [1:0] slim_bram_xy_bank_value; + begin + // + if (slim_bram_xy_bank_value == BANK_SLIM_N_COEFF_EXT) + calc_triangle_aux_lsb = 1'b1; + else + calc_triangle_aux_lsb = 1'b0; + // + //if (slim_bram_xy_addr_value[7:3] == col_index_value) + //calc_triangle_aux_lsb = 1'b1; + //else + 
//calc_triangle_aux_lsb = 1'b0; + // + end + endfunction + function [7:0] calc_square_bitmap_lsb; input [4:0] col_index_value; input [4:0] col_index_last_value; @@ -183,7 +240,32 @@ module modexpng_part_recombinator // end endfunction - + + function [7:0] calc_triangle_bitmap_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + // + case (slim_bram_xy_addr_value[2:0]) + 3'b000: calc_triangle_bitmap_lsb = 8'b00000001; + 3'b001: calc_triangle_bitmap_lsb = 8'b00000010; + 3'b010: calc_triangle_bitmap_lsb = 8'b00000100; + 3'b011: calc_triangle_bitmap_lsb = 8'b00001000; + 3'b100: calc_triangle_bitmap_lsb = 8'b00010000; + 3'b101: calc_triangle_bitmap_lsb = 8'b00100000; + 3'b110: calc_triangle_bitmap_lsb = 8'b01000000; + 3'b111: calc_triangle_bitmap_lsb = 8'b10000000; + endcase + // + else + calc_triangle_bitmap_lsb = {8{1'b0}}; + // + end + endfunction + function [2:0] calc_square_index_lsb; input [4:0] col_index_value; input [4:0] col_index_last_value; @@ -208,6 +290,31 @@ module modexpng_part_recombinator // end endfunction + + function [2:0] calc_triangle_index_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + // + case (slim_bram_xy_addr_value[2:0]) + 3'b000: calc_triangle_index_lsb = 3'd0; + 3'b001: calc_triangle_index_lsb = 3'd1; + 3'b010: calc_triangle_index_lsb = 3'd2; + 3'b011: calc_triangle_index_lsb = 3'd3; + 3'b100: calc_triangle_index_lsb = 3'd4; + 3'b101: calc_triangle_index_lsb = 3'd5; + 3'b110: calc_triangle_index_lsb = 3'd6; + 3'b111: calc_triangle_index_lsb = 3'd7; + endcase + // + else + calc_triangle_index_lsb = 3'dX; + // + end + endfunction function calc_square_purge_lsb; input [4:0] col_index_value; @@ -271,10 +378,10 @@ module modexpng_part_recombinator reg recomb_lsb_ce = 1'b0; + reg 
recomb_lsb_ce_aux; reg [ 2:0] recomb_lsb_ce_purge = 3'b000; - wire recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_purge[0]; + wire recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_aux | recomb_lsb_ce_purge[0]; reg recomb_lsb_clr; - reg recomb_lsb_vld = 1'b0; reg [46:0] recomb_lsb_din; wire [15:0] recomb_lsb_dout; @@ -283,12 +390,7 @@ module modexpng_part_recombinator reg [ 1:0] recomb_msb_ce_purge = 2'b00; wire recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0]; reg recomb_msb_clr; - reg recomb_msb_vld = 1'b0; - always @(posedge clk) - // - {recomb_msb_vld, recomb_lsb_vld} <= {recomb_msb_ce_combined, recomb_lsb_ce_combined}; - reg [46:0] recomb_msb_din; wire [15:0] recomb_msb_dout; @@ -313,6 +415,7 @@ module modexpng_part_recombinator always @(posedge clk) begin // recomb_lsb_ce <= x_valid_latch_lsb; + recomb_lsb_ce_aux <= x_aux_latch_lsb; recomb_msb_ce <= x_bitmap_latch_msb[0]; // if (x_purge_latch_lsb) @@ -342,6 +445,8 @@ module modexpng_part_recombinator // if (x_valid_latch_lsb) recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb]; + else if (x_aux_latch_lsb) + recomb_lsb_din <= dsp_x_p_latch[8]; else recomb_lsb_din <= {47{1'b0}}; @@ -363,6 +468,7 @@ module modexpng_part_recombinator FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin // xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr); + xy_aux_lsb_adv [6] <= 1'b0; xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr); xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr); xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr); @@ -373,9 +479,27 @@ module modexpng_part_recombinator // end // + FSM_STATE_MULT_TRIANGLE_COL_0_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_N_TRIG, + FSM_STATE_MULT_TRIANGLE_COL_0_BUSY, + FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: begin + // + xy_valid_lsb_adv [6] <= calc_triangle_valid_lsb (col_index, col_index_last, 
slim_bram_xy_addr); /// bank + xy_aux_lsb_adv [6] <= calc_triangle_aux_lsb (col_index, col_index_last, slim_bram_xy_addr, slim_bram_xy_bank); + xy_bitmap_lsb_adv[6] <= calc_triangle_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr); //! bank + xy_index_lsb_adv [6] <= calc_triangle_index_lsb (col_index, col_index_last, slim_bram_xy_addr); // ! bank!!! + xy_purge_lsb_adv [6] <= 1'b0; + // + xy_valid_msb_adv [6] <= 1'b0; + xy_bitmap_msb_adv[6] <= {8{1'b0}}; + xy_purge_msb_adv [6] <= 1'b0; + // + end + // default: begin // xy_valid_lsb_adv [6] <= 1'b0; + xy_aux_lsb_adv [6] <= 1'b0; xy_bitmap_lsb_adv[6] <= {8{1'b0}}; xy_index_lsb_adv [6] <= 3'dX; xy_purge_lsb_adv [6] <= 1'b0; @@ -392,11 +516,13 @@ module modexpng_part_recombinator always @(posedge clk) begin // {y_valid_lsb, x_valid_lsb} <= {2{xy_valid_lsb_adv [1]}}; + {y_aux_lsb, x_aux_lsb} <= {2{xy_aux_lsb_adv [1]}}; {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}}; {y_index_lsb, x_index_lsb} <= {2{xy_index_lsb_adv [1]}}; {y_purge_lsb, x_purge_lsb} <= {2{xy_purge_lsb_adv [1]}}; // {y_valid_latch_lsb, x_valid_latch_lsb} <= {y_valid_lsb, x_valid_lsb}; + {y_aux_latch_lsb, x_aux_latch_lsb} <= {y_aux_lsb, x_aux_lsb}; {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb}; {y_index_latch_lsb, x_index_latch_lsb} <= {y_index_lsb, x_index_lsb}; {y_purge_latch_lsb, x_purge_latch_lsb} <= {y_purge_lsb, x_purge_lsb}; @@ -415,6 +541,7 @@ module modexpng_part_recombinator // for (i=1; i<6; i=i+1) begin xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1]; + xy_aux_lsb_adv [i] <= xy_aux_lsb_adv [i+1]; xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1]; xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1]; xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1]; @@ -436,7 +563,7 @@ module modexpng_part_recombinator else dsp_x_p_latch[i] <= {47{1'bX}}; // - else if (dsp_x_ce_p_dly1) + else if (dsp_x_ce_p_dly1) begin // for (i=0; i<8; i=i+1) // @@ -444,6 +571,11 @@ module modexpng_part_recombinator dsp_x_p_latch[i] <= 
dsp_x_p_split[i]; else if (x_valid_msb && x_bitmap_msb[i]) dsp_x_p_latch[i] <= dsp_x_p_split[i]; + // + if (x_aux_lsb) + dsp_x_p_latch[8] <= dsp_x_p_split[8]; + // + end reg recomb_x_lsb_dout_valid = 1'b0; reg recomb_x_msb_dout_valid = 1'b0; @@ -493,12 +625,187 @@ module modexpng_part_recombinator else rdy_reg <= rdy_adv; + + task advance_recomb_msb_dout_delay; + input [15:0] dout; + input [ 7:0] cnt; + begin + recomb_msb_dout_delay_0 <= dout; + recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0; + recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1; + // + recomb_msb_cnt_delay_0 <= cnt; + recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0; + recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1; + end + endtask + + task shift_recomb_msb_dout_carry; + input [15:0] dout; + begin + recomb_msb_dout_carry_0 <= dout; + recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0; + end + endtask + + task _update_fat_bram_regs; + input [ 2:0] bank; + input [ 7:0] addr; + input [17:0] dout_x; + input [17:0] dout_y; + input valid; + begin + fat_bram_xy_bank_reg <= bank; + fat_bram_xy_addr_reg <= addr; + fat_bram_x_dout_reg <= dout_x; + fat_bram_y_dout_reg <= dout_y; + fat_bram_xy_dout_valid_reg <= 1'b1; + end + endtask + + + task set_fat_bram_regs; + input [ 2:0] bank; + input [ 7:0] addr; + input [17:0] dout_x; + input [17:0] dout_y; + begin + _update_fat_bram_regs(bank, addr, dout_x, dout_y, 1'b1); + end + endtask + + task clear_fat_bram_regs; + begin + _update_fat_bram_regs(3'bXXX, 8'hXX, {18{1'bX}}, {18{1'bX}}, 1'b0); + end + endtask + + task _set_fat_bram_cnt_lsb; + input [7:0] cnt; + begin + fat_bram_xy_cnt_lsb <= cnt; + end + endtask + task _set_fat_bram_cnt_msb; + input [7:0] cnt; + begin + fat_bram_xy_cnt_msb <= cnt; + end + endtask + + task inc_fat_bram_cnt_lsb; + begin + _set_fat_bram_cnt_lsb(fat_bram_xy_cnt_lsb + 1'b1); + end + endtask + task inc_fat_bram_cnt_msb; + begin + _set_fat_bram_cnt_msb(fat_bram_xy_cnt_msb + 1'b1); + end + endtask + + task clr_fat_bram_cnt_lsb; + begin 
+ _set_fat_bram_cnt_lsb(8'd0); + end + endtask + task clr_fat_bram_cnt_msb; + begin + _set_fat_bram_cnt_msb(8'd0); + end + endtask + + + + + + wire [1:0] rcmb_xy_dout_valid = {recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid}; + + always @(posedge clk) + // + if (ena_x & ena_y) begin + clr_fat_bram_cnt_lsb(); + clr_fat_bram_cnt_msb(); + end else begin // if not ready??? + // + case (rcmb_mode) + 2'd1: recombine_square(); + 2'd2: recombine_triangle(); + endcase + // + end + + task recombine_square; + begin + // + case (rcmb_xy_dout_valid) + // + 2'b01: inc_fat_bram_cnt_lsb(); + 2'b10: inc_fat_bram_cnt_msb(); + 2'b11: begin + if (fat_bram_xy_cnt_lsb == index_last) clr_fat_bram_cnt_lsb(); + else inc_fat_bram_cnt_lsb(); + inc_fat_bram_cnt_msb(); + end + // + endcase + // + case (rcmb_xy_dout_valid) + // + 2'b00: if (recomb_msb_cnt_delay_2 > 8'd0) set_fat_bram_regs(BANK_FAT_ABH, recomb_msb_cnt_delay_2, {2'b00, recomb_msb_dout_delay_2}, {18{1'bX}}); + else clear_fat_bram_regs(); + 2'b01: set_fat_bram_regs(BANK_FAT_ABL, fat_bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); + 2'b10: if (fat_bram_xy_cnt_msb < 8'd2) clear_fat_bram_regs(); + else set_fat_bram_regs(BANK_FAT_ABH, fat_bram_xy_cnt_msb, {2'b00, recomb_msb_dout}, {18{1'bX}}); + 2'b11: if (fat_bram_xy_cnt_lsb < index_last) set_fat_bram_regs(BANK_FAT_ABH, fat_bram_xy_cnt_lsb, {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}}, {18{1'bX}}); + else set_fat_bram_regs(BANK_FAT_ABL, fat_bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); + // + endcase + // + case (rcmb_xy_dout_valid) + // + 2'b00: if (recomb_msb_cnt_delay_2 > 8'd0) advance_recomb_msb_dout_delay(16'hXXXX, 8'd0); + 2'b10: if (fat_bram_xy_cnt_msb < 8'd2) shift_recomb_msb_dout_carry(recomb_msb_dout); + // + 2'b11: begin advance_recomb_msb_dout_delay(recomb_msb_dout, fat_bram_xy_cnt_msb); + if (fat_bram_xy_cnt_lsb < index_last) shift_recomb_msb_dout_carry({16{1'bX}}); + end + // + endcase + // + end + // + endtask + + + task 
recombine_triangle; + begin + // + case (rcmb_xy_dout_valid) + // + 2'b01: begin inc_fat_bram_cnt_lsb(); + if (fat_bram_xy_cnt_lsb == index_last) inc_fat_bram_cnt_msb(); + end + // + endcase + // + case (rcmb_xy_dout_valid) + // + 2'b00: clear_fat_bram_regs(); + 2'b01: if (fat_bram_xy_cnt_msb == 8'd0) set_fat_bram_regs(BANK_FAT_Q, fat_bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); + else set_fat_bram_regs(BANK_FAT_Q_EXT, fat_bram_xy_cnt_lsb, {2'b00, recomb_lsb_dout}, {18{1'bX}}); + // + endcase + // + end + endtask + + + always @(posedge clk) // if (ena_x & ena_y) begin rdy_adv <= 1'b0; - fat_bram_xy_cnt_lsb <= 8'd0; - fat_bram_xy_cnt_msb <= 8'd0; end else begin // case ({recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid}) @@ -509,115 +816,13 @@ module modexpng_part_recombinator // rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0; // - recomb_msb_dout_delay_0 <= {18{1'bX}}; - recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0; - recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1; - // - recomb_msb_cnt_delay_0 <= 8'd0; - recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0; - recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1; - // - fat_bram_xy_bank_reg <= BANK_FAT_ABH; - fat_bram_xy_addr_reg <= recomb_msb_cnt_delay_2; - fat_bram_x_dout_reg <= recomb_msb_dout_delay_2; -// fat_bram_y_dout_reg <= {18{1'bX}}; - fat_bram_xy_dout_valid_reg <= 1'b1; - // - end else begin - // - fat_bram_xy_bank_reg <= 3'bXXX; - fat_bram_xy_addr_reg <= 8'hXX; - fat_bram_x_dout_reg <= {18{1'bX}}; - fat_bram_y_dout_reg <= {18{1'bX}}; - fat_bram_xy_dout_valid_reg <= 1'b0; - // - end - // - end - // - 2'b01: begin - // - fat_bram_xy_bank_reg <= BANK_FAT_ABL; - fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb; - fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout}; -// fat_bram_y_dout_reg - fat_bram_xy_dout_valid_reg <= 1'b1; - // - fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1; - // - end - // - 2'b10: begin - // - if (fat_bram_xy_cnt_msb < 8'd2) begin - // - recomb_msb_dout_carry_0 <= recomb_msb_dout; 
- recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0; - // - fat_bram_xy_bank_reg <= 3'bXXX; - fat_bram_xy_addr_reg <= 8'hXX; - fat_bram_x_dout_reg <= {18{1'bX}}; - // fat_bram_y_dout_reg - fat_bram_xy_dout_valid_reg <= 1'b0; - // - end else begin - // - fat_bram_xy_bank_reg <= BANK_FAT_ABH; - fat_bram_xy_addr_reg <= fat_bram_xy_cnt_msb; - fat_bram_x_dout_reg <= {2'b00, recomb_msb_dout}; - // fat_bram_y_dout_reg - fat_bram_xy_dout_valid_reg <= 1'b1; - // - end - // - fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1; - // - end - // - 2'b11: begin - // - if (fat_bram_xy_cnt_lsb == index_last) begin - // - fat_bram_xy_bank_reg <= BANK_FAT_ABL; - fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb; - fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout}; -// fat_bram_y_dout_reg <= {18{1'bX}}; - fat_bram_xy_dout_valid_reg <= 1'b1; - // - fat_bram_xy_cnt_lsb <= 8'd0; - // - end else begin - // - fat_bram_xy_bank_reg <= BANK_FAT_ABH; - fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb; - fat_bram_x_dout_reg <= {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}}; -// fat_bram_y_dout_reg <= {18{1'bX}}; - fat_bram_xy_dout_valid_reg <= 1'b1; - // - fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1; - // - recomb_msb_dout_carry_0 <= {16{1'bX}}; - recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0; - // end // - recomb_msb_dout_delay_0 <= recomb_msb_dout; - recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0; - recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1; - // - recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb; - recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0; - recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1; - // - fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1; - // end - // endcase // end - - - + + endmodule |