From 2791a17430c5b0c3291be3824aa8cdf07f305e92 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Tue, 11 Feb 2020 15:54:22 +0300 Subject: More elegant way to do partial product recombination: * take advantage of the cascade paths between DSP slices * decrease latency of operation --- rtl/modexpng_recombinator_block.v | 83 +++++------------ rtl/modexpng_recombinator_cell.v | 185 +++++++++++++++++++------------------- 2 files changed, 115 insertions(+), 153 deletions(-) diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v index e3cb50f..62d84e1 100644 --- a/rtl/modexpng_recombinator_block.v +++ b/rtl/modexpng_recombinator_block.v @@ -496,10 +496,8 @@ module modexpng_recombinator_block reg rcmb_xy_lsb_ce = 1'b0; reg rcmb_xy_lsb_ce_aux = 1'b0; - reg rcmb_xy_lsb_ce_aux_dly = 1'b0; reg [ 2:0] rcmb_xy_lsb_ce_purge = 3'b000; - wire rcmb_xy_lsb_ce_combined = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0]; - wire rcmb_xy_lsb_ce_combined_ext = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0] | rcmb_xy_lsb_ce_aux_dly; + wire rcmb_xy_lsb_ce_combined = rcmb_xy_lsb_ce | rcmb_xy_lsb_ce_aux | rcmb_xy_lsb_ce_purge[0]; reg rcmb_xy_lsb_clr; wire rcmb_xy_lsb_cry = !xy_valid_latch_lsb && rcmb_xy_lsb_ce_purge[1]; @@ -512,9 +510,7 @@ module modexpng_recombinator_block reg rcmb_xy_msb_ce = 1'b0; reg [ 1:0] rcmb_xy_msb_ce_purge = 2'b00; - reg rcmb_xy_msb_ce_purge0_rectangle_dly = 1'b0; - wire rcmb_xy_msb_ce_combined = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0]; - wire rcmb_xy_msb_ce_combined_ext = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0] | rcmb_xy_msb_ce_purge0_rectangle_dly; + wire rcmb_xy_msb_ce_combined = rcmb_xy_msb_ce | rcmb_xy_msb_ce_purge[0]; reg rcmb_xy_msb_clr; reg [ MAC_W -1:0] rcmb_x_msb_din; @@ -522,46 +518,44 @@ module modexpng_recombinator_block wire [WORD_W -1:0] rcmb_x_msb_dout; wire [WORD_W -1:0] rcmb_y_msb_dout; - always @(posedge clk) rcmb_xy_lsb_ce_aux_dly <= rcmb_xy_lsb_ce_aux; - always @(posedge clk) 
rcmb_xy_msb_ce_purge0_rectangle_dly <= rcmb_mode == RCMB_MODE_RECTANGLE ? rcmb_xy_msb_ce_purge[0] : 1'b0; - - modexpng_recombinator_cell recomb_x_lsb_new + modexpng_recombinator_cell recomb_x_lsb ( .clk (clk), - .ce (rcmb_xy_lsb_ce_combined_ext), + .ce (rcmb_xy_lsb_ce_combined), .clr (rcmb_xy_lsb_clr), .din (rcmb_x_lsb_din), .dout (rcmb_x_lsb_dout), .doutw (rcmb_x_lsb_doutw) ); - modexpng_recombinator_cell recomb_y_lsb_new + modexpng_recombinator_cell recomb_y_lsb ( .clk (clk), - .ce (rcmb_xy_lsb_ce_combined_ext), + .ce (rcmb_xy_lsb_ce_combined), .clr (rcmb_xy_lsb_clr), .din (rcmb_y_lsb_din), .dout (rcmb_y_lsb_dout), .doutw (rcmb_y_lsb_doutw) ); - modexpng_recombinator_cell recomb_x_msb_new + modexpng_recombinator_cell recomb_x_msb ( .clk (clk), - .ce (rcmb_xy_msb_ce_combined_ext), + .ce (rcmb_xy_msb_ce_combined), .clr (rcmb_xy_msb_clr), .din (rcmb_x_msb_din), .dout (rcmb_x_msb_dout), .doutw () ); - modexpng_recombinator_cell recomb_y_msb_new + modexpng_recombinator_cell recomb_y_msb ( .clk (clk), - .ce (rcmb_xy_msb_ce_combined_ext), + .ce (rcmb_xy_msb_ce_combined), .clr (rcmb_xy_msb_clr), .din (rcmb_y_msb_din), .dout (rcmb_y_msb_dout), .doutw () ); + always @(posedge clk) begin // @@ -596,8 +590,8 @@ module modexpng_recombinator_block rcmb_x_lsb_din <= dsp_x_p_latch[NUM_MULTS_AUX-1]; rcmb_y_lsb_din <= dsp_y_p_latch[NUM_MULTS_AUX-1]; end else if (rcmb_xy_lsb_cry) begin - rcmb_x_lsb_din <= rcmb_x_msb_carry_1; - rcmb_y_lsb_din <= rcmb_y_msb_carry_1; + rcmb_x_lsb_din <= {{(MAC_W-WORD_W){1'b0}}, rcmb_x_msb_carry_1}; + rcmb_y_lsb_din <= {{(MAC_W-WORD_W){1'b0}}, rcmb_y_msb_carry_1}; end else begin rcmb_x_lsb_din <= {MAC_W{1'b0}}; rcmb_y_lsb_din <= {MAC_W{1'b0}}; @@ -759,52 +753,17 @@ module modexpng_recombinator_block end - reg rcmb_xy_lsb_ce_combined_dly1 = 1'b0; - reg rcmb_xy_msb_ce_combined_dly1 = 1'b0; - - reg rcmb_xy_lsb_ce_combined_dly2 = 1'b0; - reg rcmb_xy_msb_ce_combined_dly2 = 1'b0; - - reg rcmb_xy_lsb_ce_combined_dly3 = 1'b0; - reg rcmb_xy_msb_ce_combined_dly3 
= 1'b0; - - reg rcmb_xy_lsb_ce_combined_dly4 = 1'b0; - reg rcmb_xy_msb_ce_combined_dly4 = 1'b0; - - reg rcmb_xy_lsb_ce_combined_dly5 = 1'b0; - reg rcmb_xy_msb_ce_combined_dly5 = 1'b0; - - reg rcmb_xy_lsb_ce_combined_dly6 = 1'b0; - reg rcmb_xy_msb_ce_combined_dly6 = 1'b0; + reg [4:1] rcmb_xy_lsb_ce_combined_dly; + reg [4:1] rcmb_xy_msb_ce_combined_dly; always @(posedge clk or negedge rst_n) // if (!rst_n) begin - rcmb_xy_lsb_ce_combined_dly1 <= 1'b0; - rcmb_xy_msb_ce_combined_dly1 <= 1'b0; - rcmb_xy_lsb_ce_combined_dly2 <= 1'b0; - rcmb_xy_msb_ce_combined_dly2 <= 1'b0; - rcmb_xy_lsb_ce_combined_dly3 <= 1'b0; - rcmb_xy_msb_ce_combined_dly3 <= 1'b0; - rcmb_xy_lsb_ce_combined_dly4 <= 1'b0; - rcmb_xy_msb_ce_combined_dly4 <= 1'b0; - rcmb_xy_lsb_ce_combined_dly5 <= 1'b0; - rcmb_xy_msb_ce_combined_dly5 <= 1'b0; - rcmb_xy_lsb_ce_combined_dly6 <= 1'b0; - rcmb_xy_msb_ce_combined_dly6 <= 1'b0; + rcmb_xy_lsb_ce_combined_dly <= 4'b0000; + rcmb_xy_msb_ce_combined_dly <= 4'b0000; end else begin - rcmb_xy_lsb_ce_combined_dly1 <= rcmb_xy_lsb_ce_combined; - rcmb_xy_msb_ce_combined_dly1 <= rcmb_xy_msb_ce_combined; - rcmb_xy_lsb_ce_combined_dly2 <= rcmb_xy_lsb_ce_combined_dly1; - rcmb_xy_msb_ce_combined_dly2 <= rcmb_xy_msb_ce_combined_dly1; - rcmb_xy_lsb_ce_combined_dly3 <= rcmb_xy_lsb_ce_combined_dly2; - rcmb_xy_msb_ce_combined_dly3 <= rcmb_xy_msb_ce_combined_dly2; - rcmb_xy_lsb_ce_combined_dly4 <= rcmb_xy_lsb_ce_combined_dly3; - rcmb_xy_msb_ce_combined_dly4 <= rcmb_xy_msb_ce_combined_dly3; - rcmb_xy_lsb_ce_combined_dly5 <= rcmb_xy_lsb_ce_combined_dly4; - rcmb_xy_msb_ce_combined_dly5 <= rcmb_xy_msb_ce_combined_dly4; - rcmb_xy_lsb_ce_combined_dly6 <= rcmb_xy_lsb_ce_combined_dly5; - rcmb_xy_msb_ce_combined_dly6 <= rcmb_xy_msb_ce_combined_dly5; + rcmb_xy_lsb_ce_combined_dly <= {rcmb_xy_lsb_ce_combined_dly[3:1], rcmb_xy_lsb_ce_combined}; + rcmb_xy_msb_ce_combined_dly <= {rcmb_xy_msb_ce_combined_dly[3:1], rcmb_xy_msb_ce_combined}; end reg rcmb_xy_lsb_valid = 1'b0; @@ -816,8 +775,8 @@ module 
modexpng_recombinator_block rcmb_xy_lsb_valid <= 1'b0; rcmb_xy_msb_valid <= 1'b0; end else begin - rcmb_xy_lsb_valid <= rcmb_xy_lsb_ce_combined_dly6; - rcmb_xy_msb_valid <= rcmb_xy_msb_ce_combined_dly6; + rcmb_xy_lsb_valid <= rcmb_xy_lsb_ce_combined_dly[4]; + rcmb_xy_msb_valid <= rcmb_xy_msb_ce_combined_dly[4]; end diff --git a/rtl/modexpng_recombinator_cell.v b/rtl/modexpng_recombinator_cell.v index 0c9ab00..28d17f2 100644 --- a/rtl/modexpng_recombinator_cell.v +++ b/rtl/modexpng_recombinator_cell.v @@ -58,39 +58,51 @@ module modexpng_recombinator_cell // - // din <=> {z[13:0], y[15:0], x[15:0]} + // Pipelined Clock Enable, Clear, Data Input // - wire [WORD_W -1:0] din_z = {2'b00, din[3 * WORD_W -3 : 2 * WORD_W]}; // [47:46][45:32] - wire [WORD_W -1:0] din_y = { din[2 * WORD_W -1 : WORD_W]}; // [31:16] - wire [WORD_W -1:0] din_x = { din[ WORD_W -1 : 0]}; // [15: 0] + reg ce_pipe = 1'b0; + reg clr_pipe; + reg [MAC_W-1:0] din_pipe; + + always @(posedge clk) + {ce_pipe, clr_pipe, din_pipe} <= {ce, clr, din}; // - // Delayed Clock Enables + // din_pipe <=> {z[13:0], y[15:0], x[15:0]} // - reg ce_dly1 = 1'b0, ce_dly2 = 1'b0, ce_dly3 = 1'b0, ce_dly4 = 1'b0, ce_dly5 = 1'b0, ce_dly6 = 1'b0; - always @(posedge clk) {ce_dly1, ce_dly2, ce_dly3, ce_dly4, ce_dly5, ce_dly6} <= {ce, ce_dly1, ce_dly2, ce_dly3, ce_dly4, ce_dly5}; + wire [WORD_W -1:0] din_z = {2'b00, din_pipe[3 * WORD_W -3 : 2 * WORD_W]}; // (47:46)[45:32] + wire [WORD_W -1:0] din_y = { din_pipe[2 * WORD_W -1 : WORD_W]}; // [31:16] + wire [WORD_W -1:0] din_x = { din_pipe[ WORD_W -1 : 0]}; // [15: 0] - - // - // Delayed Clear - // - reg clr_dly1, clr_dly2, clr_dly3, clr_dly4; - always @(posedge clk) {clr_dly1, clr_dly2, clr_dly3, clr_dly4} <= {clr, clr_dly1, clr_dly2, clr_dly3}; - // // Phase Flip-Flop // - reg phase_ff, phase_ff_dly1, phase_ff_dly2, phase_ff_dly3, phase_ff_dly4, phase_ff_dly5; + reg phase_ff = 1'b0; + always @(posedge clk) - if (ce) phase_ff <= ~phase_ff; - else if (clr) phase_ff <= 1'b0; + 
phase_ff <= ce_pipe ? ~phase_ff : 1'b0; - always @(posedge clk) - {phase_ff_dly1, phase_ff_dly2, phase_ff_dly3, phase_ff_dly4, phase_ff_dly5} <= {phase_ff, phase_ff_dly1, phase_ff_dly2, phase_ff_dly3, phase_ff_dly4}; - + + // + // Delayed Clock Enable, Clear, Data Input + // + wire master_ce_0; + reg master_ce_1 = 1'b0; + wire slave_ce_1; + reg slave_ce_2 = 1'b0; + reg dout_ce_3 = 1'b0; + reg dout_ce_4 = 1'b0; + + assign master_ce_0 = ce_pipe; + assign slave_ce_1 = master_ce_1; + always @(posedge clk) master_ce_1 <= ce_pipe & ~phase_ff; + always @(posedge clk) slave_ce_2 <= slave_ce_1; + always @(posedge clk) {dout_ce_3, dout_ce_4} <= {slave_ce_2, dout_ce_3}; + + // // Shift Registers // @@ -101,11 +113,11 @@ module modexpng_recombinator_cell always @(posedge clk) begin // - if (ce) {din_x_dly1, din_y_dly1, din_z_dly1} <= {din_x, din_y, din_z}; - else if (clr) {din_x_dly1, din_y_dly1, din_z_dly1} <= {WORD_ZERO, WORD_ZERO, WORD_ZERO}; + if (ce_pipe) {din_x_dly1, din_y_dly1, din_z_dly1} <= {din_x, din_y, din_z}; + else if (clr_pipe) {din_x_dly1, din_y_dly1, din_z_dly1} <= {WORD_ZERO, WORD_ZERO, WORD_ZERO}; // - if (ce) {din_z_dly2} <= {din_z_dly1}; - else if (clr) {din_z_dly2} <= {WORD_ZERO}; + if (ce_pipe) {din_z_dly2} <= {din_z_dly1}; + else if (clr_pipe) {din_z_dly2} <= {WORD_ZERO}; // end @@ -113,82 +125,65 @@ module modexpng_recombinator_cell // // DSP Input Registers // - reg [2 * WORD_W-1:0] master_ab_reg; - reg [2 * WORD_W-1:0] master_c_reg; - - reg [ WORD_W+1:0] slave_ab_reg; - reg [ WORD_W+1:0] slave_ab_next_reg; - + wire [2 * WORD_W-1:0] master_ab; + wire [2 * WORD_W-1:0] master_c; + wire [2 * WORD_W-1:0] slave_ab; + reg slave_c; + + assign master_ab = {din_y, din_y_dly1}; + assign master_c = {din_z_dly1, din_z_dly2}; + assign slave_ab = {din_x, din_x_dly1}; + // // DSP Cascade Bus // wire [DSP48E1_P_W-1:0] master_slave_p_int; - + // // DSP Output Buses // - wire [DSP48E1_P_W-1:0] master_p_int; + wire master_carry_out_int; wire [DSP48E1_P_W-1:0] 
slave_p_int; - - - // - // DSP Input Mapping - // - wire [DSP48E1_C_W-1:0] master_ab_int = {{(DSP48E1_C_W - 2 * WORD_W){1'b0}}, master_ab_reg}; - wire [DSP48E1_C_W-1:0] master_c_int = {{(DSP48E1_C_W - 2 * WORD_W){1'b0}}, master_c_reg}; - - wire [DSP48E1_C_W-1:0] slave_ab_int = {{(DSP48E1_C_W - (WORD_W+3)){1'b0}}, slave_ab_reg[WORD_W+1:WORD_W], 1'b1, slave_ab_reg[WORD_W-1:0]}; - wire [DSP48E1_C_W-1:0] slave_c_int = {DSP48E1_C_W{1'b0}}; + wire slave_carry_out_int; // - // Master DSP Input Logic + // Custom Carry Cascade // always @(posedge clk) // - if (ce) begin - master_ab_reg <= !phase_ff ? {din_y, din_y_dly1} : {din_x, din_x_dly1}; - master_c_reg <= !phase_ff ? {din_z_dly1, din_z_dly2} : {WORD_DNC, WORD_DNC}; - end else begin - master_ab_reg <= {WORD_DNC, WORD_DNC}; - master_c_reg <= {WORD_DNC, WORD_DNC}; - end - + if (slave_ce_2) slave_c <= master_carry_out_int; + // - // Slave DSP Input Logic + // DSP Input Mapping // - always @(posedge clk) begin - // - slave_ab_reg <= {(WORD_W+2){1'bX}}; - slave_ab_next_reg <= {(WORD_W+2){1'bX}}; - // - if (ce_dly3 && phase_ff_dly3) slave_ab_next_reg <= {master_p_int[2*WORD_W+1:WORD_W]}; - // - if (ce_dly3 && phase_ff_dly3) slave_ab_reg <= {2'b00, master_p_int[WORD_W-1:0]}; - if (ce_dly4 && phase_ff_dly4) slave_ab_reg <= slave_ab_next_reg; - // - end - + wire [DSP48E1_C_W-1:0] master_ab_int = {master_ab, {(DSP48E1_C_W - 2*WORD_W){1'b0}}}; + wire [DSP48E1_C_W-1:0] master_c_int = {master_c, {(DSP48E1_C_W - 2*WORD_W){1'b0}}}; + + wire [DSP48E1_C_W-1:0] slave_ab_int = {slave_ab, {(DSP48E1_C_W - 2*WORD_W){1'b0}}}; + wire [DSP48E1_C_W-1:0] slave_c_int = {{(2*WORD_W-1){1'b0}}, slave_c, {(DSP48E1_C_W-2*WORD_W){1'b1}}}; + // - // OPMODE Logic + // DSP Modes // - reg [DSP48E1_OPMODE_W-1:0] master_opmode; - reg [DSP48E1_OPMODE_W-1:0] slave_opmode; + wire [DSP48E1_OPMODE_W -1:0] master_opmode; + wire [DSP48E1_CARRYINSEL_W-1:0] master_carryinsel; + + reg [DSP48E1_OPMODE_W -1:0] slave_opmode; + reg [DSP48E1_CARRYINSEL_W-1:0] 
slave_carryinsel; + + assign master_opmode = DSP48E1_OPMODE_Z0_YC_XAB; + assign master_carryinsel = DSP48E1_CARRYINSEL_CARRYIN; always @(posedge clk) begin - // - if (ce) master_opmode <= !phase_ff ? DSP48E1_OPMODE_Z0_YC_XAB : DSP48E1_OPMODE_ZP_Y0_XAB; - else master_opmode <= DSP48E1_OPMODE_DNC; - // - if (ce_dly4) slave_opmode <= clr_dly4 ? DSP48E1_OPMODE_Z0_Y0_XAB : DSP48E1_OPMODE_ZP17_Y0_XAB; - else slave_opmode <= DSP48E1_OPMODE_DNC; - // + slave_opmode <= clr_pipe ? DSP48E1_OPMODE_ZPCIN_Y0_XAB : DSP48E1_OPMODE_ZPCIN_YC_XAB; + slave_carryinsel <= clr_pipe ? DSP48E1_CARRYINSEL_CARRYIN : DSP48E1_CARRYINSEL_CARRYCASCOUT; end - + // // DSP Slice Instances @@ -196,42 +191,50 @@ module modexpng_recombinator_cell `MODEXPNG_DSP_SLICE_ADDSUB dsp_master_inst ( .clk (clk), - .ce_abc (ce_dly1), - .ce_p (ce_dly2), - .ce_ctrl (ce_dly1), + .ce_abc (master_ce_0), + .ce_p (master_ce_1), + .ce_ctrl (master_ce_0), .ab (master_ab_int), .c (master_c_int), - .p (master_p_int), + .p (), .op_mode (master_opmode), .alu_mode (DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN), - .carry_in_sel (DSP48E1_CARRYINSEL_CARRYIN), + .carry_in_sel (master_carryinsel), .casc_p_in (), - .casc_p_out (), - .carry_out () + .casc_p_out (master_slave_p_int), + .carry_out (master_carry_out_int) ); `MODEXPNG_DSP_SLICE_ADDSUB dsp_slave_inst ( .clk (clk), - .ce_abc (ce_dly5), - .ce_p (ce_dly6), - .ce_ctrl (ce_dly5), + .ce_abc (slave_ce_1), + .ce_p (slave_ce_2), + .ce_ctrl (slave_ce_1), .ab (slave_ab_int), .c (slave_c_int), .p (slave_p_int), .op_mode (slave_opmode), .alu_mode (DSP48E1_ALUMODE_Z_PLUS_X_AND_Y_AND_CIN), - .carry_in_sel (DSP48E1_CARRYINSEL_CARRYIN), - .casc_p_in (), + .carry_in_sel (slave_carryinsel), + .casc_p_in (master_slave_p_int), .casc_p_out (), - .carry_out () + .carry_out (slave_carry_out_int) ); // // Output Register - // - assign dout = {slave_p_int[WORD_W-1:0]}; - assign doutw = {slave_p_int[WORD_W+1], dout}; + // + reg [WORD_W:0] doutx_reg; + + assign dout = doutx_reg[WORD_W-1:0]; + assign 
doutw = doutx_reg; + + always @(posedge clk) begin + doutx_reg <= {1'bX, WORD_DNC}; + if (dout_ce_4) doutx_reg <= {slave_carry_out_int, slave_p_int[DSP48E1_P_W - 0*WORD_W -1 -: WORD_W]}; + if (dout_ce_3) doutx_reg <= {1'b0, slave_p_int[DSP48E1_P_W - 1*WORD_W -1 -: WORD_W]}; + end endmodule -- cgit v1.2.3