diff options
author | Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> | 2019-10-01 15:01:43 +0300 |
---|---|---|
committer | Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> | 2019-10-01 15:01:43 +0300 |
commit | 29fb6afd018c601a2e0c7376656d5e37beb565d6 (patch) | |
tree | dc11ee0c8e5a30113052254be23594da74a8a572 /rtl | |
parent | ec07464d239f7f6379a682ac57b58b863d3f0374 (diff) |
Started working on the pipelined Montgomery modular multiplier. It can currently
do the "square" part of the multiplication, i.e. compute the double-width
intermediate product AB = A * B.
Diffstat (limited to 'rtl')
-rw-r--r-- | rtl/dev/temp.txt | 384 | ||||
-rw-r--r-- | rtl/dsp/dsp_array.v | 111 | ||||
-rw-r--r-- | rtl/dsp/dsp_slice.v | 125 | ||||
-rw-r--r-- | rtl/modexpng_mac.v | 54 | ||||
-rw-r--r-- | rtl/modexpng_mac_array.v | 116 | ||||
-rw-r--r-- | rtl/modexpng_mem.v | 93 | ||||
-rw-r--r-- | rtl/modexpng_mmm_col_index.v | 90 | ||||
-rw-r--r-- | rtl/modexpng_mmm_din_addr.v | 167 | ||||
-rw-r--r-- | rtl/modexpng_mmm_dout_addr.v | 167 | ||||
-rw-r--r-- | rtl/modexpng_mmm_fsm.vh | 24 | ||||
-rw-r--r-- | rtl/modexpng_mmm_pad.v | 153 | ||||
-rw-r--r-- | rtl/modexpng_mmm_transporter.v | 157 | ||||
-rw-r--r-- | rtl/modexpng_mmm_x8_dual.v | 550 | ||||
-rw-r--r-- | rtl/modexpng_parameters.vh | 39 | ||||
-rw-r--r-- | rtl/modexpng_parameters_x8.vh | 1 | ||||
-rw-r--r-- | rtl/modexpng_part_recombinator.v | 623 | ||||
-rw-r--r-- | rtl/modexpng_recombinator_block.v | 35 |
17 files changed, 2889 insertions, 0 deletions
diff --git a/rtl/dev/temp.txt b/rtl/dev/temp.txt new file mode 100644 index 0000000..987bd86 --- /dev/null +++ b/rtl/dev/temp.txt @@ -0,0 +1,384 @@ + // + // Helper Functions + // + /* + function [INDEX_WIDTH-1:0] calc_preset_a_index; + input [INDEX_WIDTH-4:0] col_in; + input integer x_in; + integer index_out; + begin + index_out = col_in * NUM_MULTS + x_in; + calc_preset_a_index = index_out[INDEX_WIDTH-1:0]; + end + endfunction + + function [INDEX_WIDTH-1:0] calc_rotate_a_index; + input [INDEX_WIDTH-1:0] current_index_in; + input [INDEX_WIDTH-1:0] last_index_in; + begin + if (current_index_in > {INDEX_WIDTH{1'b0}}) + calc_rotate_a_index = current_index_in - 1'b1; + else + calc_rotate_a_index = last_index_in; + end + endfunction + */ + + /* + // + // Narrow Counters + // + reg [INDEX_WIDTH-1:0] din_addr_narrow_reg; + reg [INDEX_WIDTH-1:0] din_addr_narrow_dly; + localparam [INDEX_WIDTH-1:0] din_addr_narrow_zero = {INDEX_WIDTH{1'b0}}; + wire [INDEX_WIDTH-1:0] din_addr_narrow_next = (din_addr_narrow_reg < index_last) ? 
+ din_addr_narrow_reg + 1'b1 : din_addr_narrow_zero; + wire din_addr_narrow_done = din_addr_narrow_reg == index_last; + + assign din_addr_narrow = din_addr_narrow_reg; + + always @(posedge clk) + // + din_addr_narrow_dly <= din_addr_narrow_reg; + + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero; + FSM_STATE_MULT_SQUARE_COL_0_BUSY: din_addr_narrow_reg <= din_addr_narrow_next; + FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_narrow_reg <= din_addr_narrow_next; + endcase + + + // + // Helper Functions + // + function [NUM_MULTS-1:0] calc_mac_clear_bitmask; + input [2:0] t; + begin + case (t) + 3'd0: calc_mac_clear_bitmask = 8'b00000001; + 3'd1: calc_mac_clear_bitmask = 8'b00000010; + 3'd2: calc_mac_clear_bitmask = 8'b00000100; + 3'd3: calc_mac_clear_bitmask = 8'b00001000; + 3'd4: calc_mac_clear_bitmask = 8'b00010000; + 3'd5: calc_mac_clear_bitmask = 8'b00100000; + 3'd6: calc_mac_clear_bitmask = 8'b01000000; + 3'd7: calc_mac_clear_bitmask = 8'b10000000; + endcase + end + endfunction + + function [NUM_MULTS:0] calc_mac_clear_square; + input [INDEX_WIDTH-4:0] current_col_index; + input [INDEX_WIDTH-1:0] b_addr_prev; + begin + if (b_addr_prev[INDEX_WIDTH-1:3] == current_col_index) + calc_mac_clear_square = {1'b0, calc_mac_clear_bitmask(b_addr_prev[2:0])}; + else + calc_mac_clear_square = {1'b0, {NUM_MULTS{1'b0}}}; + end + endfunction + + + // + // Wide Counters + // + reg [INDEX_WIDTH-1:0] din_addr_wide_reg[0:NUM_MULTS-1]; + + integer xi; + always @(posedge clk) + // + for (xi=0; xi<NUM_MULTS; xi=xi+1) + // + case (fsm_state_next) + // + FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(0, xi); + FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(col_index + 1'b1, xi); + // + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: 
din_addr_wide_reg[xi] <= calc_rotate_a_index(din_addr_wide_reg[xi], index_last); + // + endcase + + + // + // Enables + // + reg din_ena_narrow_reg = 1'b0; + reg [NUM_MULTS-1:0] din_ena_wide_reg = {NUM_MULTS{1'b0}}; + + assign din_ena_narrow = din_ena_narrow_reg; + assign din_ena_wide = din_ena_wide_reg; + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) din_ena_narrow_reg <= 1'b0; + else case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_narrow_reg <= 1'b1; + default: din_ena_narrow_reg <= 1'b0; + endcase + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) din_ena_wide_reg <= {NUM_MULTS{1'b0}}; + else case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_wide_reg <= {NUM_MULTS{1'b1}}; + default: din_ena_wide_reg <= {NUM_MULTS{1'b0}}; + endcase + + + // + // Modes + // + reg [2-1:0] din_mode_wide_reg; + reg [2-1:0] din_mode_narrow_reg; + reg [2-1:0] dout_mode_wide_reg; + reg [2-1:0] dout_mode_narrow_reg; + + assign din_mode_wide = din_mode_wide_reg; + assign din_mode_narrow = din_mode_narrow_reg; + + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_wide_reg <= MODEXPNG_MODE_A; + default: din_mode_wide_reg <= 2'bXX; + endcase + + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_narrow_reg <= MODEXPNG_MODE_B; + default: din_mode_narrow_reg <= 2'bXX; + endcase + + + // + // MAC Array + // + wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_a[0:NUM_MULTS]; + wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_b; + 
reg [ NUM_MULTS :0] mac_ce; + reg [ NUM_MULTS :0] mac_clr; + wire [ MODEXPNG_MAC_WIDTH-1:0] mac_p[0:NUM_MULTS]; + reg [ NUM_MULTS :0] mac_rdy_lsb; + reg [ NUM_MULTS :0] mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1:0]; + + //reg [ NUM_MULTS :0] mac_ce_dly[MODEXPNG_MAC_LATENCY-1:0]; + //wire [ NUM_MULTS :0] mac_rdy; + + + + + + assign mac_din_b = din_narrow; + + + genvar x; + generate for (x=0; x<=NUM_MULTS; x=x+1) + begin : gen_macs + // + //assign mac_rdy[x] = mac_ce_dly[MODEXPNG_MAC_LATENCY-1][x]; + // + modexpng_mac mac_inst + ( + .clk (clk), + .ce (mac_ce[x]), + .clr (mac_clr[x]), + .a (mac_din_a[x]), + .b (mac_din_b), + .p (mac_p[x]) + ); + // + end + // + endgenerate + + generate for (x=0; x<NUM_MULTS; x=x+1) + begin : gen_mac_din_a + // + assign mac_din_a[x] = din_wide[x*MODEXPNG_WORD_WIDTH+:MODEXPNG_WORD_WIDTH]; + // + end + endgenerate + + generate for (x=0; x<NUM_MULTS; x=x+1) + begin : gen_din_addr_wide + // + assign din_addr_wide[x*INDEX_WIDTH+:INDEX_WIDTH] = din_addr_wide_reg[x]; + // + end + endgenerate + + + // + // MAC Clock Enable Logic + // + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) mac_ce <= {1'b0, {NUM_MULTS{1'b0}}}; + else case (fsm_state) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_ce <= {1'b0, {NUM_MULTS{1'b1}}}; + default: mac_ce <= {1'b0, {NUM_MULTS{1'b0}}}; + endcase + + + // + // MAC Valid Logic + // + integer y; + + always @(posedge clk) + // + for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin + mac_rdy_lsb_dly[0][xi] <= mac_rdy_lsb[xi]; + for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1) + mac_rdy_lsb_dly[y][xi] <= mac_rdy_lsb_dly[y-1][xi]; + end + + always @(posedge clk) begin + // + fsm_state_dly[0] <= fsm_state; + for (y=1; y<=MODEXPNG_MAC_LATENCY; y=y+1) + fsm_state_dly[y] <= fsm_state_dly[y-1]; + end + + */ + + /* + always @(posedge clk) + // + for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin + mac_ce_dly[0][xi] <= mac_ce[xi]; + for (y=1; 
y<MODEXPNG_MAC_LATENCY; y=y+1) + mac_ce_dly[y][xi] <= mac_ce_dly[y-1][xi]; + end + */ + /* + always @(posedge clk) + // + for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin + mac_clr_dly[0][xi] <= mac_clr[xi]; + for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1) + mac_clr_dly[y][xi] <= mac_clr_dly[y-1][xi]; + end + */ + + /* + // + // MAC Clear Logic + // + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_clr <= {1'b0, {NUM_MULTS{1'b1}}}; + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_clr <= calc_mac_clear_square(col_index, din_addr_narrow_dly); + default: mac_clr <= {1'bX, {NUM_MULTS{1'bX}}}; + endcase + + + // + // MAC Ready Logic + // + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_rdy_lsb <= calc_mac_clear_square(col_index, din_addr_narrow); + default: mac_rdy_lsb <= {1'bX, {NUM_MULTS{1'bX}}}; + endcase + + + + // + // Recombinators + // + reg rcmb_lsb_ce; + reg rcmb_lsb_clr; + reg [MODEXPNG_MAC_WIDTH-1: 0] rcmb_lsb_din; + wire [15: 0] rcmb_lsb_dout; + + modexpng_part_recombinator recomb_lsb + ( + .clk (clk), + .ce (rcmb_lsb_ce), + .clr (rcmb_lsb_clr), + .din (rcmb_lsb_din), + .dout (rcmb_lsb_dout) + ); + + + reg calc_rcmb_lsb_ce; + always @* + // + calc_rcmb_lsb_ce = | mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0]; + + reg [MODEXPNG_MAC_WIDTH-1:0] calc_rcmb_lsb_din; + + always @* + // + casez (mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0]) + 8'b00000001: calc_rcmb_lsb_din = mac_p[0]; + 8'b00000010: calc_rcmb_lsb_din = mac_p[1]; + 8'b00000100: calc_rcmb_lsb_din = mac_p[2]; + 8'b00001000: calc_rcmb_lsb_din = mac_p[3]; + 8'b00010000: calc_rcmb_lsb_din = mac_p[4]; + 8'b00100000: calc_rcmb_lsb_din = mac_p[5]; + 8'b01000000: calc_rcmb_lsb_din = mac_p[6]; + 8'b10000000: calc_rcmb_lsb_din = mac_p[7]; + default: 
calc_rcmb_lsb_din = {MODEXPNG_MAC_WIDTH{1'bX}}; + endcase + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) + rcmb_lsb_ce <= 1'b0; + else case (fsm_state_dly[MODEXPNG_MAC_LATENCY]) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_ce <= calc_rcmb_lsb_ce; + default: rcmb_lsb_ce <= 1'b0; + endcase + + always @(posedge clk) + // + case (fsm_state_dly[MODEXPNG_MAC_LATENCY]) + FSM_STATE_MULT_SQUARE_COL_0_TRIG: rcmb_lsb_clr <= 1'b1; + default: rcmb_lsb_clr <= 1'b0; + endcase + + always @(posedge clk) + // + case (fsm_state_dly[MODEXPNG_MAC_LATENCY]) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_din <= calc_rcmb_lsb_din; + default: rcmb_lsb_din <= {MODEXPNG_MAC_WIDTH{1'bX}}; + endcase + + + +*/ diff --git a/rtl/dsp/dsp_array.v b/rtl/dsp/dsp_array.v new file mode 100644 index 0000000..178f87f --- /dev/null +++ b/rtl/dsp/dsp_array.v @@ -0,0 +1,111 @@ +module dsp_array +( + input clk, + + input ce_a, + input ce_b, + input ce_m, + input ce_p, + input ce_mode, + + input [8 -1:0] mode_z, + + input [4*18-1:0] a, + input [1*17-1:0] b, + output [8*47-1:0] p +); + + `include "../modexpng_parameters_x8.vh" + + wire [17:0] casc_a[0:3]; + wire [16:0] casc_b[0:3]; + + wire ce_a0 = ce_a; + reg ce_a1 = 1'b0; + reg ce_a2 = 1'b0; + + wire ce_b0 = ce_b; + reg ce_b1 = 1'b0; + + always @(posedge clk) begin + ce_a1 <= ce_a0; + ce_a2 <= ce_a1; + ce_b1 <= ce_b0; + end + + + genvar z; + generate for (z=0; z<(NUM_MULTS/2); z=z+1) + // + begin : DSP48E1 + // + dsp_slice # + ( + .AB_INPUT("DIRECT"), + .B_REG(2) + ) + dsp_direct + ( + .clk (clk), + + .ce_a1 (ce_a0), + .ce_b1 (ce_b0), + .ce_a2 (ce_a1), + .ce_b2 (ce_b1), + .ce_m (ce_m), + .ce_p (ce_p), + .ce_mode (ce_mode), + + .a (a[z*18+:18]), + .b (b), + .p (p[47*2*z+:47]), + + .inmode (5'b00000), + .opmode ({1'b0, 
mode_z[2*z], 1'b0, 2'b01, 2'b01}), + .alumode (4'b0000), + + .casc_a_in ({18{1'b0}}), // casc_a_in is [17:0]: tie off all 18 bits + .casc_b_in ({17{1'b0}}), + + .casc_a_out (casc_a[z]), + .casc_b_out (casc_b[z]) + ); + // + dsp_slice # + ( + .AB_INPUT("CASCADE"), + .B_REG(1) + ) + dsp_cascade + ( + .clk (clk), + + .ce_a1 (ce_a1), + .ce_b1 (1'b0), + .ce_a2 (ce_a2), + .ce_b2 (ce_b1), + .ce_m (ce_m), + .ce_p (ce_p), + .ce_mode (ce_mode), + + .a (a[z*18+:18]), + .b (b), + .p (p[47*(2*z+1)+:47]), + + .inmode (5'b00000), + .opmode ({1'b0, mode_z[2*z+1], 1'b0, 2'b01, 2'b01}), + .alumode (4'b0000), + + .casc_a_in (casc_a[z]), + .casc_b_in (casc_b[z]), + + .casc_a_out (), + .casc_b_out () + ); + // + end + // + endgenerate + + +endmodule diff --git a/rtl/dsp/dsp_slice.v b/rtl/dsp/dsp_slice.v new file mode 100644 index 0000000..9f1298b --- /dev/null +++ b/rtl/dsp/dsp_slice.v @@ -0,0 +1,125 @@ +// +// Thin wrapper around a single DSP48E1 primitive with narrowed A/B/P buses. +// +module dsp_slice # +( + parameter AB_INPUT = "DIRECT", // forwarded to DSP48E1 A_INPUT and B_INPUT + parameter B_REG = 2 // forwarded to DSP48E1 BREG +) +( + input clk, + input ce_a1, + input ce_b1, + input ce_a2, + input ce_b2, + input ce_m, + input ce_p, + input ce_mode, + input [17:0] a, + input [16:0] b, + output [46:0] p, + input [ 4:0] inmode, + input [ 6:0] opmode, + input [ 3:0] alumode, + input [17:0] casc_a_in, + input [16:0] casc_b_in, + output [17:0] casc_a_out, + output [16:0] casc_b_out +); + + wire [30-18-1:0] casc_a_dummy; + wire [18-17-1:0] casc_b_dummy; + wire [48-47-1:0] p_dummy; + + DSP48E1 # + ( + .AREG (2), + .BREG (B_REG), + .CREG (0), + .DREG (0), + .ADREG (0), + .MREG (1), + .PREG (1), + .ACASCREG (1), + .BCASCREG (1), + .INMODEREG (0), + .OPMODEREG (1), + .ALUMODEREG (0), + .CARRYINREG (0), + .CARRYINSELREG (0), + + .A_INPUT (AB_INPUT), + .B_INPUT (AB_INPUT), + + .USE_DPORT ("FALSE"), + .USE_MULT ("DYNAMIC"), + .USE_SIMD ("ONE48"), + + .MASK (48'h3fffffffffff), + .PATTERN (48'h000000000000), + .SEL_MASK ("MASK"), + .SEL_PATTERN ("PATTERN"), + + .USE_PATTERN_DETECT ("NO_PATDET"), + .AUTORESET_PATDET ("NO_RESET") + ) + DSP48E1_inst + ( + .CLK (clk), + + .CEA1 (ce_a1), + .CEB1 (ce_b1), + .CEA2 
(ce_a2), + .CEB2 (ce_b2), + .CEAD (1'b0), + .CEC (1'b0), + .CED (1'b0), + .CEM (ce_m), + .CEP (ce_p), + .CEINMODE (1'b0), + .CECTRL (ce_mode), + .CEALUMODE (1'b0), + .CECARRYIN (1'b0), + + .A ({{(30-18){1'b0}}, a}), + .B ({{(18-17){1'b0}}, b}), + .C ({48{1'b0}}), + .D ({25{1'b0}}), + .P ({p_dummy, p}), + + .INMODE (inmode), + .OPMODE (opmode), + .ALUMODE (alumode), + + .ACIN ({{(30-18){1'b0}}, casc_a_in}), + .BCIN ({{(18-17){1'b0}}, casc_b_in}), + .ACOUT ({casc_a_dummy, casc_a_out}), + .BCOUT ({casc_b_dummy, casc_b_out}), + .PCIN ({48{1'b0}}), + .PCOUT (), + .CARRYCASCIN (1'b0), + .CARRYCASCOUT (), + + .RSTA (1'b0), + .RSTB (1'b0), + .RSTC (1'b0), + .RSTD (1'b0), + .RSTM (1'b0), + .RSTP (1'b0), + .RSTINMODE (1'b0), + .RSTCTRL (1'b0), + .RSTALUMODE (1'b0), + .RSTALLCARRYIN (1'b0), + + .UNDERFLOW (), + .OVERFLOW (), + .PATTERNDETECT (), + .PATTERNBDETECT (), + + .CARRYIN (1'b0), + .CARRYOUT (), + .CARRYINSEL (3'b000), + + .MULTSIGNIN (1'b0), + .MULTSIGNOUT () + ); + + +endmodule diff --git a/rtl/modexpng_mac.v b/rtl/modexpng_mac.v new file mode 100644 index 0000000..9105dab --- /dev/null +++ b/rtl/modexpng_mac.v @@ -0,0 +1,54 @@ +module modexpng_mac +( + clk, + ce, clr, + casc_a, + a_in, b_in, p_out, + a_casc_in, a_casc_out +); + + input clk; + input ce; + input clr; + input casc_a; + input [16:0] a_in; + input [16:0] b_in; + output [46:0] p_out; + input [16:0] a_casc_in; + output [16:0] a_casc_out; + + reg [16:0] a_reg; + reg [16:0] b_reg; + assign a_casc_out = a_reg; + always @(posedge clk) + // + if (ce) {b_reg, a_reg} <= {b_in, casc_a ? 
a_casc_in : a_in}; + + reg ce_dly1; + reg ce_dly2; + always @(posedge clk) + // + {ce_dly2, ce_dly1} <= {ce_dly1, ce}; + + reg clr_dly1; + reg clr_dly2; + always @(posedge clk) begin + // + if (ce) clr_dly1 <= clr; + if (ce_dly1) clr_dly2 <= clr_dly1; + // + end + + reg [33:0] m_reg; + wire [46:0] m_reg_ext = {{13{1'b0}}, m_reg}; + always @(posedge clk) + // + if (ce_dly1) m_reg <= {{17{1'b0}}, a_reg} * {{17{1'b0}}, b_reg}; + + reg [46:0] p_reg; + assign p_out = p_reg; + always @(posedge clk) + // + if (ce_dly2) p_reg <= clr_dly2 ? m_reg_ext : p_reg + m_reg_ext; + +endmodule diff --git a/rtl/modexpng_mac_array.v b/rtl/modexpng_mac_array.v new file mode 100644 index 0000000..067929e --- /dev/null +++ b/rtl/modexpng_mac_array.v @@ -0,0 +1,116 @@ +module modexpng_mac_array +( + clk, + ce, clr, + ce_aux, clr_aux, + casc_a, casc_a_aux, + a_in, b_in, p_out, + a_in_aux, p_out_aux +); + + + // + // Includes + // + `include "modexpng_parameters.vh" + `include "modexpng_parameters_x8.vh" + + + // + // Ports + // + input clk; + input ce; + input [NUM_MULTS -1:0] clr; + input ce_aux; + input clr_aux; + input [NUM_MULTS -2:0] casc_a; + input casc_a_aux; + input [NUM_MULTS * WORD_WIDTH -1:0] a_in; + input [ 1 * WORD_WIDTH -1:0] b_in; + output [NUM_MULTS * MAC_WIDTH -1:0] p_out; + input [ 1 * WORD_WIDTH -1:0] a_in_aux; + output [ 1 * MAC_WIDTH -1:0] p_out_aux; + + + // + // A-Cascade Paths + // + wire [WORD_WIDTH-1:0] a_casc_int[0:NUM_MULTS-2]; + wire [WORD_WIDTH-1:0] a_casc_int_aux; + + + // + // LSB + // + modexpng_mac mac_lsb + ( + .clk (clk), + .ce (ce), + .clr (clr[0]), + .casc_a (1'b0), + .a_in (a_in[0+:WORD_WIDTH]), + .b_in (b_in), + .p_out (p_out[0+:MAC_WIDTH]), + .a_casc_in ({WORD_WIDTH{1'b0}}), + .a_casc_out (a_casc_int[0]) + ); + + + // + // INT + // + genvar z; + generate for (z=1; z<(NUM_MULTS-1); z=z+1) + begin : gen_modexpng_mac_int + modexpng_mac mac_int + ( + .clk (clk), + .ce (ce), + .clr (clr[z]), + .casc_a (casc_a[z-1]), + .a_in 
(a_in[z*WORD_WIDTH+:WORD_WIDTH]), + .b_in (b_in), + .p_out (p_out[z*MAC_WIDTH+:MAC_WIDTH]), + .a_casc_in (a_casc_int[z-1]), + .a_casc_out (a_casc_int[z]) + ); + end + endgenerate + + + // + // MSB + // + modexpng_mac mac_msb + ( + .clk (clk), + .ce (ce), + .clr (clr[NUM_MULTS-1]), + .casc_a (casc_a[NUM_MULTS-2]), + .a_in (a_in[(NUM_MULTS-1)*WORD_WIDTH+:WORD_WIDTH]), + .b_in (b_in), + .p_out (p_out[(NUM_MULTS-1)*MAC_WIDTH+:MAC_WIDTH]), + .a_casc_in (a_casc_int[NUM_MULTS-2]), + .a_casc_out (a_casc_int_aux) + ); + + + // + // AUX + // + modexpng_mac mac_aux + ( + .clk (clk), + .ce (ce_aux), + .clr (clr_aux), + .casc_a (casc_a_aux), + .a_in (a_in_aux), + .b_in (b_in), + .p_out (p_out_aux), + .a_casc_in (a_casc_int_aux), + .a_casc_out () + ); + + +endmodule diff --git a/rtl/modexpng_mem.v b/rtl/modexpng_mem.v new file mode 100644 index 0000000..ca89214 --- /dev/null +++ b/rtl/modexpng_mem.v @@ -0,0 +1,93 @@ +// +// TODO: Add license text! +// + +module modexpng_mem # +( + parameter MEM_WIDTH = 17, + parameter MEM_ADDR_BITS = 6 +) +( + input clk, + + input [MEM_ADDR_BITS-1:0] a_addr, + input a_en, + input a_wr, + input [MEM_WIDTH -1:0] a_in, + output [MEM_WIDTH -1:0] a_out, + + input [MEM_ADDR_BITS-1:0] b_addr, + input b_en, + input b_reg_en, + output [MEM_WIDTH -1:0] b_out +); + + + // + // BRAM + // + (* RAM_STYLE="BLOCK" *) + reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1]; + + + // + // Initialization for Simulation + // + /* + integer c; + initial begin + for (c=0; c<(2**MEM_ADDR_BITS); c=c+1) + bram[c] = {MEM_WIDTH{1'b0}}; + end + */ + + + + // + // Output Registers + // + reg [MEM_WIDTH-1:0] bram_b; + reg [MEM_WIDTH-1:0] bram_b_reg; + + // NOTE(review): a_out is never read back from the memory, it is tied to a + // fixed dummy pattern. The literal is 32 bits wide, so it gets truncated + // whenever MEM_WIDTH < 32 (the default MEM_WIDTH is 17). + assign a_out = 32'hDEADCE11; + assign b_out = bram_b_reg; + + + // + // Note that when both ports are accessing the same location, a conflict can + // potentially arise. See Xilinx UG473 (pages 19-20, "Conflict + // Avoidance") for more information. 
In our configuration to avoid that the + // write port must be coded to operate in READ_FIRST mode. If the write + // port is overwriting the same address the read port is accessing, the + // write port must read the previously stored data (not the data it is + // writing, as that would be WRITE_FIRST mode). + // + + + // + // Write-Only Port A + // + always @(posedge clk) + // + if (a_en) + // + if (a_wr) bram[a_addr] <= a_in; + + + // + // Read-Only Port B + // + always @(posedge clk) + // + if (b_en) + // + bram_b <= bram[b_addr]; + + always @(posedge clk) + // + if (b_reg_en) + // + bram_b_reg <= bram_b; + + +endmodule diff --git a/rtl/modexpng_mmm_col_index.v b/rtl/modexpng_mmm_col_index.v new file mode 100644 index 0000000..b904795 --- /dev/null +++ b/rtl/modexpng_mmm_col_index.v @@ -0,0 +1,90 @@ +module modexpng_mmm_col_index +( + clk, + index_last, + fsm_state_next, + col_index, + col_index_done, + col_index_zero, + col_index_next, + col_index_prev +); + + + // + // Includes + // + //`include "modexpng_parameters.vh" + //`include "modexpng_parameters_x8.vh" + `include "modexpng_mmm_fsm.vh" + + + // + // Parameters + // + parameter INDEX_WIDTH = 6; + + + // + // Ports + // + input clk; + input [ INDEX_WIDTH-1:0] index_last; + input [FSM_STATE_WIDTH-1:0] fsm_state_next; + output [ INDEX_WIDTH-4:0] col_index; + output col_index_done; + output [ INDEX_WIDTH-4:0] col_index_zero; + output [ INDEX_WIDTH-4:0] col_index_next; + output [ INDEX_WIDTH-4:0] col_index_prev; + + + // + // Registers + // + reg [INDEX_WIDTH-4:0] col_index_reg; + reg [INDEX_WIDTH-4:0] col_index_last; + reg [INDEX_WIDTH-4:0] col_index_dly; + + + // + // Mapping + // + assign col_index = col_index_reg; + assign col_index_prev = col_index_dly; + + + // + // Handy Wires + // + assign col_index_done = col_index == col_index_last; + assign col_index_zero = {(INDEX_WIDTH-3){1'b0}}; + assign col_index_next = col_index + 1'b1; + + + // + // Increment Logic + // + always @(posedge clk) + // + case 
(fsm_state_next) + // + FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin + col_index_reg <= col_index_zero; + col_index_last <= index_last[INDEX_WIDTH-1:3]; + end + // + FSM_STATE_MULT_SQUARE_COL_N_TRIG: + col_index_reg <= col_index_next; + // + endcase + + + // + // Delay Logic + // + always @(posedge clk) + // + col_index_dly <= col_index; + + +endmodule diff --git a/rtl/modexpng_mmm_din_addr.v b/rtl/modexpng_mmm_din_addr.v new file mode 100644 index 0000000..565c7e0 --- /dev/null +++ b/rtl/modexpng_mmm_din_addr.v @@ -0,0 +1,167 @@ +module modexpng_mmm_din_addr +( + clk, rst_n, + index_last, + fsm_state_next, + col_index_zero, col_index_next, + din_addr, din_bank, din_ena, din_reg_ena, + din_addr_cnt, din_addr_cnt_last, + din_addr_cnt_lower_prev, din_addr_cnt_upper_prev +); + + + // + // Includes + // + `include "modexpng_parameters.vh" + //`include "modexpng_parameters_x8.vh" + `include "modexpng_mmm_fsm.vh" + + + // + // Parameters + // + parameter INDEX_WIDTH = 6; + + + // + // Ports + // + input clk; + input rst_n; + input [ INDEX_WIDTH-1:0] index_last; + input [FSM_STATE_WIDTH-1:0] fsm_state_next; + input [ INDEX_WIDTH-4:0] col_index_zero; + input [ INDEX_WIDTH-4:0] col_index_next; + output [ INDEX_WIDTH-4:0] din_addr; + output [ 3-1:0] din_bank; + output [ 1-1:0] din_ena; + output [ 1-1:0] din_reg_ena; + output [ INDEX_WIDTH-1:0] din_addr_cnt; + output [ INDEX_WIDTH-1:0] din_addr_cnt_last; + output [ 3-1:0] din_addr_cnt_lower_prev; + output [ INDEX_WIDTH-4:0] din_addr_cnt_upper_prev; + + + // + // Address + // + reg [INDEX_WIDTH-1:0] din_addr_reg; + wire [INDEX_WIDTH-1:0] din_addr_zero = {INDEX_WIDTH{1'b0}}; + reg [INDEX_WIDTH-1:0] din_addr_last; + wire [INDEX_WIDTH-1:0] din_addr_prev = (din_addr_reg == din_addr_zero) ? 
din_addr_last : din_addr_reg - 1'b1; + + reg [INDEX_WIDTH-1:0] din_addr_cnt_reg; + wire [INDEX_WIDTH-1:0] din_addr_cnt_zero = {INDEX_WIDTH{1'b0}}; + wire [INDEX_WIDTH-1:0] din_addr_cnt_next = din_addr_cnt_reg + 1'b1; + reg [INDEX_WIDTH-1:0] din_addr_cnt_last_reg; + wire [ 3-1:0] din_addr_cnt_lower = din_addr_cnt_reg[ 3-1:0]; + wire [INDEX_WIDTH-4:0] din_addr_cnt_upper = din_addr_cnt_reg[INDEX_WIDTH-1:3]; + reg [ 3-1:0] din_addr_cnt_lower_dly; + reg [INDEX_WIDTH-4:0] din_addr_cnt_upper_dly; + + reg [ 3-1:0] din_bank_reg; + + + // + // Enables + // + reg din_ena_reg = 1'b0; + reg din_reg_ena_reg = 1'b0; + + always @(posedge clk or negedge rst_n) + // + if (!rst_n) + din_ena_reg <= 1'b0; + else case (fsm_state_next) + // + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: + din_ena_reg <= 1'b1; + // + default: + din_ena_reg <= 1'b0; + // + endcase + + always @(posedge clk or negedge rst_n) + // + if (!rst_n) + din_reg_ena_reg <= 1'b0; + else + din_reg_ena_reg <= din_ena_reg; + + + // + // Address Mapping + // + assign din_addr = din_addr_reg[INDEX_WIDTH-1:3]; + + assign din_addr_cnt = din_addr_cnt_reg; + assign din_addr_cnt_last = din_addr_cnt_last_reg; + assign din_addr_cnt_lower_prev = din_addr_cnt_lower_dly; + assign din_addr_cnt_upper_prev = din_addr_cnt_upper_dly; + + assign din_bank = din_bank_reg; + + + // + // Enable Mapping + // + assign din_ena = din_ena_reg; + assign din_reg_ena = din_reg_ena_reg; + + + // + // Delay + // + always @(posedge clk) begin + din_addr_cnt_lower_dly <= din_addr_cnt_lower; + din_addr_cnt_upper_dly <= din_addr_cnt_upper; + end + + + always @(posedge clk) + // + case (fsm_state_next) + // + FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin + din_addr_reg <= {col_index_zero, {3{1'b0}}}; + din_addr_last <= index_last; + din_addr_cnt_reg <= din_addr_cnt_zero; + din_addr_cnt_last_reg <= index_last; + end + // + FSM_STATE_MULT_SQUARE_COL_N_TRIG: begin + 
+ din_addr_reg <= {col_index_next, {3{1'b0}}}; + din_addr_cnt_reg <= din_addr_cnt_zero; + end + // + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin + din_addr_reg <= din_addr_prev; + din_addr_cnt_reg <= din_addr_cnt_next; + end + // + //default: + // + endcase + + + // + // Bank Select (clocked register, so non-blocking assignments are used + // like in every other sequential block of this module) + // + always @(posedge clk) + // + case (fsm_state_next) + // + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: + din_bank_reg <= BANK_XY_T1T2; + // + default: + din_bank_reg <= BANK_XY_ANY; + // + endcase + +endmodule diff --git a/rtl/modexpng_mmm_dout_addr.v b/rtl/modexpng_mmm_dout_addr.v new file mode 100644 index 0000000..3749d82 --- /dev/null +++ b/rtl/modexpng_mmm_dout_addr.v @@ -0,0 +1,167 @@ +module modexpng_mmm_dout_addr +( + clk, rst_n, + //index_last, + fsm_state, + load_xy_addr, + load_addr_zero, + load_nn_coeff_addr_done, + /* + + col_index_zero, col_index_next,*/ + x_dout_addr, y_dout_addr, + x_dout_ena, y_dout_ena, + x_dout_bank, y_dout_bank + +); + + + // + // Includes + // + `include "modexpng_parameters.vh" + `include "modexpng_parameters_x8.vh" + `include "modexpng_mmm_fsm.vh" + + + // + // Parameters + // + parameter INDEX_WIDTH = 6; + + + // + // Ports + // + input clk; + input rst_n; + //input [ INDEX_WIDTH-1:0] index_last; + input [FSM_STATE_WIDTH-1:0] fsm_state; + input [INDEX_WIDTH:0] load_xy_addr; // address + input load_addr_zero; + input load_nn_coeff_addr_done; + //input [ INDEX_WIDTH-4:0] col_index_zero; + //input [ INDEX_WIDTH-4:0] col_index_next; + output [INDEX_WIDTH-4:0] x_dout_addr; + output [INDEX_WIDTH-4:0] y_dout_addr; + + output [NUM_MULTS-1:0] x_dout_ena; + output [NUM_MULTS-1:0] y_dout_ena; + + output [3-1:0] x_dout_bank; + output [3-1:0] y_dout_bank; + + + // + // Registers + // + reg [INDEX_WIDTH-4:0] x_dout_addr_reg; //clog2 + reg [INDEX_WIDTH-4:0] y_dout_addr_reg; //clog2 + + reg [NUM_MULTS-1:0] x_dout_ena_reg = {NUM_MULTS{1'b0}}; + reg 
[NUM_MULTS-1:0] y_dout_ena_reg = {NUM_MULTS{1'b0}}; + + reg [NUM_MULTS-1:0] x_dout_ena_int; + reg [NUM_MULTS-1:0] y_dout_ena_int; + + reg [3-1:0] x_dout_bank_reg; + reg [3-1:0] y_dout_bank_reg; + + + // + // Mapping + // + assign x_dout_addr = x_dout_addr_reg; + assign y_dout_addr = y_dout_addr_reg; + + assign x_dout_ena = x_dout_ena_reg; + assign y_dout_ena = y_dout_ena_reg; + + assign x_dout_bank = x_dout_bank_reg; + assign y_dout_bank = y_dout_bank_reg; + + + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3: begin + x_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3]; + y_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3]; + end + // + FSM_STATE_LOAD_NN_COEFF_3: begin + x_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0]; + y_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0]; + end + // + default: begin + x_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}}; + y_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}}; + end + // + endcase + + wire [NUM_MULTS-1:0] load_xy_ena_init = {{NUM_MULTS-1{1'b0}}, 1'b1}; + + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_T1T2_2: begin + x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1]}; + y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]}; + end + // + FSM_STATE_LOAD_NN_COEFF_2: begin + x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1] & ~load_nn_coeff_addr_done}; + y_dout_ena_int <= load_addr_zero ? 
load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]}; + end + // + endcase + + + always @(posedge clk or negedge rst_n) + // + if (!rst_n) begin + x_dout_ena_reg <= {NUM_MULTS{1'b0}}; + y_dout_ena_reg <= {NUM_MULTS{1'b0}}; + end else case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3, + FSM_STATE_LOAD_NN_COEFF_3: begin + x_dout_ena_reg <= x_dout_ena_int; + y_dout_ena_reg <= y_dout_ena_int; + end + // + default: begin + x_dout_ena_reg <= {NUM_MULTS{1'b0}}; + y_dout_ena_reg <= {NUM_MULTS{1'b0}}; + end + // + endcase + + + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3: begin + x_dout_bank_reg <= BANK_X_T1; + y_dout_bank_reg <= BANK_Y_T2; + end + // + FSM_STATE_LOAD_NN_COEFF_3: begin + x_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_X_N : BANK_XY_AUX; + y_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_Y_N_COEFF : BANK_XY_AUX; + end + // + default: begin + x_dout_bank_reg <= BANK_XY_ANY; + y_dout_bank_reg <= BANK_XY_ANY; + end + // + endcase + + +endmodule diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm.vh new file mode 100644 index 0000000..c237a0b --- /dev/null +++ b/rtl/modexpng_mmm_fsm.vh @@ -0,0 +1,24 @@ +localparam FSM_STATE_WIDTH = 32; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_IDLE = 0; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_1 = 1; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_2 = 2; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_3 = 3; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_1 = 4; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_2 = 5; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_3 = 6; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_INIT = 11; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_TRIG = 12; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_BUSY = 13; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_INIT = 14; +localparam [FSM_STATE_WIDTH-1:0] 
FSM_STATE_MULT_SQUARE_COL_N_TRIG = 15; +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_BUSY = 16; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_HOLDOFF = 17; + +localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_STOP = 999; +
\ No newline at end of file diff --git a/rtl/modexpng_mmm_pad.v b/rtl/modexpng_mmm_pad.v new file mode 100644 index 0000000..a2a21ff --- /dev/null +++ b/rtl/modexpng_mmm_pad.v @@ -0,0 +1,153 @@ +module modexpng_mmm_pad +( + clk, rst_n, + fsm_state, + load_xy_addr_lsb, + pad_x_rd_addr, pad_y_rd_addr, + pad_x_rd_ena, pad_y_rd_ena, + pad_x_rd_dout, pad_y_rd_dout, + load_x_din, load_y_din +); + + + // + // Includes + // + `include "modexpng_parameters.vh" + //`include "modexpng_parameters_x8.vh" + `include "modexpng_mmm_fsm.vh" + + + // + // Parameters + // + parameter INDEX_WIDTH = 6; + + + // + // Ports + // + input clk; + input rst_n; + input [FSM_STATE_WIDTH-1:0] fsm_state; + + input [INDEX_WIDTH-1:0] load_xy_addr_lsb; + + input [WORD_WIDTH-1:0] load_x_din; + input [WORD_WIDTH-1:0] load_y_din; + + input [INDEX_WIDTH-1:0] pad_x_rd_addr; + input [INDEX_WIDTH-1:0] pad_y_rd_addr; + + input pad_x_rd_ena; + input pad_y_rd_ena; + + output [WORD_WIDTH-1:0] pad_x_rd_dout; + output [WORD_WIDTH-1:0] pad_y_rd_dout; + + + // + // Registers + // + reg [INDEX_WIDTH-1:0] pad_x_wr_addr; + reg [INDEX_WIDTH-1:0] pad_y_wr_addr; + reg pad_x_wr_ena; + reg pad_y_wr_ena; + reg [ WORD_WIDTH-1:0] pad_x_wr_din; + reg [ WORD_WIDTH-1:0] pad_y_wr_din; + + bram_1wo_1ro_readfirst_ce # + ( + .MEM_WIDTH (WORD_WIDTH), + .MEM_ADDR_BITS (INDEX_WIDTH) + ) + pad_x + ( + .clk (clk), + + .a_addr (pad_x_wr_addr), + .a_en (pad_x_wr_ena), + .a_wr (pad_x_wr_ena), + .a_in (pad_x_wr_din), + .a_out (), // unused + + .b_addr (pad_x_rd_addr), + .b_en (pad_x_rd_ena), + .b_out (pad_x_rd_dout) + ); + + bram_1wo_1ro_readfirst_ce # + ( + .MEM_WIDTH (WORD_WIDTH), + .MEM_ADDR_BITS (INDEX_WIDTH) + ) + pad_y + ( + .clk (clk), + + .a_addr (pad_y_wr_addr), + .a_en (pad_y_wr_ena), + .a_wr (pad_y_wr_ena), + .a_in (pad_y_wr_din), + .a_out (), // unused + + .b_addr (pad_y_rd_addr), + .b_en (pad_y_rd_ena), + .b_out (pad_y_rd_dout) + ); + + + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3: begin + 
pad_x_wr_addr <= load_xy_addr_lsb; + pad_y_wr_addr <= load_xy_addr_lsb; + end + // + default: begin + pad_x_wr_addr <= {INDEX_WIDTH{1'bX}}; + pad_y_wr_addr <= {INDEX_WIDTH{1'bX}}; + end + // + endcase + + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3: begin + pad_x_wr_din <= load_x_din; + pad_y_wr_din <= load_y_din; + end + // + default: begin + pad_x_wr_din <= load_x_din; + pad_y_wr_din <= load_y_din; + end + // + endcase + + + always @(posedge clk or negedge rst_n) + // + if (!rst_n) begin + pad_x_wr_ena <= 1'b0; + pad_y_wr_ena <= 1'b0; + end else case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3: begin + pad_x_wr_ena <= 1'b1; + pad_y_wr_ena <= 1'b1; + end + // + default: begin + pad_x_wr_ena <= 1'b0; + pad_y_wr_ena <= 1'b0; + end + // + endcase + + +endmodule diff --git a/rtl/modexpng_mmm_transporter.v b/rtl/modexpng_mmm_transporter.v new file mode 100644 index 0000000..a8f309a --- /dev/null +++ b/rtl/modexpng_mmm_transporter.v @@ -0,0 +1,157 @@ +module modexpng_mmm_transporter +( + clk, + ena, + index_last, + fsm_state, + fsm_state_next, + load_phase, + load_xy_addr, + load_xy_addr_vld, + load_xy_req, + load_addr_zero, + load_t1t2_addr_done, + load_nn_coeff_addr_done +); + + + // + // Includes + // + //`include "modexpng_parameters.vh" + //`include "modexpng_parameters_x8.vh" + `include "modexpng_mmm_fsm.vh" + + + // + // Parameters + // + parameter INDEX_WIDTH = 6; + + + // + // Ports + // + input clk; + input ena; + input [ INDEX_WIDTH-1:0] index_last; + input [FSM_STATE_WIDTH-1:0] fsm_state; + input [FSM_STATE_WIDTH-1:0] fsm_state_next; + output load_phase; + output [ INDEX_WIDTH:0] load_xy_addr; + output load_xy_addr_vld; + output load_xy_req; + output load_addr_zero; + output load_t1t2_addr_done; + output load_nn_coeff_addr_done; + + + // + // Load Address Generator + // + reg load_phase_reg; + reg [INDEX_WIDTH:0] load_xy_addr_reg; + reg load_xy_addr_vld_reg; + reg load_xy_req_reg; + + + // + // Mapping + // + assign load_phase = 
load_phase_reg; + assign load_xy_addr = load_xy_addr_reg; + assign load_xy_addr_vld = load_xy_addr_vld_reg; + assign load_xy_req = load_xy_req_reg; + + + // + // Handy Quantities + // + wire [INDEX_WIDTH:0] load_xy_addr_zero = {{INDEX_WIDTH{1'b0}}, 1'b0}; + wire [INDEX_WIDTH:0] load_xy_addr_next = load_xy_addr_reg + 1'b1; + wire [INDEX_WIDTH:0] load_xy_addr_xxx = {{INDEX_WIDTH{1'bX}}, 1'bX}; + + + // + // More Handy Quantities + // + reg [INDEX_WIDTH:0] load_t1t2_addr_last; + reg [INDEX_WIDTH:0] load_nn_coeff_addr_last; + + + // + // Flags + // + assign load_addr_zero = load_xy_addr_reg == load_xy_addr_zero; + assign load_t1t2_addr_done = load_xy_addr_reg == load_t1t2_addr_last; + assign load_nn_coeff_addr_done = load_xy_addr_reg == load_nn_coeff_addr_last; + + + // + // Last Index Latch + // + always @(posedge clk) + // + if (ena && (fsm_state == FSM_STATE_IDLE)) begin + load_t1t2_addr_last <= {1'b0, index_last}; + load_nn_coeff_addr_last <= {1'b0, index_last} + 1'b1; + end + + + // + // Update Load Phase + // + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_LOAD_T1T2_1, + FSM_STATE_LOAD_T1T2_2, + FSM_STATE_LOAD_T1T2_3: load_phase_reg <= 1'b0; + FSM_STATE_LOAD_NN_COEFF_1, + FSM_STATE_LOAD_NN_COEFF_2, + FSM_STATE_LOAD_NN_COEFF_3: load_phase_reg <= 1'b1; + default: load_phase_reg <= 1'bX; + endcase + + + // + // Update Load Address + // + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_LOAD_T1T2_1: load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_T1T2_3) ? load_xy_addr_next : load_xy_addr_zero; + FSM_STATE_LOAD_T1T2_2, + FSM_STATE_LOAD_T1T2_3: load_xy_addr_reg <= load_xy_addr_reg; + FSM_STATE_LOAD_NN_COEFF_1: load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_NN_COEFF_3) ? 
load_xy_addr_next : load_xy_addr_zero; + FSM_STATE_LOAD_NN_COEFF_2, + FSM_STATE_LOAD_NN_COEFF_3: load_xy_addr_reg <= load_xy_addr_reg; + default load_xy_addr_reg <= load_xy_addr_xxx; + endcase + + + // + // Update Address Valid Flag + // + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_LOAD_T1T2_1, + FSM_STATE_LOAD_NN_COEFF_1: load_xy_addr_vld_reg <= 1'b1; + default load_xy_addr_vld_reg <= 1'b0; + endcase + + + // + // Update Load Request Flag + // + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_LOAD_T1T2_2, + FSM_STATE_LOAD_NN_COEFF_2: load_xy_req_reg <= 1'b1; + default load_xy_req_reg <= 1'b0; + endcase + + +endmodule diff --git a/rtl/modexpng_mmm_x8_dual.v b/rtl/modexpng_mmm_x8_dual.v new file mode 100644 index 0000000..99a37fa --- /dev/null +++ b/rtl/modexpng_mmm_x8_dual.v @@ -0,0 +1,550 @@ +module modexpng_mmm_x8_dual +( + clk, rst_n, + ena, rdy, + mode, transfer, + index_last, + x_din, y_din, x_dout, y_dout, + x_din_addr, y_din_addr, x_dout_addr, y_dout_addr, + x_din_ena, y_din_ena, x_dout_ena, y_dout_ena, x_din_reg_ena, y_din_reg_ena, + x_din_bank, y_din_bank, x_dout_bank, y_dout_bank, + load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req, + load_x_din, load_y_din +); + + + // + // Includes + // + `include "modexpng_parameters.vh" + `include "modexpng_parameters_x8.vh" + `include "modexpng_mmm_fsm.vh" + + + // + // Parameters + // + parameter INDEX_WIDTH = 6; + + + // + // Ports + // + input clk; + input rst_n; + + input ena; + output rdy; + + input mode; // multiply: 0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2 + // load/unload: 0 = load, 1 = unload + input transfer; // 0 = multiply, 1 = load/unload + + input [INDEX_WIDTH-1:0] index_last; + + input [NUM_MULTS*WORD_WIDTH-1:0] x_din; + input [NUM_MULTS*WORD_WIDTH-1:0] y_din; + output [NUM_MULTS*WORD_WIDTH-1:0] x_dout; + output [NUM_MULTS*WORD_WIDTH-1:0] y_dout; + + output [INDEX_WIDTH-4:0] x_din_addr; + output [INDEX_WIDTH-4:0] y_din_addr; + output [INDEX_WIDTH-4:0] 
x_dout_addr; + output [INDEX_WIDTH-4:0] y_dout_addr; + + output [ 1-1:0] x_din_ena; + output [ 1-1:0] y_din_ena; + output [NUM_MULTS-1:0] x_dout_ena; + output [NUM_MULTS-1:0] y_dout_ena; + output [ 1-1:0] x_din_reg_ena; + output [ 1-1:0] y_din_reg_ena; + + output [3-1:0] x_din_bank; + output [3-1:0] y_din_bank; + output [3-1:0] x_dout_bank; + output [3-1:0] y_dout_bank; + + output load_phase; // 0 = T1, T2; 1 = N, N_COEFF + output [ INDEX_WIDTH:0] load_xy_addr; // address + output load_xy_addr_vld; // address valid + output load_xy_req; // data request + + input [WORD_WIDTH-1:0] load_x_din; // data input + input [WORD_WIDTH-1:0] load_y_din; // data input + + + // + // FSM State and Next States + // + reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE; + reg [FSM_STATE_WIDTH-1:0] fsm_state_next; + reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle; + reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square; + + + // + // FSM Idle Next State + // + always @* + // + case ({transfer, mode}) + 2'b00, + 2'b01: fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG; + 2'b10: fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1; + 2'b11: fsm_state_after_idle = FSM_STATE_IDLE; //unload? 
+ endcase + + + // + // Column Counter + // + wire [ INDEX_WIDTH-4:0] col_index; + wire col_index_done; + wire [ INDEX_WIDTH-4:0] col_index_zero; + wire [ INDEX_WIDTH-4:0] col_index_next; + wire [ INDEX_WIDTH-4:0] col_index_prev; + + modexpng_mmm_col_index # + ( + .INDEX_WIDTH(INDEX_WIDTH) + ) + mmm_col_index + ( + .clk (clk), + .index_last (index_last), + .fsm_state_next (fsm_state_next), + .col_index (col_index), + .col_index_done (col_index_done), + .col_index_zero (col_index_zero), + .col_index_next (col_index_next), + .col_index_prev (col_index_prev) + ); + + + // + // Load Address Generator + // + wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0]; + wire load_addr_zero; + wire load_t1t2_addr_done; + wire load_nn_coeff_addr_done; + + modexpng_mmm_transporter # + ( + .INDEX_WIDTH(INDEX_WIDTH) + ) + transporter + ( + .clk (clk), + .ena (ena), + .index_last (index_last), + .fsm_state (fsm_state), + .fsm_state_next (fsm_state_next), + .load_phase (load_phase), + .load_xy_addr (load_xy_addr), + .load_xy_addr_vld (load_xy_addr_vld), + .load_xy_req (load_xy_req), + .load_addr_zero (load_addr_zero), + .load_t1t2_addr_done (load_t1t2_addr_done), + .load_nn_coeff_addr_done (load_nn_coeff_addr_done) + ); + + + // + // X, Y Address + // + wire [INDEX_WIDTH-1:0] x_din_addr_cnt; + wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last; + wire [ 3-1:0] x_din_addr_cnt_lower_prev; + wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev; + + modexpng_mmm_din_addr # + ( + .INDEX_WIDTH(INDEX_WIDTH) + ) + din_addr_x + ( + .clk (clk), + .rst_n (rst_n), + .index_last (index_last), + .fsm_state_next (fsm_state_next), + .col_index_zero (col_index_zero), + .col_index_next (col_index_next), + .din_addr (x_din_addr), + .din_bank (x_din_bank), + .din_ena (x_din_ena), + .din_reg_ena (x_din_reg_ena), + .din_addr_cnt (x_din_addr_cnt), + .din_addr_cnt_last (x_din_addr_cnt_last), + .din_addr_cnt_lower_prev (x_din_addr_cnt_lower_prev), + .din_addr_cnt_upper_prev (x_din_addr_cnt_upper_prev) 
+ ); + + modexpng_mmm_dout_addr # + ( + .INDEX_WIDTH(INDEX_WIDTH) + ) + dout_addr_xy + ( + .clk (clk), + .rst_n (rst_n), + .fsm_state (fsm_state), + .load_xy_addr (load_xy_addr), + .load_addr_zero (load_addr_zero), + .load_nn_coeff_addr_done (load_nn_coeff_addr_done), + .x_dout_addr (x_dout_addr), + .y_dout_addr (y_dout_addr), + .x_dout_ena (x_dout_ena), + .y_dout_ena (y_dout_ena), + .x_dout_bank (x_dout_bank), + .y_dout_bank (y_dout_bank) + ); + + + // + // Helper Memories ("Scratchpad") + // + reg [INDEX_WIDTH-1:0] pad_xy_rd_addr; + reg pad_xy_rd_ena = 1'b0; + wire [ WORD_WIDTH-1:0] pad_x_rd_dout; + wire [ WORD_WIDTH-1:0] pad_y_rd_dout; + + wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}}; + wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1; + + // INDEX_WIDTH is forwarded explicitly so that the scratchpad address ports keep matching this module's parameter + modexpng_mmm_pad # + ( + .INDEX_WIDTH(INDEX_WIDTH) + ) + pad + ( + .clk (clk), + .rst_n (rst_n), + .fsm_state (fsm_state), + .load_xy_addr_lsb (load_xy_addr_lsb), + .load_x_din (load_x_din), + .load_y_din (load_y_din), + .pad_x_rd_addr (pad_xy_rd_addr), + .pad_y_rd_addr (pad_xy_rd_addr), + .pad_x_rd_ena (pad_xy_rd_ena), + .pad_y_rd_ena (pad_xy_rd_ena), + .pad_x_rd_dout (pad_x_rd_dout), + .pad_y_rd_dout (pad_y_rd_dout) + ); + + + always @(posedge clk or negedge rst_n) + // + if (!rst_n) begin + pad_xy_rd_ena <= 1'b0; + end else case (fsm_state_next) + + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: + pad_xy_rd_ena <= 1'b1; + + default: + pad_xy_rd_ena <= 1'b0; + + endcase + + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG: + pad_xy_rd_addr <= pad_xy_rd_addr_zero; + + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: + pad_xy_rd_addr <= pad_xy_rd_addr_next; + + default: + pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}}; + + endcase + + + + + // + // Flags + // + + wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last; + + always @* + // + 
fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;; + + + // + // MAC Arrays + // + reg mac_x_ce = 1'b0; + reg mac_x_ce_aux = 1'b0; + reg [NUM_MULTS -1:0] mac_x_clr; + reg mac_x_clr_aux; + reg [NUM_MULTS -2:0] mac_x_casc_a; + reg mac_x_casc_a_aux; + wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a; + reg [ 1 * WORD_WIDTH -1:0] mac_x_a_aux; + //wire [ 1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1]; + reg [ 1 * WORD_WIDTH -1:0] mac_x_b; + wire [NUM_MULTS * MAC_WIDTH -1:0] mac_x_p; + wire [ 1 * MAC_WIDTH -1:0] mac_x_p_aux; + + reg mac_y_ce = 1'b0; + reg mac_y_ce_aux = 1'b0; + reg [NUM_MULTS -1:0] mac_y_clr; + reg mac_y_clr_aux; + reg [NUM_MULTS -2:0] mac_y_casc_a; + reg mac_y_casc_a_aux; + wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a; + reg [ 1 * WORD_WIDTH -1:0] mac_y_a_aux; + //wire [ 1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1]; + reg [ 1 * WORD_WIDTH -1:0] mac_y_b; + wire [NUM_MULTS * MAC_WIDTH -1:0] mac_y_p; + wire [ 1 * MAC_WIDTH -1:0] mac_y_p_aux; + + modexpng_mac_array mac_array_x + ( + .clk (clk), + .ce (mac_x_ce), + .ce_aux (mac_x_ce_aux), + .clr (mac_x_clr), + .clr_aux (mac_x_clr_aux), + .casc_a (mac_x_casc_a), + .casc_a_aux (mac_x_casc_a_aux), + .a_in (mac_x_a), + .a_in_aux (mac_x_a_aux), + .b_in (mac_x_b), + .p_out (mac_x_p), + .p_out_aux (mac_x_p_aux) + ); + + modexpng_mac_array mac_array_y + ( + .clk (clk), + .ce (mac_y_ce), + .ce_aux (mac_y_ce_aux), + .clr (mac_y_clr), + .clr_aux (mac_y_clr_aux), + .casc_a (mac_y_casc_a), + .casc_a_aux (mac_y_casc_a_aux), + .a_in (mac_y_a), + .a_in_aux (mac_y_a_aux), + .b_in (mac_y_b), + .p_out (mac_y_p), + .p_out_aux (mac_y_p_aux) + ); + + genvar gen_z; + + generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1) + begin : gen_xy_din + //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z]; + //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z]; + //gen_xy_dout + assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = 
x_din[gen_z*WORD_WIDTH+:WORD_WIDTH]; + + //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z]; + //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z]; + end + endgenerate + + + // + // MAC Clock Enable Logic + // + reg mac_xy_ce_adv = 1'b0; + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) mac_xy_ce_adv <= 1'b0; + else case (fsm_state) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_ce_adv <= 1'b1; + default: mac_xy_ce_adv <= 1'b0; + endcase + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) {mac_y_ce, mac_x_ce} <= 2'b00; + else {mac_y_ce, mac_x_ce} <= {2{mac_xy_ce_adv}}; + + + // + // MAC Clear Logic + // + wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value = + calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev); + + reg [NUM_MULTS-1:0] mac_xy_clr_adv; + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_clr_adv <= {NUM_MULTS{1'b1}}; + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_clr_adv <= calc_mac_x_clear_square_value; + default: mac_xy_clr_adv <= {NUM_MULTS{1'bX}}; + endcase + + always @(posedge clk) + // + {mac_y_clr, mac_x_clr} <= {2{mac_xy_clr_adv}}; + + + // + // MAC Cascade Logic + // + reg [NUM_MULTS-2:0] mac_xy_casc_a_adv; + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}}; + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}}; + default: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}}; + endcase + + always @(posedge clk) + // + {mac_y_casc_a, mac_x_casc_a} <= {2{mac_xy_casc_a_adv}}; + + + + // + // DOUT Mapping + // + generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1) + 
begin : gen_xy_dout + assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z]; + assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z]; + end + endgenerate + + + // + // DOUT + // + reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1]; + reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1]; + + + + + integer int_z; + always @(posedge clk) + // + case (fsm_state) + // + FSM_STATE_LOAD_T1T2_3, + FSM_STATE_LOAD_NN_COEFF_3: + for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin + x_dout_reg[int_z] <= load_x_din; + y_dout_reg[int_z] <= load_y_din; + end + // + default: + for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin + x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}}; + y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}}; + end + // + endcase + + + + // + // FSM Process + // + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; + else fsm_state <= fsm_state_next; + + + // + // FSM Transition Logic + // + always @* begin + // + fsm_state_next = FSM_STATE_IDLE; + // + case (fsm_state) + FSM_STATE_IDLE: fsm_state_next = ena ? fsm_state_after_idle : FSM_STATE_IDLE; + + FSM_STATE_LOAD_T1T2_1: fsm_state_next = FSM_STATE_LOAD_T1T2_2 ; + FSM_STATE_LOAD_T1T2_2: fsm_state_next = FSM_STATE_LOAD_T1T2_3 ; + FSM_STATE_LOAD_T1T2_3: fsm_state_next = load_t1t2_addr_done ? FSM_STATE_LOAD_NN_COEFF_1 : FSM_STATE_LOAD_T1T2_1; + + FSM_STATE_LOAD_NN_COEFF_1: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2 ; + FSM_STATE_LOAD_NN_COEFF_2: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3 ; + FSM_STATE_LOAD_NN_COEFF_3: fsm_state_next = load_nn_coeff_addr_done ? FSM_STATE_STOP : FSM_STATE_LOAD_NN_COEFF_1; + + FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ; + FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done ? 
FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY; + FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY; + + /* + FSM_STATE_TRIANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_0_BUSY ; + FSM_STATE_TRIANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG : FSM_STATE_TRIANGLE_COL_0_BUSY; + FSM_STATE_TRIANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_N_BUSY ; + FSM_STATE_TRIANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle : FSM_STATE_TRIANGLE_COL_N_BUSY; + + FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_0_BUSY ; + FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY; + FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_N_BUSY ; + FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? 
fsm_state_after_rectangle : FSM_STATE_RECTANGLE_COL_N_BUSY; + */ + + FSM_STATE_STOP: fsm_state_next = FSM_STATE_IDLE ; + + endcase + // + end + + + // + // Ready Output + // + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; + + always @(posedge clk or negedge rst_n) + // + if (rst_n == 1'b0) rdy_reg <= 1'b1; + else case (fsm_state) + FSM_STATE_IDLE: if (ena) rdy_reg <= 1'b0; + FSM_STATE_STOP: rdy_reg <= 1'b1; + endcase + + function [ NUM_MULTS-1:0] calc_mac_clear_square; + input [INDEX_WIDTH-4:0] col_index_delayed; + input [ 3-1:0] x_din_addr_cnt_lower_delayed; + input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed; + begin + if (x_din_addr_cnt_upper_delayed == col_index_delayed) + case (x_din_addr_cnt_lower_delayed) + 3'b000: calc_mac_clear_square = 8'b00000001; + 3'b001: calc_mac_clear_square = 8'b00000010; + 3'b010: calc_mac_clear_square = 8'b00000100; + 3'b011: calc_mac_clear_square = 8'b00001000; + 3'b100: calc_mac_clear_square = 8'b00010000; + 3'b101: calc_mac_clear_square = 8'b00100000; + 3'b110: calc_mac_clear_square = 8'b01000000; + 3'b111: calc_mac_clear_square = 8'b10000000; + endcase + else + calc_mac_clear_square = {NUM_MULTS{1'b0}}; + end + endfunction + + +endmodule diff --git a/rtl/modexpng_parameters.vh b/rtl/modexpng_parameters.vh new file mode 100644 index 0000000..f846119 --- /dev/null +++ b/rtl/modexpng_parameters.vh @@ -0,0 +1,39 @@ +//localparam WORD_WIDTH = 17; +//localparam MAC_WIDTH = 47; + +//localparam BANK_ADDR_WIDTH = 3; // TODO: Replace everywhere! 
+ +localparam [2:0] BANK_FAT_T1T2 = 3'd0; +localparam [2:0] BANK_FAT_ABL = 3'd1; +localparam [2:0] BANK_FAT_ABH = 3'd2; +localparam [2:0] BANK_FAT_Q = 3'd3; +localparam [2:0] BANK_FAT_Q_EXT = 3'd4; +localparam [2:0] BANK_FAT_ML = 3'd5; +localparam [2:0] BANK_FAT_MH = 3'd6; +localparam [2:0] BANK_FAT_MH_EXT = 3'd7; + +localparam [1:0] BANK_SLIM_T1T2 = 2'd0; +localparam [1:0] BANK_SLIM_N = 2'd1; +localparam [1:0] BANK_SLIM_N_COEFF = 2'd2; +localparam [1:0] BANK_SLIM_N_COEFF_EXT = 2'd3; + + +//localparam BANK_Y_T2 = 3'd0; +//localparam BANK_XY_T1T2 = 3'd0; + +//localparam BANK_XY_AB_LSB = 3'd1; +//localparam BANK_XY_AB_MSB = 3'd2; + +//localparam BANK_X_N = 3'd3; +//localparam BANK_Y_N_COEFF = 3'd3; + +//localparam BANK_XY_M = 3'd4; + +//localparam BANK_XY_Q_LSB = 3'd5; +//localparam BANK_XY_Q_MSB = 3'd6; + +//localparam BANK_XY_AUX = 3'd7; + +//localparam BANK_XY_ANY = 3'bXXX; + +//localparam BANK_XY_AUX_ADDR_N_COEFF = 0; diff --git a/rtl/modexpng_parameters_x8.vh b/rtl/modexpng_parameters_x8.vh new file mode 100644 index 0000000..8734354 --- /dev/null +++ b/rtl/modexpng_parameters_x8.vh @@ -0,0 +1 @@ +localparam NUM_MULTS = 8; diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v new file mode 100644 index 0000000..db4774b --- /dev/null +++ b/rtl/modexpng_part_recombinator.v @@ -0,0 +1,623 @@ +module modexpng_part_recombinator +( + clk, + rdy, + fsm_state_next, + index_last, + dsp_x_ce_p, dsp_y_ce_p, + ena_x, ena_y, + dsp_x_p, dsp_y_p, + col_index, col_index_last, slim_bram_xy_addr, + fat_bram_xy_bank, fat_bram_xy_addr, fat_bram_x_dout, fat_bram_y_dout, fat_bram_xy_dout_valid +); + + + // + // Headers + // + `include "../rtl/modexpng_mmm_fsm.vh" + `include "../rtl/modexpng_parameters.vh" + `include "../rtl/modexpng_parameters_x8.vh" + + + input clk; + output rdy; + input [FSM_STATE_WIDTH-1:0] fsm_state_next; + input [7:0] index_last; + input dsp_x_ce_p; + input dsp_y_ce_p; + input ena_x; + input ena_y; + input [8*47-1:0] dsp_x_p; + input 
[8*47-1:0] dsp_y_p; + input [ 4:0] col_index; + input [ 4:0] col_index_last; + input [ 7:0] slim_bram_xy_addr; + + output [ 2:0] fat_bram_xy_bank; + output [ 7:0] fat_bram_xy_addr; + output [ 17:0] fat_bram_x_dout; + output [ 17:0] fat_bram_y_dout; + output fat_bram_xy_dout_valid; + + + // + // Latches + // + reg [1*47-1:0] dsp_x_p_latch[0:7]; + reg [1*47-1:0] dsp_y_p_latch[0:7]; + + + // + // Mapping + // + wire [46:0] dsp_x_p_split[0:7]; + wire [46:0] dsp_y_p_split[0:7]; + + genvar z; + generate for (z=0; z<NUM_MULTS; z=z+1) + begin : gen_dsp_xy_p_split + assign dsp_x_p_split[z] = dsp_x_p[47*z+:47]; + assign dsp_y_p_split[z] = dsp_y_p[47*z+:47]; + end + endgenerate + + + // + // Delays + // + reg dsp_y_ce_p_dly1 = 1'b0; + reg dsp_x_ce_p_dly1 = 1'b0; + + always @(posedge clk) begin + // + {dsp_y_ce_p_dly1, dsp_x_ce_p_dly1} <= {dsp_y_ce_p, dsp_x_ce_p}; + // + end + + + // + // Registers + // + + // valid + reg x_valid_lsb = 1'b0; + reg y_valid_lsb = 1'b0; + reg x_valid_msb = 1'b0; + reg y_valid_msb = 1'b0; + + // bitmap + reg [7:0] x_bitmap_lsb = {8{1'b0}}; + reg [7:0] y_bitmap_lsb = {8{1'b0}}; + reg [7:0] x_bitmap_msb = {8{1'b0}}; + reg [7:0] y_bitmap_msb = {8{1'b0}}; + + // index + reg [2:0] x_index_lsb = 3'dX; + reg [2:0] y_index_lsb = 3'dX; + + // purge + reg x_purge_lsb = 1'b0; + reg y_purge_lsb = 1'b0; + reg x_purge_msb = 1'b0; + reg y_purge_msb = 1'b0; + + // valid - latch + reg x_valid_latch_lsb = 1'b0; + reg y_valid_latch_lsb = 1'b0; + + // bitmap - latch + reg [7:0] x_bitmap_latch_lsb = {8{1'b0}}; + reg [7:0] y_bitmap_latch_lsb = {8{1'b0}}; + reg [7:0] x_bitmap_latch_msb = {8{1'b0}}; + reg [7:0] y_bitmap_latch_msb = {8{1'b0}}; + + // index - latch + reg [2:0] x_index_latch_lsb = 3'dX; + reg [2:0] y_index_latch_lsb = 3'dX; + + // purge - index + reg x_purge_latch_lsb = 1'b0; + reg y_purge_latch_lsb = 1'b0; + reg x_purge_latch_msb = 1'b0; + reg y_purge_latch_msb = 1'b0; + + // + reg xy_valid_lsb_adv[1:6]; + reg xy_valid_msb_adv[1:6]; + reg [7:0] 
xy_bitmap_lsb_adv[1:6]; + reg [7:0] xy_bitmap_msb_adv[1:6]; + reg [2:0] xy_index_lsb_adv[1:6]; + reg [2:0] xy_index_msb_adv[1:6]; + reg xy_purge_lsb_adv[1:6]; + reg xy_purge_msb_adv[1:6]; + + + integer i; + // power-up values for all six entries of the *_adv arrays; entry 6 is included because it is copied into entry 5 on the first clock (index fields are left X) + initial for (i=1; i<=6; i=i+1) begin + xy_valid_lsb_adv[i] = 1'b0; + xy_valid_msb_adv[i] = 1'b0; + xy_bitmap_lsb_adv[i] = {8{1'b0}}; + xy_bitmap_msb_adv[i] = {8{1'b0}}; + xy_index_lsb_adv[i] = 3'dX; + xy_index_msb_adv[i] = 3'dX; + xy_purge_lsb_adv[i] = 1'b0; + xy_purge_msb_adv[i] = 1'b0; + end + + function [0:0] calc_square_valid_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + calc_square_valid_lsb = 1'b1; + else + calc_square_valid_lsb = 1'b0; + // + end + endfunction + + function [7:0] calc_square_bitmap_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + // + case (slim_bram_xy_addr_value[2:0]) + 3'b000: calc_square_bitmap_lsb = 8'b00000001; + 3'b001: calc_square_bitmap_lsb = 8'b00000010; + 3'b010: calc_square_bitmap_lsb = 8'b00000100; + 3'b011: calc_square_bitmap_lsb = 8'b00001000; + 3'b100: calc_square_bitmap_lsb = 8'b00010000; + 3'b101: calc_square_bitmap_lsb = 8'b00100000; + 3'b110: calc_square_bitmap_lsb = 8'b01000000; + 3'b111: calc_square_bitmap_lsb = 8'b10000000; + endcase + // + else + calc_square_bitmap_lsb = {8{1'b0}}; + // + end + endfunction + + function [2:0] calc_square_index_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + // + case (slim_bram_xy_addr_value[2:0]) + 3'b000: calc_square_index_lsb = 3'd0; + 3'b001: calc_square_index_lsb = 3'd1; + 3'b010: calc_square_index_lsb = 3'd2; + 3'b011: calc_square_index_lsb = 3'd3; + 3'b100: calc_square_index_lsb = 
3'd4; + 3'b101: calc_square_index_lsb = 3'd5; + 3'b110: calc_square_index_lsb = 3'd6; + 3'b111: calc_square_index_lsb = 3'd7; + endcase + // + else + calc_square_index_lsb = 3'dX; + // + end + endfunction + + function calc_square_purge_lsb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + begin + // + if (slim_bram_xy_addr_value[7:3] == col_index_value) + calc_square_purge_lsb = slim_bram_xy_addr_value[7:3] == col_index_last_value; + else + calc_square_purge_lsb = 1'b0; + // + end + endfunction + + function calc_square_valid_msb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + input [7:0] index_last_value; + begin + // + if (slim_bram_xy_addr_value == index_last_value) + calc_square_valid_msb = 1'b1; + else + calc_square_valid_msb = 1'b0; + // + end + endfunction + + function [7:0] calc_square_bitmap_msb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + input [7:0] index_last_value; + begin + // + if (slim_bram_xy_addr_value == index_last_value) begin + calc_square_bitmap_msb[7] = col_index_value != col_index_last_value; + calc_square_bitmap_msb[6:0] = 7'b1111111; + end else + calc_square_bitmap_msb[7:0] = 8'b00000000; + // + end + endfunction + + function calc_square_purge_msb; + input [4:0] col_index_value; + input [4:0] col_index_last_value; + input [7:0] slim_bram_xy_addr_value; + input [7:0] index_last_value; + begin + // + if (slim_bram_xy_addr_value == index_last_value) + calc_square_purge_msb = col_index_value == col_index_last_value; + else + calc_square_purge_msb = 1'b0; + // + end + endfunction + + + reg recomb_lsb_ce = 1'b0; + reg [ 2:0] recomb_lsb_ce_purge = 3'b000; + wire recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_purge[0]; + reg recomb_lsb_clr; + reg recomb_lsb_vld = 1'b0; + + reg [46:0] recomb_lsb_din; + wire [15:0] recomb_lsb_dout; + + reg recomb_msb_ce = 1'b0; 
+ reg [ 1:0] recomb_msb_ce_purge = 2'b00; + wire recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0]; + reg recomb_msb_clr; + reg recomb_msb_vld = 1'b0; + + always @(posedge clk) + // + {recomb_msb_vld, recomb_lsb_vld} <= {recomb_msb_ce_combined, recomb_lsb_ce_combined}; + + reg [46:0] recomb_msb_din; + wire [15:0] recomb_msb_dout; + + modexpng_recombinator_block recomb_x_lsb + ( + .clk (clk), + .ce (recomb_lsb_ce_combined), + .clr (recomb_lsb_clr), + .din (recomb_lsb_din), + .dout (recomb_lsb_dout) + ); + + modexpng_recombinator_block recomb_x_msb + ( + .clk (clk), + .ce (recomb_msb_ce_combined), + .clr (recomb_msb_clr), + .din (recomb_msb_din), + .dout (recomb_msb_dout) + ); + + always @(posedge clk) begin + // + recomb_lsb_ce <= x_valid_latch_lsb; + recomb_msb_ce <= x_bitmap_latch_msb[0]; + // + if (x_purge_latch_lsb) + recomb_lsb_ce_purge <= 3'b111; + else + recomb_lsb_ce_purge <= {1'b0, recomb_lsb_ce_purge[2:1]}; + // non-blocking, like every other assignment in this clocked block: recomb_msb_ce_purge[0] is read by other clocked logic through recomb_msb_ce_combined + if (x_purge_latch_msb && x_bitmap_latch_msb[0] && !x_bitmap_latch_msb[1]) + recomb_msb_ce_purge <= 2'b11; + else + recomb_msb_ce_purge <= {1'b0, recomb_msb_ce_purge[1]}; + // + end + + + always @(posedge clk) + // + if (ena_x & ena_y) begin + recomb_lsb_clr <= 1'b1; + recomb_msb_clr <= 1'b1; + end else begin + if (recomb_lsb_ce) recomb_lsb_clr <= 1'b0; + if (recomb_msb_ce) recomb_msb_clr <= 1'b0; + end + + always @(posedge clk) + // + if (x_valid_latch_lsb) + recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb]; + else + recomb_lsb_din <= {47{1'b0}}; + + always @(posedge clk) + // + if (x_bitmap_latch_msb[0]) + recomb_msb_din <= dsp_x_p_latch[0]; + else + recomb_msb_din <= {47{1'b0}}; + + + always @(posedge clk) + // + case (fsm_state_next) + // + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG, + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin + // + xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr); + xy_bitmap_lsb_adv[6] <= 
calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr);
+                xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr);
+                xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
+                //
+                xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+                xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, index_last);
+                xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+                //
+            end
+            //
+            default: begin
+                // no case item matched: load idle values into slot [6] (the index is a don't-care)
+                xy_valid_lsb_adv [6] <= 1'b0;
+                xy_bitmap_lsb_adv[6] <= {8{1'b0}};
+                xy_index_lsb_adv [6] <= 3'dX;
+                xy_purge_lsb_adv [6] <= 1'b0;
+                //
+                xy_valid_msb_adv [6] <= 1'b0;
+                xy_bitmap_msb_adv[6] <= {8{1'b0}};
+                xy_purge_msb_adv [6] <= 1'b0;
+                //
+            end
+            //
+        endcase
+
+    // Control pipeline: slots [1]..[5] of every *_adv array take the next higher slot each clock; slot [1] feeds the x_*/y_* registers.
+    always @(posedge clk) begin
+        // X and Y control registers both receive the same value from pipeline slot [1]
+        {y_valid_lsb,  x_valid_lsb}  <= {2{xy_valid_lsb_adv [1]}};
+        {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}};
+        {y_index_lsb,  x_index_lsb}  <= {2{xy_index_lsb_adv [1]}};
+        {y_purge_lsb,  x_purge_lsb}  <= {2{xy_purge_lsb_adv [1]}};
+        // *_latch_lsb registers are the LSB control registers delayed by one more clock
+        {y_valid_latch_lsb,  x_valid_latch_lsb}  <= {y_valid_lsb,  x_valid_lsb};
+        {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb};
+        {y_index_latch_lsb,  x_index_latch_lsb}  <= {y_index_lsb,  x_index_lsb};
+        {y_purge_latch_lsb,  x_purge_latch_lsb}  <= {y_purge_lsb,  x_purge_lsb};
+        //
+        {y_valid_msb,  x_valid_msb}  <= {2{xy_valid_msb_adv[1]}};
+        {y_bitmap_msb, x_bitmap_msb} <= {2{xy_bitmap_msb_adv[1]}};
+        {y_purge_msb,  x_purge_msb}  <= {2{xy_purge_msb_adv[1]}};
+        // MSB latch (X side only here): reload while x_valid_msb is high, otherwise shift the bitmap right with zero fill
+        if (x_valid_msb) begin
+            x_bitmap_latch_msb <= x_bitmap_msb;
+            x_purge_latch_msb  <= x_purge_msb;
+        end else begin
+            x_bitmap_latch_msb <= {1'b0, x_bitmap_latch_msb[7:1]};
+        end
+        //
+        // advance the pipelines by one slot; slot [6] itself is written by the case statement above
+        for (i=1; i<6; i=i+1) begin
+            xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
+            xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
+            xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
+            xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
+            //
+            xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
+            xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
+            xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
+        end
+        //
+    end
+
+    always @(posedge clk)
+        // product latch: either shift the stored words down by one index, or capture new words from dsp_x_p_split
+        if (x_bitmap_latch_msb[1]) // only shift 7 times
+            // entry i takes entry i+1; the top entry (7) is filled with X
+            for (i=0; i<8; i=i+1)
+                if (i < 7)
+                    dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
+                else
+                    dsp_x_p_latch[i] <= {47{1'bX}};
+        //
+        else if (dsp_x_ce_p_dly1)
+            // capture entry i when its LSB bitmap bit is set, or when the MSB bitmap bit is set and x_valid_msb is high
+            for (i=0; i<8; i=i+1)
+                //
+                if (x_bitmap_lsb[i])
+                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
+                else if (x_valid_msb && x_bitmap_msb[i])
+                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
+
+    reg recomb_x_lsb_dout_valid = 1'b0;
+    reg recomb_x_msb_dout_valid = 1'b0;
+    // output-valid flags: recomb_*_ce_combined delayed by one clock
+    always @(posedge clk) begin
+        recomb_x_lsb_dout_valid <= recomb_lsb_ce_combined;
+        recomb_x_msb_dout_valid <= recomb_msb_ce_combined;
+    end
+
+
+    // registers driving the fat_bram_* outputs, plus the LSB/MSB word counters used as write addresses
+    reg [ 2:0] fat_bram_xy_bank_reg;
+    reg [ 7:0] fat_bram_xy_addr_reg;
+    reg [ 7:0] fat_bram_xy_cnt_lsb;
+    reg [ 7:0] fat_bram_xy_cnt_msb;
+    reg [17:0] fat_bram_x_dout_reg;
+    reg [17:0] fat_bram_y_dout_reg;
+    reg        fat_bram_xy_dout_valid_reg = 1'b0;
+    // two-deep store for the first two MSB words (added to LSB words later, see case 2'b11)
+    reg [15:0] recomb_msb_dout_carry_0;
+    reg [15:0] recomb_msb_dout_carry_1;
+    // three-deep delay line for MSB words that arrive together with LSB words
+    reg [15:0] recomb_msb_dout_delay_0;
+    reg [15:0] recomb_msb_dout_delay_1;
+    reg [15:0] recomb_msb_dout_delay_2;
+    // write addresses travelling alongside the delayed MSB words; zero marks an empty slot
+    reg [ 7:0] recomb_msb_cnt_delay_0 = 8'd0;
+    reg [ 7:0] recomb_msb_cnt_delay_1 = 8'd0;
+    reg [ 7:0] recomb_msb_cnt_delay_2 = 8'd0;
+
+    assign fat_bram_xy_bank       = fat_bram_xy_bank_reg;
+    assign fat_bram_xy_addr       = fat_bram_xy_addr_reg;
+    assign fat_bram_x_dout        = fat_bram_x_dout_reg;
+    assign fat_bram_y_dout        = fat_bram_y_dout_reg;
+    assign fat_bram_xy_dout_valid = fat_bram_xy_dout_valid_reg;
+
+    reg rdy_reg = 1'b1;
+    reg rdy_adv = 1'b1;
+
+    assign rdy = rdy_reg;
+
+
+    always @(posedge clk)
+        // rdy drops when ena_x and ena_y are both high; otherwise it follows rdy_adv one clock later
+        if (ena_x & ena_y)
+            rdy_reg <= 1'b0;
+        else
+            rdy_reg <= rdy_adv;
+
+    always @(posedge clk)
+        // write-back sequencer: steers recombinator output words into banks BANK_FAT_ABL / BANK_FAT_ABH
+        if (ena_x & ena_y) begin
+            rdy_adv             <= 1'b0;
+            fat_bram_xy_cnt_lsb <= 8'd0;
+            fat_bram_xy_cnt_msb <= 8'd0;
+        end else begin
+            // select on {MSB word valid, LSB word valid}
+            case ({recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid})
+                //
+                2'b00: begin
+                    // no new words: drain the MSB delay line while its oldest slot holds a non-zero address
+                    if (recomb_msb_cnt_delay_2 > 8'd0) begin
+                        // rdy_adv is set when the next slot is already empty (presumably the last pending word - confirm)
+                        rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0;
+                        // literal width matches the 16-bit delay registers
+                        recomb_msb_dout_delay_0 <= {16{1'bX}};
+                        recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+                        recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+                        // zero addresses are shifted in, so draining stops once one reaches slot 2
+                        recomb_msb_cnt_delay_0 <= 8'd0;
+                        recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+                        recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+                        // write the oldest delayed word, zero-padded to 18 bits like every other write
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg       <= recomb_msb_cnt_delay_2;
+                        fat_bram_x_dout_reg        <= {2'b00, recomb_msb_dout_delay_2};
+//                      fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                    end else begin
+                        // nothing pending: no write this clock, outputs are don't-care
+                        fat_bram_xy_bank_reg       <= 3'bXXX;
+                        fat_bram_xy_addr_reg       <= 8'hXX;
+                        fat_bram_x_dout_reg        <= {18{1'bX}};
+                        fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b0;
+                        //
+                    end
+                    //
+                end
+                //
+                2'b01: begin
+                    // LSB word only: write it to BANK_FAT_ABL at the LSB counter, then advance the counter
+                    fat_bram_xy_bank_reg       <= BANK_FAT_ABL;
+                    fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_lsb;
+                    fat_bram_x_dout_reg        <= {2'b00, recomb_lsb_dout};
+//                  fat_bram_y_dout_reg
+                    fat_bram_xy_dout_valid_reg <= 1'b1;
+                    //
+                    fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+                    //
+                end
+                //
+                2'b10: begin
+                    // MSB word only: the first two words are stored in the carry registers instead of being written
+                    if (fat_bram_xy_cnt_msb < 8'd2) begin
+                        //
+                        recomb_msb_dout_carry_0 <= recomb_msb_dout;
+                        recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+                        //
+                        fat_bram_xy_bank_reg       <= 3'bXXX;
+                        fat_bram_xy_addr_reg       <= 8'hXX;
+                        fat_bram_x_dout_reg        <= {18{1'bX}};
+                        // fat_bram_y_dout_reg
+                        fat_bram_xy_dout_valid_reg <= 1'b0;
+                        //
+                    end else begin
+                        // later words go straight to BANK_FAT_ABH at the MSB counter
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_msb;
+                        fat_bram_x_dout_reg        <= {2'b00, recomb_msb_dout};
+                        // fat_bram_y_dout_reg
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                    end
+                    //
+                    fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+                    //
+                end
+                //
+                2'b11: begin
+                    // both words valid: the LSB word is written now, the MSB word is pushed into the delay line
+                    if (fat_bram_xy_cnt_lsb == index_last) begin
+                        // LSB counter at index_last: write to BANK_FAT_ABL and wrap the counter to zero
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABL;
+                        fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_lsb;
+                        fat_bram_x_dout_reg        <= {2'b00, recomb_lsb_dout};
+//                      fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                        fat_bram_xy_cnt_lsb <= 8'd0;
+                        //
+                    end else begin
+                        // otherwise write (LSB word + oldest stored carry word), a 17-bit sum, to BANK_FAT_ABH
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_lsb;
+                        fat_bram_x_dout_reg        <= {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}};
+//                      fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                        fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+                        // consume one carry word
+                        recomb_msb_dout_carry_0 <= {16{1'bX}};
+                        recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+                        //
+                    end
+                    // push the MSB word and its address into the delay line (drained later in case 2'b00)
+                    recomb_msb_dout_delay_0 <= recomb_msb_dout;
+                    recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+                    recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+                    //
+                    recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb;
+                    recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+                    recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+                    //
+                    fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+                    //
+                end
+                //
+            endcase
+            //
+        end
+
+
+
+
+endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
new file mode 100644
index 0000000..efe0ac5
--- /dev/null
+++ b/rtl/modexpng_recombinator_block.v
@@ -0,0 +1,35 @@
+module modexpng_recombinator_block    // folds a stream of 47-bit words into 16-bit output words
+(
+    clk,
+    ce, clr,
+    din, dout
+);
+
+    input         clk;
+    input         ce;     // registers update only while ce is high
+    input         clr;    // when high, x and y are loaded from din without adding previous state
+    input  [46:0] din;
+    output [15:0] dout;
+
+    reg [14:0] z;         // din[46:32] captured on the previous enabled clock
+    reg [16:0] y;         // din[31:16], plus the previous z unless clr
+    reg [17:0] x;         // din[15:0], plus the previous y and the previous x[17:16] unless clr
+    //reg [15:0] w;
+
+    //assign dout = w;
+    assign dout = x[15:0];
+
+    wire [14:0] din_z = din[46:32]; // TODO: maybe determine more precise bound here
+    wire [15:0] din_y = din[31:16];
+    wire [15:0] din_x = din[15: 0];
+
+    always @(posedge clk)
+        // all right-hand sides use the register values from before this clock edge (non-blocking assignments)
+        if (ce) begin
+            z <= din_z;
+            y <= clr ? {1'b0,  din_y} : {1'b0,  din_y} + {2'b00, z};
+            x <= clr ? {2'b00, din_x} : {2'b00, din_x} + {1'b0, y} + {{16{1'b0}}, x[17:16]};
+            //w <= clr ? {16{1'bX}} : x[15:0];
+        end
+
+endmodule |