aboutsummaryrefslogtreecommitdiff
path: root/rtl
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2019-10-01 15:01:43 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2019-10-01 15:01:43 +0300
commit29fb6afd018c601a2e0c7376656d5e37beb565d6 (patch)
treedc11ee0c8e5a30113052254be23594da74a8a572 /rtl
parentec07464d239f7f6379a682ac57b58b863d3f0374 (diff)
Started working on the pipelined Montgomery modular multiplier. Currently can
do the "square" part of the multiplication, i.e. compute the twice larger intermediate product AB = A * B.
Diffstat (limited to 'rtl')
-rw-r--r--rtl/dev/temp.txt384
-rw-r--r--rtl/dsp/dsp_array.v111
-rw-r--r--rtl/dsp/dsp_slice.v125
-rw-r--r--rtl/modexpng_mac.v54
-rw-r--r--rtl/modexpng_mac_array.v116
-rw-r--r--rtl/modexpng_mem.v93
-rw-r--r--rtl/modexpng_mmm_col_index.v90
-rw-r--r--rtl/modexpng_mmm_din_addr.v167
-rw-r--r--rtl/modexpng_mmm_dout_addr.v167
-rw-r--r--rtl/modexpng_mmm_fsm.vh24
-rw-r--r--rtl/modexpng_mmm_pad.v153
-rw-r--r--rtl/modexpng_mmm_transporter.v157
-rw-r--r--rtl/modexpng_mmm_x8_dual.v550
-rw-r--r--rtl/modexpng_parameters.vh39
-rw-r--r--rtl/modexpng_parameters_x8.vh1
-rw-r--r--rtl/modexpng_part_recombinator.v623
-rw-r--r--rtl/modexpng_recombinator_block.v35
17 files changed, 2889 insertions, 0 deletions
diff --git a/rtl/dev/temp.txt b/rtl/dev/temp.txt
new file mode 100644
index 0000000..987bd86
--- /dev/null
+++ b/rtl/dev/temp.txt
@@ -0,0 +1,384 @@
+ //
+ // Helper Functions
+ //
+ /*
+ function [INDEX_WIDTH-1:0] calc_preset_a_index;
+ input [INDEX_WIDTH-4:0] col_in;
+ input integer x_in;
+ integer index_out;
+ begin
+ index_out = col_in * NUM_MULTS + x_in;
+ calc_preset_a_index = index_out[INDEX_WIDTH-1:0];
+ end
+ endfunction
+
+ function [INDEX_WIDTH-1:0] calc_rotate_a_index;
+ input [INDEX_WIDTH-1:0] current_index_in;
+ input [INDEX_WIDTH-1:0] last_index_in;
+ begin
+ if (current_index_in > {INDEX_WIDTH{1'b0}})
+ calc_rotate_a_index = current_index_in - 1'b1;
+ else
+ calc_rotate_a_index = last_index_in;
+ end
+ endfunction
+ */
+
+ /*
+ //
+ // Narrow Counters
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_narrow_reg;
+ reg [INDEX_WIDTH-1:0] din_addr_narrow_dly;
+ localparam [INDEX_WIDTH-1:0] din_addr_narrow_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] din_addr_narrow_next = (din_addr_narrow_reg < index_last) ?
+ din_addr_narrow_reg + 1'b1 : din_addr_narrow_zero;
+ wire din_addr_narrow_done = din_addr_narrow_reg == index_last;
+
+ assign din_addr_narrow = din_addr_narrow_reg;
+
+ always @(posedge clk)
+ //
+ din_addr_narrow_dly <= din_addr_narrow_reg;
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: din_addr_narrow_reg <= din_addr_narrow_next;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_narrow_reg <= din_addr_narrow_next;
+ endcase
+
+
+ //
+ // Helper Functions
+ //
+ function [NUM_MULTS-1:0] calc_mac_clear_bitmask;
+ input [2:0] t;
+ begin
+ case (t)
+ 3'd0: calc_mac_clear_bitmask = 8'b00000001;
+ 3'd1: calc_mac_clear_bitmask = 8'b00000010;
+ 3'd2: calc_mac_clear_bitmask = 8'b00000100;
+ 3'd3: calc_mac_clear_bitmask = 8'b00001000;
+ 3'd4: calc_mac_clear_bitmask = 8'b00010000;
+ 3'd5: calc_mac_clear_bitmask = 8'b00100000;
+ 3'd6: calc_mac_clear_bitmask = 8'b01000000;
+ 3'd7: calc_mac_clear_bitmask = 8'b10000000;
+ endcase
+ end
+ endfunction
+
+ function [NUM_MULTS:0] calc_mac_clear_square;
+ input [INDEX_WIDTH-4:0] current_col_index;
+ input [INDEX_WIDTH-1:0] b_addr_prev;
+ begin
+ if (b_addr_prev[INDEX_WIDTH-1:3] == current_col_index)
+ calc_mac_clear_square = {1'b0, calc_mac_clear_bitmask(b_addr_prev[2:0])};
+ else
+ calc_mac_clear_square = {1'b0, {NUM_MULTS{1'b0}}};
+ end
+ endfunction
+
+
+ //
+ // Wide Counters
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_wide_reg[0:NUM_MULTS-1];
+
+ integer xi;
+ always @(posedge clk)
+ //
+ for (xi=0; xi<NUM_MULTS; xi=xi+1)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(0, xi);
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(col_index + 1'b1, xi);
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_wide_reg[xi] <= calc_rotate_a_index(din_addr_wide_reg[xi], index_last);
+ //
+ endcase
+
+
+ //
+ // Enables
+ //
+ reg din_ena_narrow_reg = 1'b0;
+ reg [NUM_MULTS-1:0] din_ena_wide_reg = {NUM_MULTS{1'b0}};
+
+ assign din_ena_narrow = din_ena_narrow_reg;
+ assign din_ena_wide = din_ena_wide_reg;
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) din_ena_narrow_reg <= 1'b0;
+ else case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_narrow_reg <= 1'b1;
+ default: din_ena_narrow_reg <= 1'b0;
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+ else case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_wide_reg <= {NUM_MULTS{1'b1}};
+ default: din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+ endcase
+
+
+ //
+ // Modes
+ //
+ reg [2-1:0] din_mode_wide_reg;
+ reg [2-1:0] din_mode_narrow_reg;
+ reg [2-1:0] dout_mode_wide_reg;
+ reg [2-1:0] dout_mode_narrow_reg;
+
+ assign din_mode_wide = din_mode_wide_reg;
+ assign din_mode_narrow = din_mode_narrow_reg;
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_wide_reg <= MODEXPNG_MODE_A;
+ default: din_mode_wide_reg <= 2'bXX;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_narrow_reg <= MODEXPNG_MODE_B;
+ default: din_mode_narrow_reg <= 2'bXX;
+ endcase
+
+
+ //
+ // MAC Array
+ //
+ wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_a[0:NUM_MULTS];
+ wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_b;
+ reg [ NUM_MULTS :0] mac_ce;
+ reg [ NUM_MULTS :0] mac_clr;
+ wire [ MODEXPNG_MAC_WIDTH-1:0] mac_p[0:NUM_MULTS];
+ reg [ NUM_MULTS :0] mac_rdy_lsb;
+ reg [ NUM_MULTS :0] mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1:0];
+
+ //reg [ NUM_MULTS :0] mac_ce_dly[MODEXPNG_MAC_LATENCY-1:0];
+ //wire [ NUM_MULTS :0] mac_rdy;
+
+
+
+
+
+ assign mac_din_b = din_narrow;
+
+
+ genvar x;
+ generate for (x=0; x<=NUM_MULTS; x=x+1)
+ begin : gen_macs
+ //
+ //assign mac_rdy[x] = mac_ce_dly[MODEXPNG_MAC_LATENCY-1][x];
+ //
+ modexpng_mac mac_inst
+ (
+ .clk (clk),
+ .ce (mac_ce[x]),
+ .clr (mac_clr[x]),
+ .a (mac_din_a[x]),
+ .b (mac_din_b),
+ .p (mac_p[x])
+ );
+ //
+ end
+ //
+ endgenerate
+
+ generate for (x=0; x<NUM_MULTS; x=x+1)
+ begin : gen_mac_din_a
+ //
+ assign mac_din_a[x] = din_wide[x*MODEXPNG_WORD_WIDTH+:MODEXPNG_WORD_WIDTH];
+ //
+ end
+ endgenerate
+
+ generate for (x=0; x<NUM_MULTS; x=x+1)
+ begin : gen_din_addr_wide
+ //
+ assign din_addr_wide[x*INDEX_WIDTH+:INDEX_WIDTH] = din_addr_wide_reg[x];
+ //
+ end
+ endgenerate
+
+
+ //
+ // MAC Clock Enable Logic
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+ else case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_ce <= {1'b0, {NUM_MULTS{1'b1}}};
+ default: mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+ endcase
+
+
+ //
+ // MAC Valid Logic
+ //
+ integer y;
+
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_rdy_lsb_dly[0][xi] <= mac_rdy_lsb[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_rdy_lsb_dly[y][xi] <= mac_rdy_lsb_dly[y-1][xi];
+ end
+
+ always @(posedge clk) begin
+ //
+ fsm_state_dly[0] <= fsm_state;
+ for (y=1; y<=MODEXPNG_MAC_LATENCY; y=y+1)
+ fsm_state_dly[y] <= fsm_state_dly[y-1];
+ end
+
+ */
+
+ /*
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_ce_dly[0][xi] <= mac_ce[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_ce_dly[y][xi] <= mac_ce_dly[y-1][xi];
+ end
+ */
+ /*
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_clr_dly[0][xi] <= mac_clr[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_clr_dly[y][xi] <= mac_clr_dly[y-1][xi];
+ end
+ */
+
+ /*
+ //
+ // MAC Clear Logic
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_clr <= {1'b0, {NUM_MULTS{1'b1}}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_clr <= calc_mac_clear_square(col_index, din_addr_narrow_dly);
+ default: mac_clr <= {1'bX, {NUM_MULTS{1'bX}}};
+ endcase
+
+
+ //
+ // MAC Ready Logic
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_rdy_lsb <= calc_mac_clear_square(col_index, din_addr_narrow);
+ default: mac_rdy_lsb <= {1'bX, {NUM_MULTS{1'bX}}};
+ endcase
+
+
+
+ //
+ // Recombinators
+ //
+ reg rcmb_lsb_ce;
+ reg rcmb_lsb_clr;
+ reg [MODEXPNG_MAC_WIDTH-1: 0] rcmb_lsb_din;
+ wire [15: 0] rcmb_lsb_dout;
+
+ modexpng_part_recombinator recomb_lsb
+ (
+ .clk (clk),
+ .ce (rcmb_lsb_ce),
+ .clr (rcmb_lsb_clr),
+ .din (rcmb_lsb_din),
+ .dout (rcmb_lsb_dout)
+ );
+
+
+ reg calc_rcmb_lsb_ce;
+ always @*
+ //
+ calc_rcmb_lsb_ce = | mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0];
+
+ reg [MODEXPNG_MAC_WIDTH-1:0] calc_rcmb_lsb_din;
+
+ always @*
+ //
+ casez (mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0])
+ 8'b00000001: calc_rcmb_lsb_din = mac_p[0];
+ 8'b00000010: calc_rcmb_lsb_din = mac_p[1];
+ 8'b00000100: calc_rcmb_lsb_din = mac_p[2];
+ 8'b00001000: calc_rcmb_lsb_din = mac_p[3];
+ 8'b00010000: calc_rcmb_lsb_din = mac_p[4];
+ 8'b00100000: calc_rcmb_lsb_din = mac_p[5];
+ 8'b01000000: calc_rcmb_lsb_din = mac_p[6];
+ 8'b10000000: calc_rcmb_lsb_din = mac_p[7];
+ default: calc_rcmb_lsb_din = {MODEXPNG_MAC_WIDTH{1'bX}};
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0)
+ rcmb_lsb_ce <= 1'b0;
+ else case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_ce <= calc_rcmb_lsb_ce;
+ default: rcmb_lsb_ce <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: rcmb_lsb_clr <= 1'b1;
+ default: rcmb_lsb_clr <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_din <= calc_rcmb_lsb_din;
+ default: rcmb_lsb_din <= {MODEXPNG_MAC_WIDTH{1'bX}};
+ endcase
+
+
+
+*/
diff --git a/rtl/dsp/dsp_array.v b/rtl/dsp/dsp_array.v
new file mode 100644
index 0000000..178f87f
--- /dev/null
+++ b/rtl/dsp/dsp_array.v
@@ -0,0 +1,111 @@
+//
+// dsp_array
+//
+// Array of DSP48E1-based multiplier slices built from (NUM_MULTS/2) pairs.
+// Each pair consists of a "direct" slice, which takes its A and B operands
+// from the module inputs, and a "cascade" slice, which takes its operands
+// from the direct slice's cascade outputs (casc_a / casc_b).
+//
+// Ports:
+//   clk      - clock
+//   ce_a     - clock enable for the first A register stage of the direct
+//              slices; delayed copies (ce_a1, ce_a2) drive later stages
+//   ce_b     - clock enable for the first B register stage of the direct
+//              slices; a delayed copy (ce_b1) drives the second stage
+//   ce_m     - clock enable of the multiplier (M) register, all slices
+//   ce_p     - clock enable of the output (P) register, all slices
+//   ce_mode  - clock enable of the OPMODE register, all slices
+//   mode_z   - one bit per slice, routed into OPMODE[5] of that slice
+//              (presumably selects Z = P, i.e. accumulate, vs. Z = 0 --
+//              confirm against the DSP48E1 OPMODE table)
+//   a        - four 18-bit A operands, one per slice pair
+//   b        - single 17-bit B operand shared by all pairs
+//   p        - eight 47-bit results; slice pair z drives words 2*z and 2*z+1
+//
+// NOTE(review): the port widths (8, 4*18, 8*47) are hard-coded while the
+// generate loop is sized by NUM_MULTS; this assumes NUM_MULTS == 8.
+//
+module dsp_array
+(
+    input               clk,
+
+    input               ce_a,
+    input               ce_b,
+    input               ce_m,
+    input               ce_p,
+    input               ce_mode,
+
+    input  [8   -1:0]   mode_z,
+
+    input  [4*18-1:0]   a,
+    input  [1*17-1:0]   b,
+    output [8*47-1:0]   p
+);
+
+    `include "../modexpng_parameters_x8.vh"
+
+    // Cascade links from the direct slice to the cascade slice of each pair.
+    wire [17:0] casc_a[0:3];
+    wire [16:0] casc_b[0:3];
+
+    // Clock enables for successive operand register stages: each stage is
+    // enabled one clock later than the previous one.
+    wire ce_a0 = ce_a;
+    reg  ce_a1 = 1'b0;
+    reg  ce_a2 = 1'b0;
+
+    wire ce_b0 = ce_b;
+    reg  ce_b1 = 1'b0;
+
+    always @(posedge clk) begin
+        ce_a1 <= ce_a0;
+        ce_a2 <= ce_a1;
+        ce_b1 <= ce_b0;
+    end
+
+
+    genvar z;
+    generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+        //
+        begin : DSP48E1
+            //
+            // Direct slice: operands come from the a/b module inputs; the
+            // cascade inputs are unused and tied to zero.
+            //
+            dsp_slice #
+            (
+                .AB_INPUT("DIRECT"),
+                .B_REG(2)
+            )
+            dsp_direct
+            (
+                .clk        (clk),
+
+                .ce_a1      (ce_a0),
+                .ce_b1      (ce_b0),
+                .ce_a2      (ce_a1),
+                .ce_b2      (ce_b1),
+                .ce_m       (ce_m),
+                .ce_p       (ce_p),
+                .ce_mode    (ce_mode),
+
+                .a          (a[z*18+:18]),
+                .b          (b),
+                .p          (p[47*2*z+:47]),
+
+                .inmode     (5'b00000),
+                .opmode     ({1'b0, mode_z[2*z], 1'b0, 2'b01, 2'b01}),
+                .alumode    (4'b0000),
+
+                // casc_a_in is an 18-bit port: tie off with a matching
+                // 18-bit constant (was 17 bits wide).
+                .casc_a_in  ({18{1'b0}}),
+                .casc_b_in  ({17{1'b0}}),
+
+                .casc_a_out (casc_a[z]),
+                .casc_b_out (casc_b[z])
+            );
+            //
+            // Cascade slice: AB_INPUT is "CASCADE", so operands are taken
+            // from casc_a[z]/casc_b[z]; its register stages are enabled one
+            // clock later than those of the direct slice.
+            //
+            dsp_slice #
+            (
+                .AB_INPUT("CASCADE"),
+                .B_REG(1)
+            )
+            dsp_cascade
+            (
+                .clk        (clk),
+
+                .ce_a1      (ce_a1),
+                .ce_b1      (1'b0),
+                .ce_a2      (ce_a2),
+                .ce_b2      (ce_b1),
+                .ce_m       (ce_m),
+                .ce_p       (ce_p),
+                .ce_mode    (ce_mode),
+
+                .a          (a[z*18+:18]),
+                .b          (b),
+                .p          (p[47*(2*z+1)+:47]),
+
+                .inmode     (5'b00000),
+                .opmode     ({1'b0, mode_z[2*z+1], 1'b0, 2'b01, 2'b01}),
+                .alumode    (4'b0000),
+
+                .casc_a_in  (casc_a[z]),
+                .casc_b_in  (casc_b[z]),
+
+                // End of the two-slice chain: cascade outputs are unused.
+                .casc_a_out (),
+                .casc_b_out ()
+            );
+            //
+        end
+        //
+    endgenerate
+
+
+endmodule
diff --git a/rtl/dsp/dsp_slice.v b/rtl/dsp/dsp_slice.v
new file mode 100644
index 0000000..9f1298b
--- /dev/null
+++ b/rtl/dsp/dsp_slice.v
@@ -0,0 +1,125 @@
+//
+// dsp_slice
+//
+// Thin wrapper around a single Xilinx DSP48E1 primitive. It narrows the
+// primitive's ports to the widths used by this design (18-bit A, 17-bit B,
+// 47-bit P) by zero-padding inputs and discarding the unused upper output
+// bits, and it ties off every feature that is not used (C/D ports,
+// pre-adder, pattern detector, carry logic, resets).
+//
+// Parameters:
+//   AB_INPUT - "DIRECT" or "CASCADE"; applied to both A_INPUT and B_INPUT
+//   B_REG    - number of B pipeline registers (passed to BREG)
+//
+// The `parameter` keyword is spelled out in the parameter port list:
+// omitting it is accepted only by SystemVerilog parsers, not by strict
+// Verilog-2001 tools.
+//
+module dsp_slice #
+(
+    parameter AB_INPUT = "DIRECT",
+    parameter B_REG    = 2
+)
+(
+    input           clk,
+    input           ce_a1,
+    input           ce_b1,
+    input           ce_a2,
+    input           ce_b2,
+    input           ce_m,
+    input           ce_p,
+    input           ce_mode,
+    input  [17:0]   a,
+    input  [16:0]   b,
+    output [46:0]   p,
+    input  [ 4:0]   inmode,
+    input  [ 6:0]   opmode,
+    input  [ 3:0]   alumode,
+    input  [17:0]   casc_a_in,
+    input  [16:0]   casc_b_in,
+    output [17:0]   casc_a_out,
+    output [16:0]   casc_b_out
+);
+
+    // Sinks for the primitive's output bits above the widths exposed here
+    // (ACOUT is 30 bits, BCOUT is 18 bits, P is 48 bits).
+    wire [30-18-1:0] casc_a_dummy;
+    wire [18-17-1:0] casc_b_dummy;
+    wire [48-47-1:0] p_dummy;
+
+    DSP48E1 #
+    (
+        .AREG               (2),
+        .BREG               (B_REG),
+        .CREG               (0),
+        .DREG               (0),
+        .ADREG              (0),
+        .MREG               (1),
+        .PREG               (1),
+        .ACASCREG           (1),
+        .BCASCREG           (1),
+        .INMODEREG          (0),
+        .OPMODEREG          (1),
+        .ALUMODEREG         (0),
+        .CARRYINREG         (0),
+        .CARRYINSELREG      (0),
+
+        .A_INPUT            (AB_INPUT),
+        .B_INPUT            (AB_INPUT),
+
+        .USE_DPORT          ("FALSE"),
+        .USE_MULT           ("DYNAMIC"),
+        .USE_SIMD           ("ONE48"),
+
+        .MASK               (48'h3fffffffffff),
+        .PATTERN            (48'h000000000000),
+        .SEL_MASK           ("MASK"),
+        .SEL_PATTERN        ("PATTERN"),
+
+        .USE_PATTERN_DETECT ("NO_PATDET"),
+        .AUTORESET_PATDET   ("NO_RESET")
+    )
+    DSP48E1_inst
+    (
+        .CLK            (clk),
+
+        // Clock enables: only the A/B/M/P/OPMODE registers are ever enabled.
+        .CEA1           (ce_a1),
+        .CEB1           (ce_b1),
+        .CEA2           (ce_a2),
+        .CEB2           (ce_b2),
+        .CEAD           (1'b0),
+        .CEC            (1'b0),
+        .CED            (1'b0),
+        .CEM            (ce_m),
+        .CEP            (ce_p),
+        .CEINMODE       (1'b0),
+        .CECTRL         (ce_mode),
+        .CEALUMODE      (1'b0),
+        .CECARRYIN      (1'b0),
+
+        // Operands are zero-padded up to the primitive's native widths.
+        .A              ({{(30-18){1'b0}}, a}),
+        .B              ({{(18-17){1'b0}}, b}),
+        .C              ({48{1'b0}}),
+        .D              ({25{1'b0}}),
+        .P              ({p_dummy, p}),
+
+        .INMODE         (inmode),
+        .OPMODE         (opmode),
+        .ALUMODE        (alumode),
+
+        .ACIN           ({{(30-18){1'b0}}, casc_a_in}),
+        .BCIN           ({{(18-17){1'b0}}, casc_b_in}),
+        .ACOUT          ({casc_a_dummy, casc_a_out}),
+        .BCOUT          ({casc_b_dummy, casc_b_out}),
+        .PCIN           ({48{1'b0}}),
+        .PCOUT          (),
+        .CARRYCASCIN    (1'b0),
+        .CARRYCASCOUT   (),
+
+        // No register resets are used.
+        .RSTA           (1'b0),
+        .RSTB           (1'b0),
+        .RSTC           (1'b0),
+        .RSTD           (1'b0),
+        .RSTM           (1'b0),
+        .RSTP           (1'b0),
+        .RSTINMODE      (1'b0),
+        .RSTCTRL        (1'b0),
+        .RSTALUMODE     (1'b0),
+        .RSTALLCARRYIN  (1'b0),
+
+        .UNDERFLOW      (),
+        .OVERFLOW       (),
+        .PATTERNDETECT  (),
+        .PATTERNBDETECT (),
+
+        .CARRYIN        (1'b0),
+        .CARRYOUT       (),
+        .CARRYINSEL     (3'b000),
+
+        .MULTSIGNIN     (1'b0),
+        .MULTSIGNOUT    ()
+    );
+
+
+endmodule
diff --git a/rtl/modexpng_mac.v b/rtl/modexpng_mac.v
new file mode 100644
index 0000000..9105dab
--- /dev/null
+++ b/rtl/modexpng_mac.v
@@ -0,0 +1,54 @@
+//
+// modexpng_mac
+//
+// Behavioural 17x17-bit multiply-accumulate unit with a three-stage pipeline:
+//
+//   stage 0 (enabled by ce)      : capture operands into a_reg / b_reg
+//   stage 1 (enabled by ce_dly1) : m_reg = a_reg * b_reg
+//   stage 2 (enabled by ce_dly2) : p_reg = clr ? m_reg : p_reg + m_reg
+//
+// The A operand is taken from a_casc_in instead of a_in when casc_a is set,
+// and the captured A operand is forwarded on a_casc_out. The clr flag is
+// pipelined alongside the data so that it acts on the product of the
+// operands it was presented with.
+//
+module modexpng_mac
+(
+    clk,
+    ce, clr,
+    casc_a,
+    a_in, b_in, p_out,
+    a_casc_in, a_casc_out
+);
+
+    input         clk;
+    input         ce;
+    input         clr;
+    input         casc_a;
+    input  [16:0] a_in;
+    input  [16:0] b_in;
+    output [46:0] p_out;
+    input  [16:0] a_casc_in;
+    output [16:0] a_casc_out;
+
+    // Pipeline registers.
+    reg [16:0] a_reg;
+    reg [16:0] b_reg;
+    reg [33:0] m_reg;
+    reg [46:0] p_reg;
+
+    // Enable and clear flags travelling with the data.
+    reg        ce_dly1;
+    reg        ce_dly2;
+    reg        clr_dly1;
+    reg        clr_dly2;
+
+    // Next-state values for each stage.
+    wire [16:0] a_mux     = casc_a ? a_casc_in : a_in;
+    wire [33:0] m_next    = {{17{1'b0}}, a_reg} * {{17{1'b0}}, b_reg};
+    wire [46:0] m_reg_ext = {{13{1'b0}}, m_reg};
+    wire [46:0] p_next    = clr_dly2 ? m_reg_ext : p_reg + m_reg_ext;
+
+    assign a_casc_out = a_reg;
+    assign p_out      = p_reg;
+
+    always @(posedge clk) begin
+        // The enable pipeline shifts on every clock.
+        ce_dly1 <= ce;
+        ce_dly2 <= ce_dly1;
+
+        // Stage 0: operand and clear capture.
+        if (ce) begin
+            a_reg    <= a_mux;
+            b_reg    <= b_in;
+            clr_dly1 <= clr;
+        end
+
+        // Stage 1: multiply.
+        if (ce_dly1) begin
+            m_reg    <= m_next;
+            clr_dly2 <= clr_dly1;
+        end
+
+        // Stage 2: load or accumulate.
+        if (ce_dly2)
+            p_reg <= p_next;
+    end
+
+endmodule
diff --git a/rtl/modexpng_mac_array.v b/rtl/modexpng_mac_array.v
new file mode 100644
index 0000000..067929e
--- /dev/null
+++ b/rtl/modexpng_mac_array.v
@@ -0,0 +1,116 @@
+//
+// modexpng_mac_array
+//
+// NUM_MULTS modexpng_mac units (LSB, INT[1..NUM_MULTS-2], MSB) plus one
+// auxiliary unit (AUX). All units share the clock and the B operand (b_in).
+// The NUM_MULTS main units share one clock enable (ce) and have individual
+// clear bits (clr[z]); the AUX unit has its own ce_aux / clr_aux.
+//
+// Each unit forwards its captured A operand to the next unit through the
+// a_casc_int chain. Unit z (z >= 1) uses the forwarded operand instead of
+// its own word of a_in when casc_a[z-1] is set; the AUX unit does the same
+// under control of casc_a_aux. The LSB unit has no predecessor, so its
+// cascade select is tied low.
+//
+// NOTE(review): WORD_WIDTH and MAC_WIDTH are not defined in this file; they
+// presumably come from the included parameter headers and must match the
+// 17-bit / 47-bit widths hard-coded in modexpng_mac -- confirm.
+//
+module modexpng_mac_array
+(
+ clk,
+ ce, clr,
+ ce_aux, clr_aux,
+ casc_a, casc_a_aux,
+ a_in, b_in, p_out,
+ a_in_aux, p_out_aux
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input ce;
+ input [NUM_MULTS -1:0] clr;
+ input ce_aux;
+ input clr_aux;
+ input [NUM_MULTS -2:0] casc_a;
+ input casc_a_aux;
+ input [NUM_MULTS * WORD_WIDTH -1:0] a_in;
+ input [ 1 * WORD_WIDTH -1:0] b_in;
+ output [NUM_MULTS * MAC_WIDTH -1:0] p_out;
+ input [ 1 * WORD_WIDTH -1:0] a_in_aux;
+ output [ 1 * MAC_WIDTH -1:0] p_out_aux;
+
+
+ //
+ // A-Cascade Paths
+ //
+ // a_casc_int[z] carries the A operand captured by unit z to unit z+1;
+ // a_casc_int_aux carries the MSB unit's operand to the AUX unit.
+ //
+ wire [WORD_WIDTH-1:0] a_casc_int[0:NUM_MULTS-2];
+ wire [WORD_WIDTH-1:0] a_casc_int_aux;
+
+
+ //
+ // LSB
+ //
+ // First unit of the chain: always uses its own a_in word (casc_a tied
+ // low, cascade input tied to zero).
+ //
+ modexpng_mac mac_lsb
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[0]),
+ .casc_a (1'b0),
+ .a_in (a_in[0+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[0+:MAC_WIDTH]),
+ .a_casc_in ({WORD_WIDTH{1'b0}}),
+ .a_casc_out (a_casc_int[0])
+ );
+
+
+ //
+ // INT
+ //
+ // Intermediate units 1 .. NUM_MULTS-2; unit z selects between its own
+ // a_in word and the operand forwarded by unit z-1 via casc_a[z-1].
+ //
+ genvar z;
+ generate for (z=1; z<(NUM_MULTS-1); z=z+1)
+ begin : gen_modexpng_mac_int
+ modexpng_mac mac_int
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[z]),
+ .casc_a (casc_a[z-1]),
+ .a_in (a_in[z*WORD_WIDTH+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[z*MAC_WIDTH+:MAC_WIDTH]),
+ .a_casc_in (a_casc_int[z-1]),
+ .a_casc_out (a_casc_int[z])
+ );
+ end
+ endgenerate
+
+
+ //
+ // MSB
+ //
+ // Last main unit; its cascade output feeds the AUX unit.
+ //
+ modexpng_mac mac_msb
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[NUM_MULTS-1]),
+ .casc_a (casc_a[NUM_MULTS-2]),
+ .a_in (a_in[(NUM_MULTS-1)*WORD_WIDTH+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[(NUM_MULTS-1)*MAC_WIDTH+:MAC_WIDTH]),
+ .a_casc_in (a_casc_int[NUM_MULTS-2]),
+ .a_casc_out (a_casc_int_aux)
+ );
+
+
+ //
+ // AUX
+ //
+ // Extra unit with independent enable, clear and cascade select; it is
+ // the end of the cascade chain, so its cascade output is left open.
+ //
+ modexpng_mac mac_aux
+ (
+ .clk (clk),
+ .ce (ce_aux),
+ .clr (clr_aux),
+ .casc_a (casc_a_aux),
+ .a_in (a_in_aux),
+ .b_in (b_in),
+ .p_out (p_out_aux),
+ .a_casc_in (a_casc_int_aux),
+ .a_casc_out ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mem.v b/rtl/modexpng_mem.v
new file mode 100644
index 0000000..ca89214
--- /dev/null
+++ b/rtl/modexpng_mem.v
@@ -0,0 +1,93 @@
+//
+// TODO: Add license text!
+//
+
+//
+// modexpng_mem
+//
+// Dual-port memory, 2**MEM_ADDR_BITS words of MEM_WIDTH bits, with a
+// write-only port A and a read-only port B. Port B has two register stages:
+// the memory read register (loaded when b_en is high) followed by an output
+// register (loaded when b_reg_en is high).
+//
+// Parameters:
+//   MEM_WIDTH     - word width in bits
+//   MEM_ADDR_BITS - address width in bits
+//
+module modexpng_mem #
+(
+ parameter MEM_WIDTH = 17,
+ parameter MEM_ADDR_BITS = 6
+)
+(
+ input clk,
+
+ input [MEM_ADDR_BITS-1:0] a_addr,
+ input a_en,
+ input a_wr,
+ input [MEM_WIDTH -1:0] a_in,
+ output [MEM_WIDTH -1:0] a_out,
+
+ input [MEM_ADDR_BITS-1:0] b_addr,
+ input b_en,
+ input b_reg_en,
+ output [MEM_WIDTH -1:0] b_out
+);
+
+
+ //
+ // BRAM
+ //
+ (* RAM_STYLE="BLOCK" *)
+ reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+ //
+ // Initialization for Simulation
+ //
+ /*
+ integer c;
+ initial begin
+ for (c=0; c<(2**MEM_ADDR_BITS); c=c+1)
+ bram[c] = {MEM_WIDTH{1'b0}};
+ end
+ */
+
+
+
+ //
+ // Output Registers
+ //
+ // bram_b is the memory read register, bram_b_reg the second-stage
+ // output register that drives b_out.
+ //
+ reg [MEM_WIDTH-1:0] bram_b;
+ reg [MEM_WIDTH-1:0] bram_b_reg;
+
+ // Port A never reads the memory: a_out is tied to a constant marker.
+ // NOTE(review): the literal is 32 bits wide, so it is truncated (or
+ // zero-extended) to MEM_WIDTH bits on assignment.
+ assign a_out = 32'hDEADCE11;
+ assign b_out = bram_b_reg;
+
+
+ //
+ // Note that when both ports access the same location, a conflict can
+ // potentially arise. See Xilinx UG473 (pages 19-20, "Conflict
+ // Avoidance") for more information. To avoid that in our configuration,
+ // the write port must be coded to operate in READ_FIRST mode. If the
+ // write port is overwriting the same address the read port is accessing,
+ // the write port must read the previously stored data (not the data it
+ // is writing, as that would be WRITE_FIRST mode).
+ //
+
+
+ //
+ // Write-Only Port A
+ //
+ always @(posedge clk)
+ //
+ if (a_en)
+ //
+ if (a_wr) bram[a_addr] <= a_in;
+
+
+ //
+ // Read-Only Port B
+ //
+ always @(posedge clk)
+ //
+ if (b_en)
+ //
+ bram_b <= bram[b_addr];
+
+ // Second pipeline stage: output register with its own enable.
+ always @(posedge clk)
+ //
+ if (b_reg_en)
+ //
+ bram_b_reg <= bram_b;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_col_index.v b/rtl/modexpng_mmm_col_index.v
new file mode 100644
index 0000000..b904795
--- /dev/null
+++ b/rtl/modexpng_mmm_col_index.v
@@ -0,0 +1,90 @@
+//
+// modexpng_mmm_col_index
+//
+// Column index counter for the "square" part of the multiplication.
+//
+// When the FSM is about to enter MULT_SQUARE_COL_0_TRIG the counter is
+// cleared and the index of the last column (upper bits of index_last) is
+// latched. When the FSM is about to enter MULT_SQUARE_COL_N_TRIG the counter
+// advances by one. In every other state the counter holds its value.
+//
+// Outputs:
+//   col_index      - current column index
+//   col_index_done - high while col_index equals the latched last column
+//   col_index_zero - constant zero of column index width
+//   col_index_next - col_index + 1
+//   col_index_prev - col_index delayed by one clock
+//
+module modexpng_mmm_col_index
+(
+    clk,
+    index_last,
+    fsm_state_next,
+    col_index,
+    col_index_done,
+    col_index_zero,
+    col_index_next,
+    col_index_prev
+);
+
+
+    //
+    // Includes
+    //
+    //`include "modexpng_parameters.vh"
+    //`include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+
+
+    //
+    // Ports
+    //
+    input                        clk;
+    input  [    INDEX_WIDTH-1:0] index_last;
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;
+    output [    INDEX_WIDTH-4:0] col_index;
+    output                       col_index_done;
+    output [    INDEX_WIDTH-4:0] col_index_zero;
+    output [    INDEX_WIDTH-4:0] col_index_next;
+    output [    INDEX_WIDTH-4:0] col_index_prev;
+
+
+    //
+    // State
+    //
+    reg [INDEX_WIDTH-4:0] col_index_reg;    // current column
+    reg [INDEX_WIDTH-4:0] col_index_last;   // last column, latched at start
+    reg [INDEX_WIDTH-4:0] col_index_dly;    // previous-cycle column
+
+
+    //
+    // Output Mapping and Derived Values
+    //
+    assign col_index_zero = {(INDEX_WIDTH-3){1'b0}};
+    assign col_index      = col_index_reg;
+    assign col_index_next = col_index + 1'b1;
+    assign col_index_prev = col_index_dly;
+    assign col_index_done = (col_index == col_index_last);
+
+
+    //
+    // Counter Update
+    //
+    always @(posedge clk)
+        //
+        if (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG) begin
+            // start of a new multiplication: restart from column zero
+            col_index_last <= index_last[INDEX_WIDTH-1:3];
+            col_index_reg  <= col_index_zero;
+        end else if (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG)
+            // move on to the next column
+            col_index_reg  <= col_index_next;
+
+
+    //
+    // One-Cycle Delay
+    //
+    always @(posedge clk)
+        //
+        col_index_dly <= col_index;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_din_addr.v b/rtl/modexpng_mmm_din_addr.v
new file mode 100644
index 0000000..565c7e0
--- /dev/null
+++ b/rtl/modexpng_mmm_din_addr.v
@@ -0,0 +1,167 @@
+//
+// modexpng_mmm_din_addr
+//
+// Read address, bank and enable generator for the operand input side during
+// the "square" part of the multiplication. All registers are updated from
+// fsm_state_next, i.e. they change together with the FSM state register.
+//
+// On MULT_SQUARE_COL_0_TRIG the word address is set to column zero and the
+// last word index is latched from index_last; on MULT_SQUARE_COL_N_TRIG it
+// is set to the start of the next column. While BUSY, the word address
+// steps backwards (wrapping from zero to the latched last index) and the
+// cycle counter counts upwards from zero.
+//
+// Outputs:
+//   din_addr                - upper bits of the current word address
+//   din_bank                - BANK_XY_T1T2 while active, BANK_XY_ANY otherwise
+//   din_ena                 - high while in any TRIG/BUSY state
+//   din_reg_ena             - din_ena delayed by one clock
+//   din_addr_cnt            - cycle counter within the current column
+//   din_addr_cnt_last       - latched copy of index_last
+//   din_addr_cnt_lower_prev - previous-cycle value of din_addr_cnt[2:0]
+//   din_addr_cnt_upper_prev - previous-cycle value of the upper counter bits
+//
+module modexpng_mmm_din_addr
+(
+    clk, rst_n,
+    index_last,
+    fsm_state_next,
+    col_index_zero, col_index_next,
+    din_addr, din_bank, din_ena, din_reg_ena,
+    din_addr_cnt, din_addr_cnt_last,
+    din_addr_cnt_lower_prev, din_addr_cnt_upper_prev
+);
+
+
+    //
+    // Includes
+    //
+    `include "modexpng_parameters.vh"
+    //`include "modexpng_parameters_x8.vh"
+    `include "modexpng_mmm_fsm.vh"
+
+
+    //
+    // Parameters
+    //
+    parameter INDEX_WIDTH = 6;
+
+
+    //
+    // Ports
+    //
+    input                        clk;
+    input                        rst_n;
+    input  [    INDEX_WIDTH-1:0] index_last;
+    input  [FSM_STATE_WIDTH-1:0] fsm_state_next;
+    input  [    INDEX_WIDTH-4:0] col_index_zero;
+    input  [    INDEX_WIDTH-4:0] col_index_next;
+    output [    INDEX_WIDTH-4:0] din_addr;
+    output [              3-1:0] din_bank;
+    output [              1-1:0] din_ena;
+    output [              1-1:0] din_reg_ena;
+    output [    INDEX_WIDTH-1:0] din_addr_cnt;
+    output [    INDEX_WIDTH-1:0] din_addr_cnt_last;
+    output [              3-1:0] din_addr_cnt_lower_prev;
+    output [    INDEX_WIDTH-4:0] din_addr_cnt_upper_prev;
+
+
+    //
+    // Address
+    //
+    // din_addr_reg walks backwards through the operand and wraps from zero
+    // to din_addr_last; din_addr_cnt_reg simply counts cycles upwards.
+    //
+    reg  [INDEX_WIDTH-1:0] din_addr_reg;
+    wire [INDEX_WIDTH-1:0] din_addr_zero = {INDEX_WIDTH{1'b0}};
+    reg  [INDEX_WIDTH-1:0] din_addr_last;
+    wire [INDEX_WIDTH-1:0] din_addr_prev = (din_addr_reg == din_addr_zero) ? din_addr_last : din_addr_reg - 1'b1;
+
+    reg  [INDEX_WIDTH-1:0] din_addr_cnt_reg;
+    wire [INDEX_WIDTH-1:0] din_addr_cnt_zero = {INDEX_WIDTH{1'b0}};
+    wire [INDEX_WIDTH-1:0] din_addr_cnt_next = din_addr_cnt_reg + 1'b1;
+    reg  [INDEX_WIDTH-1:0] din_addr_cnt_last_reg;
+    wire [          3-1:0] din_addr_cnt_lower = din_addr_cnt_reg[          3-1:0];
+    wire [INDEX_WIDTH-4:0] din_addr_cnt_upper = din_addr_cnt_reg[INDEX_WIDTH-1:3];
+    reg  [          3-1:0] din_addr_cnt_lower_dly;
+    reg  [INDEX_WIDTH-4:0] din_addr_cnt_upper_dly;
+
+    reg  [          3-1:0] din_bank_reg;
+
+
+    //
+    // Enables
+    //
+    reg din_ena_reg     = 1'b0;
+    reg din_reg_ena_reg = 1'b0;
+
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n)
+            din_ena_reg <= 1'b0;
+        else case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                din_ena_reg <= 1'b1;
+            //
+            default:
+                din_ena_reg <= 1'b0;
+            //
+        endcase
+
+    // The register enable follows the read enable one clock later.
+    always @(posedge clk or negedge rst_n)
+        //
+        if (!rst_n)
+            din_reg_ena_reg <= 1'b0;
+        else
+            din_reg_ena_reg <= din_ena_reg;
+
+
+    //
+    // Address Mapping
+    //
+    assign din_addr = din_addr_reg[INDEX_WIDTH-1:3];
+
+    assign din_addr_cnt            = din_addr_cnt_reg;
+    assign din_addr_cnt_last       = din_addr_cnt_last_reg;
+    assign din_addr_cnt_lower_prev = din_addr_cnt_lower_dly;
+    assign din_addr_cnt_upper_prev = din_addr_cnt_upper_dly;
+
+    assign din_bank = din_bank_reg;
+
+
+    //
+    // Enable Mapping
+    //
+    assign din_ena     = din_ena_reg;
+    assign din_reg_ena = din_reg_ena_reg;
+
+
+    //
+    // Delay
+    //
+    always @(posedge clk) begin
+        din_addr_cnt_lower_dly <= din_addr_cnt_lower;
+        din_addr_cnt_upper_dly <= din_addr_cnt_upper;
+    end
+
+
+    //
+    // Address and Counter Update
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin
+                din_addr_reg          <= {col_index_zero, {3{1'b0}}};
+                din_addr_last         <= index_last;
+                din_addr_cnt_reg      <= din_addr_cnt_zero;
+                din_addr_cnt_last_reg <= index_last;
+            end
+            //
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG: begin
+                din_addr_reg          <= {col_index_next, {3{1'b0}}};
+                din_addr_cnt_reg      <= din_addr_cnt_zero;
+            end
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+                din_addr_reg          <= din_addr_prev;
+                din_addr_cnt_reg      <= din_addr_cnt_next;
+            end
+            //
+            //default:
+            //
+        endcase
+
+
+    //
+    // Bank Select
+    //
+    // Non-blocking assignments are used here like for every other register
+    // in this module: a blocking assignment in a clocked block can race
+    // with other clocked processes that sample din_bank in simulation.
+    //
+    always @(posedge clk)
+        //
+        case (fsm_state_next)
+            //
+            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+                din_bank_reg <= BANK_XY_T1T2;
+            //
+            default:
+                din_bank_reg <= BANK_XY_ANY;
+            //
+        endcase
+
+endmodule
diff --git a/rtl/modexpng_mmm_dout_addr.v b/rtl/modexpng_mmm_dout_addr.v
new file mode 100644
index 0000000..3749d82
--- /dev/null
+++ b/rtl/modexpng_mmm_dout_addr.v
@@ -0,0 +1,167 @@
+//
+// modexpng_mmm_dout_addr
+//
+// Write address, bank and per-slot enable generator for the X and Y output
+// sides during the load states (LOAD_T1T2_* and LOAD_NN_COEFF_*). Outside of
+// those states the enables are low, the banks are BANK_XY_ANY and the
+// addresses are don't-care.
+//
+// Only bits [INDEX_WIDTH-1:3] of load_xy_addr are used as the address; the
+// slot within a group of NUM_MULTS is selected by the one-hot enable vector.
+//
+module modexpng_mmm_dout_addr
+(
+ clk, rst_n,
+ //index_last,
+ fsm_state,
+ load_xy_addr,
+ load_addr_zero,
+ load_nn_coeff_addr_done,
+ /*
+
+ col_index_zero, col_index_next,*/
+ x_dout_addr, y_dout_addr,
+ x_dout_ena, y_dout_ena,
+ x_dout_bank, y_dout_bank
+
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ //input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+ input [INDEX_WIDTH:0] load_xy_addr; // address
+ input load_addr_zero;
+ input load_nn_coeff_addr_done;
+ //input [ INDEX_WIDTH-4:0] col_index_zero;
+ //input [ INDEX_WIDTH-4:0] col_index_next;
+ output [INDEX_WIDTH-4:0] x_dout_addr;
+ output [INDEX_WIDTH-4:0] y_dout_addr;
+
+ output [NUM_MULTS-1:0] x_dout_ena;
+ output [NUM_MULTS-1:0] y_dout_ena;
+
+ output [3-1:0] x_dout_bank;
+ output [3-1:0] y_dout_bank;
+
+
+ //
+ // Registers
+ //
+ reg [INDEX_WIDTH-4:0] x_dout_addr_reg; //clog2
+ reg [INDEX_WIDTH-4:0] y_dout_addr_reg; //clog2
+
+ reg [NUM_MULTS-1:0] x_dout_ena_reg = {NUM_MULTS{1'b0}};
+ reg [NUM_MULTS-1:0] y_dout_ena_reg = {NUM_MULTS{1'b0}};
+
+ // Internal one-hot slot selectors, copied to the output enable
+ // registers one state later.
+ reg [NUM_MULTS-1:0] x_dout_ena_int;
+ reg [NUM_MULTS-1:0] y_dout_ena_int;
+
+ reg [3-1:0] x_dout_bank_reg;
+ reg [3-1:0] y_dout_bank_reg;
+
+
+ //
+ // Mapping
+ //
+ assign x_dout_addr = x_dout_addr_reg;
+ assign y_dout_addr = y_dout_addr_reg;
+
+ assign x_dout_ena = x_dout_ena_reg;
+ assign y_dout_ena = y_dout_ena_reg;
+
+ assign x_dout_bank = x_dout_bank_reg;
+ assign y_dout_bank = y_dout_bank_reg;
+
+
+ //
+ // Address
+ //
+ // In LOAD_NN_COEFF_3 the address switches to BANK_XY_AUX_ADDR_N_COEFF
+ // once load_nn_coeff_addr_done is set. Outside of the two *_3 states the
+ // address is driven to X (don't-care).
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ x_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
+ y_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
+ y_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
+ end
+ //
+ default: begin
+ x_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
+ y_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
+ end
+ //
+ endcase
+
+ // Initial value of the slot selectors: only bit 0 set.
+ wire [NUM_MULTS-1:0] load_xy_ena_init = {{NUM_MULTS-1{1'b0}}, 1'b1};
+
+ //
+ // Slot Selectors
+ //
+ // Reloaded with the initial one-hot value when load_addr_zero is set,
+ // otherwise rotated left by one position.
+ // NOTE(review): in LOAD_NN_COEFF_2 only the X selector masks the
+ // wrapped-around bit with ~load_nn_coeff_addr_done; the Y selector
+ // rotates unconditionally. Confirm this asymmetry is intended.
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_2: begin
+ x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1]};
+ y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_2: begin
+ x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1] & ~load_nn_coeff_addr_done};
+ y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+ end
+ //
+ endcase
+
+
+ //
+ // Output Enables
+ //
+ // Driven from the slot selectors in the *_3 states, low otherwise.
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (!rst_n) begin
+ x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ end else case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3,
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_ena_reg <= x_dout_ena_int;
+ y_dout_ena_reg <= y_dout_ena_int;
+ end
+ //
+ default: begin
+ x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ end
+ //
+ endcase
+
+
+ //
+ // Bank Select
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ x_dout_bank_reg <= BANK_X_T1;
+ y_dout_bank_reg <= BANK_Y_T2;
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_X_N : BANK_XY_AUX;
+ y_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_Y_N_COEFF : BANK_XY_AUX;
+ end
+ //
+ default: begin
+ x_dout_bank_reg <= BANK_XY_ANY;
+ y_dout_bank_reg <= BANK_XY_ANY;
+ end
+ //
+ endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm.vh
new file mode 100644
index 0000000..c237a0b
--- /dev/null
+++ b/rtl/modexpng_mmm_fsm.vh
@@ -0,0 +1,24 @@
+//
+// State encodings of the Montgomery modular multiplier FSM. This header is
+// meant to be `included inside a module body, so that all modules decoding
+// fsm_state / fsm_state_next share the same constants.
+//
+// States are plain integers held in FSM_STATE_WIDTH-bit vectors; the
+// numbering has gaps (7..10 and 18..998 are unused).
+//
+localparam FSM_STATE_WIDTH = 32;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_IDLE = 0;
+
+// Loading of T1/T2 (three sub-states per step)
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_1 = 1;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_2 = 2;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_3 = 3;
+
+// Loading of N / N_COEFF (three sub-states per step)
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_1 = 4;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_2 = 5;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_3 = 6;
+
+// "Square" part of the multiplication, first column
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_INIT = 11;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_TRIG = 12;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_BUSY = 13;
+
+// "Square" part of the multiplication, subsequent columns
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_INIT = 14;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_TRIG = 15;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_BUSY = 16;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_HOLDOFF = 17;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_STOP = 999;
+ \ No newline at end of file
diff --git a/rtl/modexpng_mmm_pad.v b/rtl/modexpng_mmm_pad.v
new file mode 100644
index 0000000..a2a21ff
--- /dev/null
+++ b/rtl/modexpng_mmm_pad.v
@@ -0,0 +1,153 @@
+//
+// modexpng_mmm_pad -- operand scratchpad.
+//
+// Holds two memories (pad_x, pad_y), each WORD_WIDTH bits wide and addressed
+// with INDEX_WIDTH bits.
+//
+// Write side: load_x_din / load_y_din and load_xy_addr_lsb are registered and
+// written into the pads; the write enable is raised one clock after fsm_state
+// equals FSM_STATE_LOAD_T1T2_3 and is cleared by rst_n (active low).
+//
+// Read side: address and enable come straight from the pad_*_rd_addr and
+// pad_*_rd_ena inputs; data appears on pad_*_rd_dout.
+//
+module modexpng_mmm_pad
+(
+ clk, rst_n,
+ fsm_state,
+ load_xy_addr_lsb,
+ pad_x_rd_addr, pad_y_rd_addr,
+ pad_x_rd_ena, pad_y_rd_ena,
+ pad_x_rd_dout, pad_y_rd_dout,
+ load_x_din, load_y_din
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+
+ input [INDEX_WIDTH-1:0] load_xy_addr_lsb;
+
+ input [WORD_WIDTH-1:0] load_x_din;
+ input [WORD_WIDTH-1:0] load_y_din;
+
+ input [INDEX_WIDTH-1:0] pad_x_rd_addr;
+ input [INDEX_WIDTH-1:0] pad_y_rd_addr;
+
+ input pad_x_rd_ena;
+ input pad_y_rd_ena;
+
+ output [WORD_WIDTH-1:0] pad_x_rd_dout;
+ output [WORD_WIDTH-1:0] pad_y_rd_dout;
+
+
+ //
+ // Registers
+ //
+ reg [INDEX_WIDTH-1:0] pad_x_wr_addr;
+ reg [INDEX_WIDTH-1:0] pad_y_wr_addr;
+ reg pad_x_wr_ena;
+ reg pad_y_wr_ena;
+ reg [ WORD_WIDTH-1:0] pad_x_wr_din;
+ reg [ WORD_WIDTH-1:0] pad_y_wr_din;
+
+ bram_1wo_1ro_readfirst_ce #
+ (
+ .MEM_WIDTH (WORD_WIDTH),
+ .MEM_ADDR_BITS (INDEX_WIDTH)
+ )
+ pad_x
+ (
+ .clk (clk),
+
+ .a_addr (pad_x_wr_addr),
+ .a_en (pad_x_wr_ena),
+ .a_wr (pad_x_wr_ena),
+ .a_in (pad_x_wr_din),
+ .a_out (), // unused
+
+ .b_addr (pad_x_rd_addr),
+ .b_en (pad_x_rd_ena),
+ .b_out (pad_x_rd_dout)
+ );
+
+ bram_1wo_1ro_readfirst_ce #
+ (
+ .MEM_WIDTH (WORD_WIDTH),
+ .MEM_ADDR_BITS (INDEX_WIDTH)
+ )
+ pad_y
+ (
+ .clk (clk),
+
+ .a_addr (pad_y_wr_addr),
+ .a_en (pad_y_wr_ena),
+ .a_wr (pad_y_wr_ena),
+ .a_in (pad_y_wr_din),
+ .a_out (), // unused
+
+ .b_addr (pad_y_rd_addr),
+ .b_en (pad_y_rd_ena),
+ .b_out (pad_y_rd_dout)
+ );
+
+
+    //
+    // Write address: tracks the low bits of the load address while a T1/T2
+    // word is being accepted, don't-care in every other state.
+    //
+    always @(posedge clk)
+        if (fsm_state == FSM_STATE_LOAD_T1T2_3) begin
+            pad_y_wr_addr <= load_xy_addr_lsb;
+            pad_x_wr_addr <= load_xy_addr_lsb;
+        end else begin
+            pad_y_wr_addr <= {INDEX_WIDTH{1'bX}};
+            pad_x_wr_addr <= {INDEX_WIDTH{1'bX}};
+        end
+
+    //
+    // Write data: the load inputs are captured on every clock regardless of
+    // the FSM state; pad_x_wr_ena / pad_y_wr_ena decide whether the captured
+    // word is actually stored.
+    //
+    always @(posedge clk) begin
+        pad_y_wr_din <= load_y_din;
+        pad_x_wr_din <= load_x_din;
+    end
+
+
+    //
+    // Write enable: asynchronously cleared by rst_n, otherwise high for one
+    // clock after every FSM_STATE_LOAD_T1T2_3 cycle.
+    //
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n)
+            {pad_y_wr_ena, pad_x_wr_ena} <= 2'b00;
+        else if (fsm_state == FSM_STATE_LOAD_T1T2_3)
+            {pad_y_wr_ena, pad_x_wr_ena} <= 2'b11;
+        else
+            {pad_y_wr_ena, pad_x_wr_ena} <= 2'b00;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_transporter.v b/rtl/modexpng_mmm_transporter.v
new file mode 100644
index 0000000..a8f309a
--- /dev/null
+++ b/rtl/modexpng_mmm_transporter.v
@@ -0,0 +1,157 @@
+//
+// modexpng_mmm_transporter -- load address sequencer.
+//
+// All outputs are registers updated from fsm_state_next, so they line up
+// with the state the parent FSM is in:
+//   load_phase       0 in the LOAD_T1T2_* states, 1 in the LOAD_NN_COEFF_* states
+//   load_xy_addr     (INDEX_WIDTH+1)-bit word address, restarted at zero when a
+//                    phase is entered and incremented when it loops from _3 to _1
+//   load_xy_addr_vld high in the *_1 states
+//   load_xy_req      high in the *_2 states
+// The *_done flags compare load_xy_addr against last-address values that are
+// latched from index_last when ena is high in FSM_STATE_IDLE.
+//
+module modexpng_mmm_transporter
+(
+ clk,
+ ena,
+ index_last,
+ fsm_state,
+ fsm_state_next,
+ load_phase,
+ load_xy_addr,
+ load_xy_addr_vld,
+ load_xy_req,
+ load_addr_zero,
+ load_t1t2_addr_done,
+ load_nn_coeff_addr_done
+);
+
+
+ //
+ // Includes
+ //
+ //`include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input ena;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ output load_phase;
+ output [ INDEX_WIDTH:0] load_xy_addr;
+ output load_xy_addr_vld;
+ output load_xy_req;
+ output load_addr_zero;
+ output load_t1t2_addr_done;
+ output load_nn_coeff_addr_done;
+
+
+ //
+ // Load Address Generator
+ //
+ reg load_phase_reg;
+ reg [INDEX_WIDTH:0] load_xy_addr_reg;
+ reg load_xy_addr_vld_reg;
+ reg load_xy_req_reg;
+
+
+ //
+ // Mapping
+ //
+ assign load_phase = load_phase_reg;
+ assign load_xy_addr = load_xy_addr_reg;
+ assign load_xy_addr_vld = load_xy_addr_vld_reg;
+ assign load_xy_req = load_xy_req_reg;
+
+
+ //
+ // Handy Quantities
+ //
+ wire [INDEX_WIDTH:0] load_xy_addr_zero = {{INDEX_WIDTH{1'b0}}, 1'b0};
+ wire [INDEX_WIDTH:0] load_xy_addr_next = load_xy_addr_reg + 1'b1;
+ wire [INDEX_WIDTH:0] load_xy_addr_xxx = {{INDEX_WIDTH{1'bX}}, 1'bX};
+
+
+ //
+ // More Handy Quantities
+ //
+ reg [INDEX_WIDTH:0] load_t1t2_addr_last;
+ reg [INDEX_WIDTH:0] load_nn_coeff_addr_last;
+
+
+ //
+ // Flags
+ //
+ assign load_addr_zero = load_xy_addr_reg == load_xy_addr_zero;
+ assign load_t1t2_addr_done = load_xy_addr_reg == load_t1t2_addr_last;
+ assign load_nn_coeff_addr_done = load_xy_addr_reg == load_nn_coeff_addr_last;
+
+
+ //
+ // Last Index Latch
+ //
+    // Capture the end addresses when an operation is started from IDLE:
+    // the T1/T2 phase ends at index_last, the N/N_COEFF phase one word later.
+    always @(posedge clk)
+        if (fsm_state == FSM_STATE_IDLE)
+            if (ena) begin
+                load_nn_coeff_addr_last <= {1'b0, index_last} + 1'b1;
+                load_t1t2_addr_last     <= {1'b0, index_last};
+            end
+
+
+ //
+ // Update Load Phase
+ //
+    // Phase flag: 0 while the next state belongs to the T1/T2 load group,
+    // 1 while it belongs to the N/N_COEFF load group, don't-care otherwise.
+    always @(posedge clk)
+        if ((fsm_state_next == FSM_STATE_LOAD_T1T2_1) ||
+            (fsm_state_next == FSM_STATE_LOAD_T1T2_2) ||
+            (fsm_state_next == FSM_STATE_LOAD_T1T2_3))
+            load_phase_reg <= 1'b0;
+        else if ((fsm_state_next == FSM_STATE_LOAD_NN_COEFF_1) ||
+                 (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_2) ||
+                 (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_3))
+            load_phase_reg <= 1'b1;
+        else
+            load_phase_reg <= 1'bX;
+
+
+ //
+ // Update Load Address
+ //
+    // Address counter. On the way into a *_1 state the address is advanced
+    // when the FSM loops back from the matching *_3 state and restarted at
+    // zero when the phase is entered from anywhere else. It is held through
+    // the *_2 and *_3 states and is don't-care outside the load states.
+    always @(posedge clk)
+        if (fsm_state_next == FSM_STATE_LOAD_T1T2_1)
+            load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_T1T2_3) ? load_xy_addr_next : load_xy_addr_zero;
+        else if (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_1)
+            load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_NN_COEFF_3) ? load_xy_addr_next : load_xy_addr_zero;
+        else if ((fsm_state_next == FSM_STATE_LOAD_T1T2_2)     ||
+                 (fsm_state_next == FSM_STATE_LOAD_T1T2_3)     ||
+                 (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_2) ||
+                 (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_3))
+            load_xy_addr_reg <= load_xy_addr_reg;
+        else
+            load_xy_addr_reg <= load_xy_addr_xxx;
+
+
+    //
+    // Update Address Valid and Load Request Flags
+    //
+    always @(posedge clk) begin
+        // address valid: high while the FSM sits in the first step of a load cycle
+        if ((fsm_state_next == FSM_STATE_LOAD_T1T2_1) ||
+            (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_1))
+            load_xy_addr_vld_reg <= 1'b1;
+        else
+            load_xy_addr_vld_reg <= 1'b0;
+        // data request: high while the FSM sits in the second step of a load cycle
+        if ((fsm_state_next == FSM_STATE_LOAD_T1T2_2) ||
+            (fsm_state_next == FSM_STATE_LOAD_NN_COEFF_2))
+            load_xy_req_reg <= 1'b1;
+        else
+            load_xy_req_reg <= 1'b0;
+    end
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_x8_dual.v b/rtl/modexpng_mmm_x8_dual.v
new file mode 100644
index 0000000..99a37fa
--- /dev/null
+++ b/rtl/modexpng_mmm_x8_dual.v
@@ -0,0 +1,550 @@
+//
+// modexpng_mmm_x8_dual -- top level of the multiplier.
+//
+// Contains the control FSM and instantiates the column counter, the load
+// address sequencer, the input/output address generators, the operand
+// scratchpad and two MAC arrays (X and Y).
+//
+// An operation starts when ena is high in the idle state; rdy drops and is
+// raised again when the FSM passes through FSM_STATE_STOP. The {transfer, mode}
+// inputs select what is started: transfer = 0 starts the "square"
+// multiplication pass, {transfer, mode} = 2'b10 starts the operand load.
+//
+module modexpng_mmm_x8_dual
+(
+ clk, rst_n,
+ ena, rdy,
+ mode, transfer,
+ index_last,
+ x_din, y_din, x_dout, y_dout,
+ x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
+ x_din_ena, y_din_ena, x_dout_ena, y_dout_ena, x_din_reg_ena, y_din_reg_ena,
+ x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
+ load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
+ load_x_din, load_y_din
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+
+ input ena;
+ output rdy;
+
+ input mode; // multiply: 0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
+ // load/unload: 0 = load, 1 = unload
+ input transfer; // 0 = multiply, 1 = load/unload
+
+ input [INDEX_WIDTH-1:0] index_last;
+
+ input [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+ input [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+ output [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+ output [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+
+ output [INDEX_WIDTH-4:0] x_din_addr;
+ output [INDEX_WIDTH-4:0] y_din_addr;
+ output [INDEX_WIDTH-4:0] x_dout_addr;
+ output [INDEX_WIDTH-4:0] y_dout_addr;
+
+ output [ 1-1:0] x_din_ena;
+ output [ 1-1:0] y_din_ena;
+ output [NUM_MULTS-1:0] x_dout_ena;
+ output [NUM_MULTS-1:0] y_dout_ena;
+ output [ 1-1:0] x_din_reg_ena;
+ output [ 1-1:0] y_din_reg_ena;
+
+ output [3-1:0] x_din_bank;
+ output [3-1:0] y_din_bank;
+ output [3-1:0] x_dout_bank;
+ output [3-1:0] y_dout_bank;
+
+ output load_phase; // 0 = T1, T2; 1 = N, N_COEFF
+ output [ INDEX_WIDTH:0] load_xy_addr; // address
+ output load_xy_addr_vld; // address valid
+ output load_xy_req; // data request
+
+ input [WORD_WIDTH-1:0] load_x_din; // data input
+ input [WORD_WIDTH-1:0] load_y_din; // data input
+
+
+ //
+ // FSM State and Next States
+ //
+ reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+
+
+ //
+ // FSM Idle Next State
+ //
+ always @*
+ // Selects the state the FSM enters from IDLE when ena is seen, from {transfer, mode}.
+ // NOTE(review): for 2'b11 the FSM stays in IDLE, yet the ready logic clears rdy whenever ena is
+ // seen in IDLE and only sets it again in FSM_STATE_STOP -- confirm this combination cannot leave rdy low.
+ case ({transfer, mode})
+ 2'b00,
+ 2'b01: fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG; // multiply: start with the first column
+ 2'b10: fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1; // load: start with the T1/T2 phase
+ 2'b11: fsm_state_after_idle = FSM_STATE_IDLE; //unload?
+ endcase
+
+
+ //
+ // Column Counter
+ //
+ wire [ INDEX_WIDTH-4:0] col_index;
+ wire col_index_done;
+ wire [ INDEX_WIDTH-4:0] col_index_zero;
+ wire [ INDEX_WIDTH-4:0] col_index_next;
+ wire [ INDEX_WIDTH-4:0] col_index_prev;
+
+ modexpng_mmm_col_index #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ mmm_col_index
+ (
+ .clk (clk),
+ .index_last (index_last),
+ .fsm_state_next (fsm_state_next),
+ .col_index (col_index),
+ .col_index_done (col_index_done),
+ .col_index_zero (col_index_zero),
+ .col_index_next (col_index_next),
+ .col_index_prev (col_index_prev)
+ );
+
+
+ //
+ // Load Address Generator
+ //
+ wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0];
+ wire load_addr_zero;
+ wire load_t1t2_addr_done;
+ wire load_nn_coeff_addr_done;
+
+ modexpng_mmm_transporter #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ transporter
+ (
+ .clk (clk),
+ .ena (ena),
+ .index_last (index_last),
+ .fsm_state (fsm_state),
+ .fsm_state_next (fsm_state_next),
+ .load_phase (load_phase),
+ .load_xy_addr (load_xy_addr),
+ .load_xy_addr_vld (load_xy_addr_vld),
+ .load_xy_req (load_xy_req),
+ .load_addr_zero (load_addr_zero),
+ .load_t1t2_addr_done (load_t1t2_addr_done),
+ .load_nn_coeff_addr_done (load_nn_coeff_addr_done)
+ );
+
+
+ //
+ // X, Y Address
+ //
+ wire [INDEX_WIDTH-1:0] x_din_addr_cnt;
+ wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last;
+ wire [ 3-1:0] x_din_addr_cnt_lower_prev;
+ wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev;
+
+ modexpng_mmm_din_addr #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ din_addr_x
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .index_last (index_last),
+ .fsm_state_next (fsm_state_next),
+ .col_index_zero (col_index_zero),
+ .col_index_next (col_index_next),
+ .din_addr (x_din_addr),
+ .din_bank (x_din_bank),
+ .din_ena (x_din_ena),
+ .din_reg_ena (x_din_reg_ena),
+ .din_addr_cnt (x_din_addr_cnt),
+ .din_addr_cnt_last (x_din_addr_cnt_last),
+ .din_addr_cnt_lower_prev (x_din_addr_cnt_lower_prev),
+ .din_addr_cnt_upper_prev (x_din_addr_cnt_upper_prev)
+ );
+
+ modexpng_mmm_dout_addr #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ dout_addr_xy
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .fsm_state (fsm_state),
+ .load_xy_addr (load_xy_addr),
+ .load_addr_zero (load_addr_zero),
+ .load_nn_coeff_addr_done (load_nn_coeff_addr_done),
+ .x_dout_addr (x_dout_addr),
+ .y_dout_addr (y_dout_addr),
+ .x_dout_ena (x_dout_ena),
+ .y_dout_ena (y_dout_ena),
+ .x_dout_bank (x_dout_bank),
+ .y_dout_bank (y_dout_bank)
+ );
+
+
+ //
+ // Helper Memories ("Scratchpad")
+ //
+    // Read address and enable are shared by both pads; each pad has its own read data bus.
+    reg  [INDEX_WIDTH-1:0] pad_xy_rd_addr;
+    reg                    pad_xy_rd_ena = 1'b0;
+    wire [ WORD_WIDTH-1:0] pad_x_rd_dout;
+    wire [ WORD_WIDTH-1:0] pad_y_rd_dout;
+
+    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}};
+    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1;
+
+    // INDEX_WIDTH is forwarded explicitly, as for every other parameterised
+    // submodule here, so that overriding it on this module also resizes the
+    // pad's address ports instead of leaving them at the pad's own default.
+    modexpng_mmm_pad #
+    (
+        .INDEX_WIDTH(INDEX_WIDTH)
+    )
+    pad
+    (
+        .clk              (clk),
+        .rst_n            (rst_n),
+        .fsm_state        (fsm_state),
+        .load_xy_addr_lsb (load_xy_addr_lsb),
+        .load_x_din       (load_x_din),
+        .load_y_din       (load_y_din),
+        .pad_x_rd_addr    (pad_xy_rd_addr),
+        .pad_y_rd_addr    (pad_xy_rd_addr),
+        .pad_x_rd_ena     (pad_xy_rd_ena),
+        .pad_y_rd_ena     (pad_xy_rd_ena),
+        .pad_x_rd_dout    (pad_x_rd_dout),
+        .pad_y_rd_dout    (pad_y_rd_dout)
+    );
+
+
+    // Pad read enable: cleared by reset, high whenever the next state is one
+    // of the four "square" multiplication TRIG/BUSY states.
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n)
+            pad_xy_rd_ena <= 1'b0;
+        else if ((fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+                 (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG) ||
+                 (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            pad_xy_rd_ena <= 1'b1;
+        else
+            pad_xy_rd_ena <= 1'b0;
+
+
+    // Pad read address: restarted at zero on every TRIG state, incremented
+    // on every BUSY state, don't-care outside the multiplication states.
+    always @(posedge clk)
+        if ((fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+            (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG))
+            pad_xy_rd_addr <= pad_xy_rd_addr_zero;
+        else if ((fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            pad_xy_rd_addr <= pad_xy_rd_addr_next;
+        else
+            pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};
+
+
+
+
+
+ //
+ // Flags
+ //
+
+    // High when the X input address counter has reached its last value.
+    wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;
+
+    // State entered when a non-first column finishes: stop after the last
+    // column, otherwise trigger the next one.
+    // TODO: the last-column target is meant to become FSM_STATE_MULT_TRIANGLE_TRIG.
+    always @*
+        //
+        fsm_state_after_mult_square = col_index_done ? FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;
+
+
+ //
+ // MAC Arrays
+ //
+ reg mac_x_ce = 1'b0;
+ reg mac_x_ce_aux = 1'b0;
+ reg [NUM_MULTS -1:0] mac_x_clr;
+ reg mac_x_clr_aux;
+ reg [NUM_MULTS -2:0] mac_x_casc_a;
+ reg mac_x_casc_a_aux;
+ wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
+ reg [ 1 * WORD_WIDTH -1:0] mac_x_a_aux;
+ //wire [ 1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1];
+ reg [ 1 * WORD_WIDTH -1:0] mac_x_b;
+ wire [NUM_MULTS * MAC_WIDTH -1:0] mac_x_p;
+ wire [ 1 * MAC_WIDTH -1:0] mac_x_p_aux;
+
+ reg mac_y_ce = 1'b0;
+ reg mac_y_ce_aux = 1'b0;
+ reg [NUM_MULTS -1:0] mac_y_clr;
+ reg mac_y_clr_aux;
+ reg [NUM_MULTS -2:0] mac_y_casc_a;
+ reg mac_y_casc_a_aux;
+ wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
+ reg [ 1 * WORD_WIDTH -1:0] mac_y_a_aux;
+ //wire [ 1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1];
+ reg [ 1 * WORD_WIDTH -1:0] mac_y_b;
+ wire [NUM_MULTS * MAC_WIDTH -1:0] mac_y_p;
+ wire [ 1 * MAC_WIDTH -1:0] mac_y_p_aux;
+
+ modexpng_mac_array mac_array_x
+ (
+ .clk (clk),
+ .ce (mac_x_ce),
+ .ce_aux (mac_x_ce_aux),
+ .clr (mac_x_clr),
+ .clr_aux (mac_x_clr_aux),
+ .casc_a (mac_x_casc_a),
+ .casc_a_aux (mac_x_casc_a_aux),
+ .a_in (mac_x_a),
+ .a_in_aux (mac_x_a_aux),
+ .b_in (mac_x_b),
+ .p_out (mac_x_p),
+ .p_out_aux (mac_x_p_aux)
+ );
+
+ modexpng_mac_array mac_array_y
+ (
+ .clk (clk),
+ .ce (mac_y_ce),
+ .ce_aux (mac_y_ce_aux),
+ .clr (mac_y_clr),
+ .clr_aux (mac_y_clr_aux),
+ .casc_a (mac_y_casc_a),
+ .casc_a_aux (mac_y_casc_a_aux),
+ .a_in (mac_y_a),
+ .a_in_aux (mac_y_a_aux),
+ .b_in (mac_y_b),
+ .p_out (mac_y_p),
+ .p_out_aux (mac_y_p_aux)
+ );
+
+ genvar gen_z;
+
+ generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+ begin : gen_xy_din
+ // Splits x_din into NUM_MULTS words of WORD_WIDTH bits and connects word gen_z
+ // to the matching slice of the A input bus of the X MAC array.
+ // (x_dout / y_dout are mapped separately in the gen_xy_dout block.)
+ assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
+
+ // NOTE(review): mac_y_a has no driver anywhere in this module, so the A input of mac_array_y
+ // is left undriven -- confirm whether it is meant to be sliced from y_din here.
+ end
+ endgenerate
+
+
+ //
+ // MAC Clock Enable Logic
+ //
+    // Two-stage enable: mac_xy_ce_adv follows the four "square" TRIG/BUSY
+    // states by one clock, and the X and Y clock enables follow it by one
+    // more. All three registers are cleared asynchronously by rst_n.
+    reg mac_xy_ce_adv = 1'b0;
+
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n) begin
+            mac_xy_ce_adv <= 1'b0;
+            mac_x_ce      <= 1'b0;
+            mac_y_ce      <= 1'b0;
+        end else begin
+            mac_x_ce <= mac_xy_ce_adv;
+            mac_y_ce <= mac_xy_ce_adv;
+            if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+                (fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_TRIG) ||
+                (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+                mac_xy_ce_adv <= 1'b1;
+            else
+                mac_xy_ce_adv <= 1'b0;
+        end
+
+
+ //
+ // MAC Clear Logic
+ //
+    // Per-multiplier clear mask for the current position, computed from the
+    // previous column index and the previous X address counter halves.
+    wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
+        calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev);
+
+    reg [NUM_MULTS-1:0] mac_xy_clr_adv;
+
+    // Two-stage clear: every accumulator is cleared on a TRIG state, the
+    // computed mask is used on a BUSY state, don't-care otherwise. The X and
+    // Y clear inputs receive the same value one clock later.
+    always @(posedge clk) begin
+        mac_x_clr <= mac_xy_clr_adv;
+        mac_y_clr <= mac_xy_clr_adv;
+        if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+            (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_TRIG))
+            mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
+        else if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            mac_xy_clr_adv <= calc_mac_x_clear_square_value;
+        else
+            mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
+    end
+
+
+ //
+ // MAC Cascade Logic
+ //
+    reg [NUM_MULTS-2:0] mac_xy_casc_a_adv;
+
+    // Two-stage cascade select: all zeros on a TRIG state, all ones on a
+    // BUSY state, don't-care otherwise. The X and Y cascade inputs receive
+    // the same value one clock later.
+    always @(posedge clk) begin
+        mac_x_casc_a <= mac_xy_casc_a_adv;
+        mac_y_casc_a <= mac_xy_casc_a_adv;
+        if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+            (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_TRIG))
+            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
+        else if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+                 (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
+            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
+        else
+            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
+    end
+
+
+
+    //
+    // DOUT
+    //
+    // One WORD_WIDTH register per multiplier lane. Declared ahead of the
+    // mapping below because an identifier has to be declared before it is read.
+    reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
+    reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];
+
+
+    //
+    // DOUT Mapping
+    //
+    // Packs the per-lane registers into the flat x_dout / y_dout buses,
+    // lane gen_z occupying bits [gen_z*WORD_WIDTH +: WORD_WIDTH].
+    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+        begin : gen_xy_dout
+            assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+            assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+        end
+    endgenerate
+
+
+
+
+    integer int_z;
+
+    // Output data registers: during the third step of either load cycle the
+    // incoming load word is copied into every lane; in any other state the
+    // lanes are don't-care.
+    always @(posedge clk)
+        for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1)
+            if ((fsm_state == FSM_STATE_LOAD_T1T2_3) ||
+                (fsm_state == FSM_STATE_LOAD_NN_COEFF_3)) begin
+                y_dout_reg[int_z] <= load_y_din;
+                x_dout_reg[int_z] <= load_x_din;
+            end else begin
+                y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+                x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+            end
+
+
+
+ //
+ // FSM Process
+ //
+    // State register: asynchronously forced to IDLE while rst_n is low,
+    // otherwise loaded with the computed next state on every clock.
+    always @(posedge clk or negedge rst_n) begin
+        if (!rst_n)
+            fsm_state <= FSM_STATE_IDLE;
+        else
+            fsm_state <= fsm_state_next;
+    end
+
+
+ //
+ // FSM Transition Logic
+ //
+ always @* begin
+ // Default: any state without a case item below (including the unused codes) falls back to IDLE.
+ fsm_state_next = FSM_STATE_IDLE;
+ //
+ case (fsm_state)
+ FSM_STATE_IDLE: fsm_state_next = ena ? fsm_state_after_idle : FSM_STATE_IDLE;
+ // T1/T2 load: three steps per word, repeated until load_t1t2_addr_done, then on to the N/N_COEFF load.
+ FSM_STATE_LOAD_T1T2_1: fsm_state_next = FSM_STATE_LOAD_T1T2_2 ;
+ FSM_STATE_LOAD_T1T2_2: fsm_state_next = FSM_STATE_LOAD_T1T2_3 ;
+ FSM_STATE_LOAD_T1T2_3: fsm_state_next = load_t1t2_addr_done ? FSM_STATE_LOAD_NN_COEFF_1 : FSM_STATE_LOAD_T1T2_1;
+ // N/N_COEFF load: same pattern, repeated until load_nn_coeff_addr_done, then STOP.
+ FSM_STATE_LOAD_NN_COEFF_1: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2 ;
+ FSM_STATE_LOAD_NN_COEFF_2: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3 ;
+ FSM_STATE_LOAD_NN_COEFF_3: fsm_state_next = load_nn_coeff_addr_done ? FSM_STATE_STOP : FSM_STATE_LOAD_NN_COEFF_1;
+ // "Square" pass: each column is one TRIG cycle followed by BUSY until mult_square_addr_done.
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+ // The commented-out items below are for passes that are not implemented in this module yet.
+ /*
+ FSM_STATE_TRIANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_0_BUSY ;
+ FSM_STATE_TRIANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG : FSM_STATE_TRIANGLE_COL_0_BUSY;
+ FSM_STATE_TRIANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_N_BUSY ;
+ FSM_STATE_TRIANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle : FSM_STATE_TRIANGLE_COL_N_BUSY;
+
+ FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_0_BUSY ;
+ FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY;
+ FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_N_BUSY ;
+ FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_rectangle : FSM_STATE_RECTANGLE_COL_N_BUSY;
+ */
+ // STOP lasts a single cycle.
+ FSM_STATE_STOP: fsm_state_next = FSM_STATE_IDLE ;
+
+ endcase
+ //
+ end
+
+
+ //
+ // Ready Output
+ //
+    // rdy is high after reset, dropped when ena is seen in IDLE and raised
+    // again when the FSM is in STOP; it holds its value in every other state.
+    reg rdy_reg = 1'b1;
+    assign rdy = rdy_reg;
+
+    always @(posedge clk or negedge rst_n)
+        if (!rst_n)
+            rdy_reg <= 1'b1;
+        else if (fsm_state == FSM_STATE_IDLE) begin
+            if (ena)
+                rdy_reg <= 1'b0;
+        end else if (fsm_state == FSM_STATE_STOP)
+            rdy_reg <= 1'b1;
+
+ // Returns the per-multiplier clear mask for the "square" pass.
+ // When the upper part of the (delayed) X address counter equals the (delayed) column index,
+ // the result is one-hot with the bit selected by the lower three counter bits set;
+ // otherwise the result is all zeros.
+ // NOTE: the one-hot constants are written as 8-bit literals, i.e. this assumes NUM_MULTS == 8.
+ function [ NUM_MULTS-1:0] calc_mac_clear_square;
+ input [INDEX_WIDTH-4:0] col_index_delayed;
+ input [ 3-1:0] x_din_addr_cnt_lower_delayed;
+ input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed;
+ begin
+ if (x_din_addr_cnt_upper_delayed == col_index_delayed)
+ case (x_din_addr_cnt_lower_delayed)
+ 3'b000: calc_mac_clear_square = 8'b00000001;
+ 3'b001: calc_mac_clear_square = 8'b00000010;
+ 3'b010: calc_mac_clear_square = 8'b00000100;
+ 3'b011: calc_mac_clear_square = 8'b00001000;
+ 3'b100: calc_mac_clear_square = 8'b00010000;
+ 3'b101: calc_mac_clear_square = 8'b00100000;
+ 3'b110: calc_mac_clear_square = 8'b01000000;
+ 3'b111: calc_mac_clear_square = 8'b10000000;
+ endcase
+ else
+ calc_mac_clear_square = {NUM_MULTS{1'b0}};
+ end
+ endfunction
+
+
+endmodule
diff --git a/rtl/modexpng_parameters.vh b/rtl/modexpng_parameters.vh
new file mode 100644
index 0000000..f846119
--- /dev/null
+++ b/rtl/modexpng_parameters.vh
@@ -0,0 +1,39 @@
+//localparam WORD_WIDTH = 17;
+//localparam MAC_WIDTH = 47;
+// NOTE(review): WORD_WIDTH and MAC_WIDTH are commented out above, yet modules that include this file (e.g. modexpng_mmm_pad) use WORD_WIDTH -- confirm where they are meant to be defined.
+//localparam BANK_ADDR_WIDTH = 3; // TODO: Replace everywhere!
+// 3-bit bank codes carrying the BANK_FAT_ prefix.
+localparam [2:0] BANK_FAT_T1T2 = 3'd0;
+localparam [2:0] BANK_FAT_ABL = 3'd1;
+localparam [2:0] BANK_FAT_ABH = 3'd2;
+localparam [2:0] BANK_FAT_Q = 3'd3;
+localparam [2:0] BANK_FAT_Q_EXT = 3'd4;
+localparam [2:0] BANK_FAT_ML = 3'd5;
+localparam [2:0] BANK_FAT_MH = 3'd6;
+localparam [2:0] BANK_FAT_MH_EXT = 3'd7;
+// 2-bit bank codes carrying the BANK_SLIM_ prefix.
+localparam [1:0] BANK_SLIM_T1T2 = 2'd0;
+localparam [1:0] BANK_SLIM_N = 2'd1;
+localparam [1:0] BANK_SLIM_N_COEFF = 2'd2;
+localparam [1:0] BANK_SLIM_N_COEFF_EXT = 2'd3;
+// NOTE(review): every BANK_X_* / BANK_Y_* / BANK_XY_* name below is commented out, but modexpng_mmm_dout_addr
+// still refers to BANK_X_T1, BANK_Y_T2, BANK_X_N, BANK_Y_N_COEFF, BANK_XY_AUX and BANK_XY_ANY -- confirm.
+//localparam BANK_Y_T2 = 3'd0;
+//localparam BANK_XY_T1T2 = 3'd0;
+
+//localparam BANK_XY_AB_LSB = 3'd1;
+//localparam BANK_XY_AB_MSB = 3'd2;
+
+//localparam BANK_X_N = 3'd3;
+//localparam BANK_Y_N_COEFF = 3'd3;
+
+//localparam BANK_XY_M = 3'd4;
+
+//localparam BANK_XY_Q_LSB = 3'd5;
+//localparam BANK_XY_Q_MSB = 3'd6;
+
+//localparam BANK_XY_AUX = 3'd7;
+
+//localparam BANK_XY_ANY = 3'bXXX;
+
+//localparam BANK_XY_AUX_ADDR_N_COEFF = 0;
diff --git a/rtl/modexpng_parameters_x8.vh b/rtl/modexpng_parameters_x8.vh
new file mode 100644
index 0000000..8734354
--- /dev/null
+++ b/rtl/modexpng_parameters_x8.vh
@@ -0,0 +1 @@
+localparam NUM_MULTS = 8;
diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v
new file mode 100644
index 0000000..db4774b
--- /dev/null
+++ b/rtl/modexpng_part_recombinator.v
@@ -0,0 +1,623 @@
+module modexpng_part_recombinator
+(
+ clk,
+ rdy,
+ fsm_state_next,
+ index_last,
+ dsp_x_ce_p, dsp_y_ce_p,
+ ena_x, ena_y,
+ dsp_x_p, dsp_y_p,
+ col_index, col_index_last, slim_bram_xy_addr,
+ fat_bram_xy_bank, fat_bram_xy_addr, fat_bram_x_dout, fat_bram_y_dout, fat_bram_xy_dout_valid
+);
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_mmm_fsm.vh"
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+
+
+ input clk;
+ output rdy;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ input [7:0] index_last;
+ input dsp_x_ce_p;
+ input dsp_y_ce_p;
+ input ena_x;
+ input ena_y;
+ input [8*47-1:0] dsp_x_p;
+ input [8*47-1:0] dsp_y_p;
+ input [ 4:0] col_index;
+ input [ 4:0] col_index_last;
+ input [ 7:0] slim_bram_xy_addr;
+
+ output [ 2:0] fat_bram_xy_bank;
+ output [ 7:0] fat_bram_xy_addr;
+ output [ 17:0] fat_bram_x_dout;
+ output [ 17:0] fat_bram_y_dout;
+ output fat_bram_xy_dout_valid;
+
+
+ //
+ // Latches
+ //
+ reg [1*47-1:0] dsp_x_p_latch[0:7];
+ reg [1*47-1:0] dsp_y_p_latch[0:7];
+
+
+ //
+ // Mapping
+ //
+ wire [46:0] dsp_x_p_split[0:7];
+ wire [46:0] dsp_y_p_split[0:7];
+
+ genvar z;
+ generate for (z=0; z<NUM_MULTS; z=z+1)
+ begin : gen_dsp_xy_p_split
+ assign dsp_x_p_split[z] = dsp_x_p[47*z+:47];
+ assign dsp_y_p_split[z] = dsp_y_p[47*z+:47];
+ end
+ endgenerate
+
+
+ //
+ // Delays
+ //
+ reg dsp_y_ce_p_dly1 = 1'b0;
+ reg dsp_x_ce_p_dly1 = 1'b0;
+
+ always @(posedge clk) begin
+ //
+ {dsp_y_ce_p_dly1, dsp_x_ce_p_dly1} <= {dsp_y_ce_p, dsp_x_ce_p};
+ //
+ end
+
+
+ //
+ // Registers
+ //
+
+ // valid
+ reg x_valid_lsb = 1'b0;
+ reg y_valid_lsb = 1'b0;
+ reg x_valid_msb = 1'b0;
+ reg y_valid_msb = 1'b0;
+
+ // bitmap
+ reg [7:0] x_bitmap_lsb = {8{1'b0}};
+ reg [7:0] y_bitmap_lsb = {8{1'b0}};
+ reg [7:0] x_bitmap_msb = {8{1'b0}};
+ reg [7:0] y_bitmap_msb = {8{1'b0}};
+
+ // index
+ reg [2:0] x_index_lsb = 3'dX;
+ reg [2:0] y_index_lsb = 3'dX;
+
+ // purge
+ reg x_purge_lsb = 1'b0;
+ reg y_purge_lsb = 1'b0;
+ reg x_purge_msb = 1'b0;
+ reg y_purge_msb = 1'b0;
+
+ // valid - latch
+ reg x_valid_latch_lsb = 1'b0;
+ reg y_valid_latch_lsb = 1'b0;
+
+ // bitmap - latch
+ reg [7:0] x_bitmap_latch_lsb = {8{1'b0}};
+ reg [7:0] y_bitmap_latch_lsb = {8{1'b0}};
+ reg [7:0] x_bitmap_latch_msb = {8{1'b0}};
+ reg [7:0] y_bitmap_latch_msb = {8{1'b0}};
+
+ // index - latch
+ reg [2:0] x_index_latch_lsb = 3'dX;
+ reg [2:0] y_index_latch_lsb = 3'dX;
+
+ // purge - index
+ reg x_purge_latch_lsb = 1'b0;
+ reg y_purge_latch_lsb = 1'b0;
+ reg x_purge_latch_msb = 1'b0;
+ reg y_purge_latch_msb = 1'b0;
+
+ //
+ reg xy_valid_lsb_adv[1:6];
+ reg xy_valid_msb_adv[1:6];
+ reg [7:0] xy_bitmap_lsb_adv[1:6];
+ reg [7:0] xy_bitmap_msb_adv[1:6];
+ reg [2:0] xy_index_lsb_adv[1:6];
+ reg [2:0] xy_index_msb_adv[1:6];
+ reg xy_purge_lsb_adv[1:6];
+ reg xy_purge_msb_adv[1:6];
+
+
+    // Power-up values for the advance pipeline. The arrays are indexed
+    // [1:6], so the loop has to include stage 6: that stage is copied into
+    // stage 5 on the very first clock edge, and leaving it uninitialised
+    // would push an unknown value through the whole pipeline in simulation.
+    integer i;
+    initial for (i=1; i<=6; i=i+1) begin
+        xy_valid_lsb_adv[i]  = 1'b0;
+        xy_valid_msb_adv[i]  = 1'b0;
+        xy_bitmap_lsb_adv[i] = {8{1'b0}};
+        xy_bitmap_msb_adv[i] = {8{1'b0}};
+        xy_index_lsb_adv[i]  = 3'dX;
+        xy_index_msb_adv[i]  = 3'dX;
+        xy_purge_lsb_adv[i]  = 1'b0;
+        xy_purge_msb_adv[i]  = 1'b0;
+    end
+
+    // Returns 1 when the upper five address bits equal the column index,
+    // 0 otherwise. col_index_last_value is accepted but not used.
+    function [0:0] calc_square_valid_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            calc_square_valid_lsb = 1'b0;
+            if (slim_bram_xy_addr_value[7:3] == col_index_value)
+                calc_square_valid_lsb = 1'b1;
+        end
+    endfunction
+
+    // Returns a one-hot byte selected by the lower three address bits when
+    // the upper five address bits equal the column index, all zeros
+    // otherwise. col_index_last_value is accepted but not used.
+    function [7:0] calc_square_bitmap_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            if (slim_bram_xy_addr_value[7:3] == col_index_value) begin
+                case (slim_bram_xy_addr_value[2:0])
+                    3'd0: calc_square_bitmap_lsb = 8'h01;
+                    3'd1: calc_square_bitmap_lsb = 8'h02;
+                    3'd2: calc_square_bitmap_lsb = 8'h04;
+                    3'd3: calc_square_bitmap_lsb = 8'h08;
+                    3'd4: calc_square_bitmap_lsb = 8'h10;
+                    3'd5: calc_square_bitmap_lsb = 8'h20;
+                    3'd6: calc_square_bitmap_lsb = 8'h40;
+                    3'd7: calc_square_bitmap_lsb = 8'h80;
+                endcase
+            end else begin
+                calc_square_bitmap_lsb = 8'h00;
+            end
+        end
+    endfunction
+
+    // Returns the lower three address bits as a lane index when the upper
+    // five address bits equal the column index, don't-care otherwise.
+    // col_index_last_value is accepted but not used.
+    function [2:0] calc_square_index_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            if (slim_bram_xy_addr_value[7:3] == col_index_value) begin
+                case (slim_bram_xy_addr_value[2:0])
+                    3'd0: calc_square_index_lsb = 3'b000;
+                    3'd1: calc_square_index_lsb = 3'b001;
+                    3'd2: calc_square_index_lsb = 3'b010;
+                    3'd3: calc_square_index_lsb = 3'b011;
+                    3'd4: calc_square_index_lsb = 3'b100;
+                    3'd5: calc_square_index_lsb = 3'b101;
+                    3'd6: calc_square_index_lsb = 3'b110;
+                    3'd7: calc_square_index_lsb = 3'b111;
+                endcase
+            end else begin
+                calc_square_index_lsb = 3'dX;
+            end
+        end
+    endfunction
+
+    // Returns 1 only when the upper five address bits equal both the
+    // current column index and the last column index; 0 when they do not
+    // select the current column.
+    function calc_square_purge_lsb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        begin
+            calc_square_purge_lsb = 1'b0;
+            if (slim_bram_xy_addr_value[7:3] == col_index_value)
+                calc_square_purge_lsb = slim_bram_xy_addr_value[7:3] == col_index_last_value;
+        end
+    endfunction
+
+    // Returns 1 when the full address equals the last index, 0 otherwise.
+    // Both column index inputs are accepted but not used.
+    function calc_square_valid_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            calc_square_valid_msb = 1'b0;
+            if (slim_bram_xy_addr_value == index_last_value)
+                calc_square_valid_msb = 1'b1;
+        end
+    endfunction
+
+    // Returns all zeros unless the full address equals the last index. In
+    // that case bits 6:0 are set and bit 7 is set only when the current
+    // column is not the last column.
+    function [7:0] calc_square_bitmap_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            calc_square_bitmap_msb = 8'b00000000;
+            if (slim_bram_xy_addr_value == index_last_value)
+                calc_square_bitmap_msb = {col_index_value != col_index_last_value, 7'b1111111};
+        end
+    endfunction
+
+    // Returns 1 when the full address equals the last index and the current
+    // column is the last column, 0 when the address does not match.
+    function calc_square_purge_msb;
+        input [4:0] col_index_value;
+        input [4:0] col_index_last_value;
+        input [7:0] slim_bram_xy_addr_value;
+        input [7:0] index_last_value;
+        begin
+            calc_square_purge_msb = 1'b0;
+            if (slim_bram_xy_addr_value == index_last_value)
+                calc_square_purge_msb = col_index_value == col_index_last_value;
+        end
+    endfunction
+
+
+ reg recomb_lsb_ce = 1'b0;
+ reg [ 2:0] recomb_lsb_ce_purge = 3'b000;
+ wire recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_purge[0];
+ reg recomb_lsb_clr;
+ reg recomb_lsb_vld = 1'b0;
+
+ reg [46:0] recomb_lsb_din;
+ wire [15:0] recomb_lsb_dout;
+
+ reg recomb_msb_ce = 1'b0;
+ reg [ 1:0] recomb_msb_ce_purge = 2'b00;
+ wire recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0];
+ reg recomb_msb_clr;
+ reg recomb_msb_vld = 1'b0;
+
+ always @(posedge clk)
+ //
+ {recomb_msb_vld, recomb_lsb_vld} <= {recomb_msb_ce_combined, recomb_lsb_ce_combined};
+
+ reg [46:0] recomb_msb_din;
+ wire [15:0] recomb_msb_dout;
+
+ modexpng_recombinator_block recomb_x_lsb
+ (
+ .clk (clk),
+ .ce (recomb_lsb_ce_combined),
+ .clr (recomb_lsb_clr),
+ .din (recomb_lsb_din),
+ .dout (recomb_lsb_dout)
+ );
+
+ modexpng_recombinator_block recomb_x_msb
+ (
+ .clk (clk),
+ .ce (recomb_msb_ce_combined),
+ .clr (recomb_msb_clr),
+ .din (recomb_msb_din),
+ .dout (recomb_msb_dout)
+ );
+
+    // Clock enables for the two recombinator blocks.
+    // The plain enables follow the latched valid flag (LSB) and bit 0 of the
+    // latched MSB bitmap. The purge shift registers are loaded with all ones
+    // when a purge is requested and then shift a zero in from the top, so
+    // bit 0 stays high for as many clocks as the register is wide.
+    // Every register is written with non-blocking assignments only, so other
+    // processes clocked on the same edge always see the previous value.
+    always @(posedge clk) begin
+        //
+        recomb_lsb_ce <= x_valid_latch_lsb;
+        recomb_msb_ce <= x_bitmap_latch_msb[0];
+        //
+        if (x_purge_latch_lsb)
+            recomb_lsb_ce_purge <= 3'b111;
+        else
+            recomb_lsb_ce_purge <= {1'b0, recomb_lsb_ce_purge[2:1]};
+        //
+        // MSB purge starts when only bit 0 of the latched bitmap is left set
+        if (x_purge_latch_msb && x_bitmap_latch_msb[0] && !x_bitmap_latch_msb[1])
+            recomb_msb_ce_purge <= 2'b11;
+        else
+            recomb_msb_ce_purge <= {1'b0, recomb_msb_ce_purge[1]};
+        //
+    end
+
+
+    // Recombinator clear flags: both are set while ena_x and ena_y are high
+    // together; afterwards each flag is dropped the first time its own
+    // clock enable is seen and otherwise keeps its value.
+    always @(posedge clk) begin
+        if (ena_x & ena_y)
+            recomb_lsb_clr <= 1'b1;
+        else if (recomb_lsb_ce)
+            recomb_lsb_clr <= 1'b0;
+        //
+        if (ena_x & ena_y)
+            recomb_msb_clr <= 1'b1;
+        else if (recomb_msb_ce)
+            recomb_msb_clr <= 1'b0;
+    end
+
+    // Recombinator data inputs, zero whenever the matching qualifier is low.
+    always @(posedge clk) begin
+        // LSB side: the latched product picked by the latched lane index
+        if (x_valid_latch_lsb)
+            recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb];
+        else
+            recomb_lsb_din <= {47{1'b0}};
+        // MSB side: always lane 0 of the latched products
+        if (x_bitmap_latch_msb[0])
+            recomb_msb_din <= dsp_x_p_latch[0];
+        else
+            recomb_msb_din <= {47{1'b0}};
+    end
+
+
+
+    // Entry stage (index 6) of the advance pipeline. While the next state is
+    // one of the four "square" TRIG/BUSY states the flags are computed from
+    // the column index, the last column index, the address and the last
+    // index; in every other state the stage is filled with idle values.
+    always @(posedge clk)
+        if ((fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
+            (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG) ||
+            (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
+            (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_BUSY)) begin
+            // LSB side
+            xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr);
+            xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr);
+            xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr);
+            xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
+            // MSB side
+            xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+            xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, index_last);
+            xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+        end else begin
+            // LSB side
+            xy_valid_lsb_adv [6] <= 1'b0;
+            xy_bitmap_lsb_adv[6] <= {8{1'b0}};
+            xy_index_lsb_adv [6] <= 3'dX;
+            xy_purge_lsb_adv [6] <= 1'b0;
+            // MSB side
+            xy_valid_msb_adv [6] <= 1'b0;
+            xy_bitmap_msb_adv[6] <= {8{1'b0}};
+            xy_purge_msb_adv [6] <= 1'b0;
+        end
+
+
+ always @(posedge clk) begin
+ // Stage 1 of the advance pipeline is copied into the X and Y working registers (same value for both).
+ {y_valid_lsb, x_valid_lsb} <= {2{xy_valid_lsb_adv [1]}};
+ {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}};
+ {y_index_lsb, x_index_lsb} <= {2{xy_index_lsb_adv [1]}};
+ {y_purge_lsb, x_purge_lsb} <= {2{xy_purge_lsb_adv [1]}};
+ // LSB latch registers trail the working registers by one clock.
+ {y_valid_latch_lsb, x_valid_latch_lsb} <= {y_valid_lsb, x_valid_lsb};
+ {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb};
+ {y_index_latch_lsb, x_index_latch_lsb} <= {y_index_lsb, x_index_lsb};
+ {y_purge_latch_lsb, x_purge_latch_lsb} <= {y_purge_lsb, x_purge_lsb};
+ // MSB working registers, also taken from stage 1.
+ {y_valid_msb, x_valid_msb} <= {2{xy_valid_msb_adv[1]}};
+ {y_bitmap_msb, x_bitmap_msb} <= {2{xy_bitmap_msb_adv[1]}};
+ {y_purge_msb, x_purge_msb} <= {2{xy_purge_msb_adv[1]}};
+ // X MSB latch: loaded while x_valid_msb is high, otherwise the bitmap is shifted right by one with a zero fill (the purge latch then holds).
+ if (x_valid_msb) begin
+ x_bitmap_latch_msb <= x_bitmap_msb;
+ x_purge_latch_msb <= x_purge_msb;
+ end else begin
+ x_bitmap_latch_msb <= {1'b0, x_bitmap_latch_msb[7:1]};
+ end
+ //
+ // Advance pipeline shift: stage i takes the value of stage i+1 for i = 1..5 (stage 6 is loaded by a separate process).
+ for (i=1; i<6; i=i+1) begin
+ xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
+ xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
+ xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
+ xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
+ //
+ xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
+ xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
+ xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
+ end
+ //
+ end
+
+    //
+    // Storage for the eight dsp_x_p_split words.
+    //
+    // Shifting has priority: while bit 1 of x_bitmap_latch_msb is set, every
+    // entry takes the value of its upper neighbour and the top entry becomes
+    // don't-care. Otherwise, when dsp_x_ce_p_dly1 is set, an entry is
+    // reloaded from dsp_x_p_split if its bit is set in x_bitmap_lsb, or in
+    // x_bitmap_msb while x_valid_msb is set; all other entries hold.
+    //
+    always @(posedge clk) begin
+        //
+        if (x_bitmap_latch_msb[1]) begin // only shift 7 times
+            //
+            for (i=0; i<8; i=i+1)
+                if (i == 7) dsp_x_p_latch[i] <= {47{1'bX}};
+                else        dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
+            //
+        end else if (dsp_x_ce_p_dly1) begin
+            //
+            for (i=0; i<8; i=i+1)
+                if (x_bitmap_lsb[i] || (x_valid_msb && x_bitmap_msb[i]))
+                    dsp_x_p_latch[i] <= dsp_x_p_split[i];
+            //
+        end
+        //
+    end
+
+    //
+    // Recombinator output valid flags: the combined clock enables delayed by
+    // one clock cycle.
+    //
+    reg recomb_x_lsb_dout_valid = 1'b0,
+        recomb_x_msb_dout_valid = 1'b0;
+
+    always @(posedge clk) begin
+        recomb_x_msb_dout_valid <= recomb_msb_ce_combined;
+        recomb_x_lsb_dout_valid <= recomb_lsb_ce_combined;
+    end
+
+
+
+    //
+    // Registers behind the fat_bram_* nets (bank, address, data, valid)
+    //
+    reg [ 2:0] fat_bram_xy_bank_reg;
+    reg [ 7:0] fat_bram_xy_addr_reg;
+    reg [17:0] fat_bram_x_dout_reg,
+               fat_bram_y_dout_reg;
+    reg        fat_bram_xy_dout_valid_reg = 1'b0;
+
+    assign fat_bram_xy_dout_valid = fat_bram_xy_dout_valid_reg;
+    assign fat_bram_xy_bank       = fat_bram_xy_bank_reg;
+    assign fat_bram_xy_addr       = fat_bram_xy_addr_reg;
+    assign fat_bram_x_dout        = fat_bram_x_dout_reg;
+    assign fat_bram_y_dout        = fat_bram_y_dout_reg;
+
+    //
+    // Word counters for the LSB and MSB recombinator outputs
+    //
+    reg [ 7:0] fat_bram_xy_cnt_lsb,
+               fat_bram_xy_cnt_msb;
+
+    //
+    // Two-entry store for the first MSB output words
+    //
+    reg [15:0] recomb_msb_dout_carry_0,
+               recomb_msb_dout_carry_1;
+
+    //
+    // Three-entry delay line for MSB output words and their counter values
+    //
+    reg [15:0] recomb_msb_dout_delay_0,
+               recomb_msb_dout_delay_1,
+               recomb_msb_dout_delay_2;
+
+    reg [ 7:0] recomb_msb_cnt_delay_0 = 8'd0,
+               recomb_msb_cnt_delay_1 = 8'd0,
+               recomb_msb_cnt_delay_2 = 8'd0;
+
+    //
+    // Ready flag: rdy follows rdy_adv with one cycle of delay and is forced
+    // low while ena_x and ena_y are asserted together.
+    //
+    reg rdy_adv = 1'b1;
+    reg rdy_reg = 1'b1;
+
+    assign rdy = rdy_reg;
+
+
+    always @(posedge clk) begin
+        //
+        rdy_reg <= rdy_adv;
+        //
+        if (ena_x & ena_y)
+            rdy_reg <= 1'b0;
+        //
+    end
+
+    //
+    // Writer for the fat BRAM outputs.
+    //
+    // Asserting ena_x and ena_y together clears rdy_adv and both word
+    // counters. Otherwise the action depends on which recombinator outputs
+    // are valid in the current cycle:
+    //
+    //   2'b00 (none)     - while recomb_msb_cnt_delay_2 is non-zero, the delay
+    //                      line is drained: its oldest word is stored in
+    //                      BANK_FAT_ABH at the address kept next to it and
+    //                      rdy_adv is raised once the next entry to be drained
+    //                      has a zero counter; otherwise the outputs are idle
+    //   2'b01 (LSB only) - the LSB word is stored in BANK_FAT_ABL at
+    //                      fat_bram_xy_cnt_lsb
+    //   2'b10 (MSB only) - the first two MSB words are only kept in the carry
+    //                      registers, later words are stored in BANK_FAT_ABH
+    //                      at fat_bram_xy_cnt_msb
+    //   2'b11 (both)     - the LSB word is stored (see below) and the MSB
+    //                      word is pushed into the delay line together with
+    //                      the current value of fat_bram_xy_cnt_msb
+    //
+    // fat_bram_y_dout_reg is only written (with don't-care) in the idle branch.
+    //
+    always @(posedge clk)
+        //
+        if (ena_x & ena_y) begin
+            rdy_adv             <= 1'b0;
+            fat_bram_xy_cnt_lsb <= 8'd0;
+            fat_bram_xy_cnt_msb <= 8'd0;
+        end else begin
+            //
+            case ({recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid})
+                //
+                2'b00: begin
+                    //
+                    if (recomb_msb_cnt_delay_2 > 8'd0) begin
+                        //
+                        // the entry that moves into slot 2 now has a zero
+                        // counter, so nothing is left to drain after this cycle
+                        //
+                        rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0;
+                        //
+                        // shift the delay line by one entry (the data registers
+                        // are 16 bits wide, so the filler is 16 bits too)
+                        //
+                        recomb_msb_dout_delay_0 <= {16{1'bX}};
+                        recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+                        recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+                        //
+                        recomb_msb_cnt_delay_0 <= 8'd0;
+                        recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+                        recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+                        //
+                        // explicit zero-extension to 18 bits, same as in the
+                        // other branches
+                        //
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg       <= recomb_msb_cnt_delay_2;
+                        fat_bram_x_dout_reg        <= {2'b00, recomb_msb_dout_delay_2};
+//                      fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                    end else begin
+                        //
+                        fat_bram_xy_bank_reg       <= 3'bXXX;
+                        fat_bram_xy_addr_reg       <= 8'hXX;
+                        fat_bram_x_dout_reg        <= {18{1'bX}};
+                        fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b0;
+                        //
+                    end
+                    //
+                end
+                //
+                2'b01: begin
+                    //
+                    fat_bram_xy_bank_reg       <= BANK_FAT_ABL;
+                    fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_lsb;
+                    fat_bram_x_dout_reg        <= {2'b00, recomb_lsb_dout};
+//                  fat_bram_y_dout_reg
+                    fat_bram_xy_dout_valid_reg <= 1'b1;
+                    //
+                    fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+                    //
+                end
+                //
+                2'b10: begin
+                    //
+                    if (fat_bram_xy_cnt_msb < 8'd2) begin
+                        //
+                        // words 0 and 1 are not written, they are kept to be
+                        // added to LSB words in the 2'b11 branch
+                        //
+                        recomb_msb_dout_carry_0 <= recomb_msb_dout;
+                        recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+                        //
+                        fat_bram_xy_bank_reg       <= 3'bXXX;
+                        fat_bram_xy_addr_reg       <= 8'hXX;
+                        fat_bram_x_dout_reg        <= {18{1'bX}};
+                        // fat_bram_y_dout_reg
+                        fat_bram_xy_dout_valid_reg <= 1'b0;
+                        //
+                    end else begin
+                        //
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_msb;
+                        fat_bram_x_dout_reg        <= {2'b00, recomb_msb_dout};
+                        // fat_bram_y_dout_reg
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                    end
+                    //
+                    fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+                    //
+                end
+                //
+                2'b11: begin
+                    //
+                    if (fat_bram_xy_cnt_lsb == index_last) begin
+                        //
+                        // word at index_last: stored unmodified in BANK_FAT_ABL,
+                        // the LSB counter wraps to zero
+                        //
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABL;
+                        fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_lsb;
+                        fat_bram_x_dout_reg        <= {2'b00, recomb_lsb_dout};
+//                      fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                        fat_bram_xy_cnt_lsb <= 8'd0;
+                        //
+                    end else begin
+                        //
+                        // any other word: the oldest carry word is added
+                        // (17-bit sum) and the result goes to BANK_FAT_ABH
+                        //
+                        fat_bram_xy_bank_reg       <= BANK_FAT_ABH;
+                        fat_bram_xy_addr_reg       <= fat_bram_xy_cnt_lsb;
+                        fat_bram_x_dout_reg        <= {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}};
+//                      fat_bram_y_dout_reg        <= {18{1'bX}};
+                        fat_bram_xy_dout_valid_reg <= 1'b1;
+                        //
+                        fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+                        //
+                        recomb_msb_dout_carry_0 <= {16{1'bX}};
+                        recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+                        //
+                    end
+                    //
+                    // the MSB word cannot be written in the same cycle, so it
+                    // enters the delay line together with its address
+                    //
+                    recomb_msb_dout_delay_0 <= recomb_msb_dout;
+                    recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+                    recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+                    //
+                    recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb;
+                    recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+                    recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+                    //
+                    fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+                    //
+                end
+                //
+            endcase
+            //
+        end
+
+
+
+
+endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
new file mode 100644
index 0000000..efe0ac5
--- /dev/null
+++ b/rtl/modexpng_recombinator_block.v
@@ -0,0 +1,35 @@
+//
+// Recombinator block.
+//
+// Every enabled clock cycle a 47-bit input word is split into a low 16-bit
+// field, a middle 16-bit field and a high 15-bit field. The high field is
+// stored, the middle field is added to the stored high field of the previous
+// word, and the low field is added to the previous middle sum plus the two
+// overflow bits of the previous low sum. The lower 16 bits of the low sum are
+// the output word. When clr is set, the middle and low registers are loaded
+// with the plain input fields instead of the sums.
+//
+module modexpng_recombinator_block
+(
+    clk,
+    ce, clr,
+    din, dout
+);
+
+    input         clk;
+    input         ce;       // registers only change while this is set
+    input         clr;      // load input fields without accumulating
+    input  [46:0] din;
+    output [15:0] dout;
+
+    //
+    // input fields
+    //
+    wire [15:0] din_lo  = din[15: 0];
+    wire [15:0] din_mid = din[31:16];
+    wire [14:0] din_hi  = din[46:32];   // TODO: maybe determine more precise bound here
+
+    //
+    // accumulators (the middle and low ones carry 1 and 2 extra bits)
+    //
+    reg [14:0] hi_reg;
+    reg [16:0] mid_reg;
+    reg [17:0] lo_reg;
+
+    //
+    // sums used when clr is not set
+    //
+    wire [16:0] mid_sum = {1'b0, din_mid} + {2'b00, hi_reg};
+    wire [17:0] lo_sum  = {2'b00, din_lo} + {1'b0, mid_reg} + {{16{1'b0}}, lo_reg[17:16]};
+
+    assign dout = lo_reg[15:0];
+
+    always @(posedge clk)
+        //
+        if (ce) begin
+            hi_reg  <= din_hi;
+            mid_reg <= clr ? {1'b0, din_mid} : mid_sum;
+            lo_reg  <= clr ? {2'b00, din_lo} : lo_sum;
+        end
+
+endmodule