aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2019-10-01 15:01:43 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2019-10-01 15:01:43 +0300
commit29fb6afd018c601a2e0c7376656d5e37beb565d6 (patch)
treedc11ee0c8e5a30113052254be23594da74a8a572
parentec07464d239f7f6379a682ac57b58b863d3f0374 (diff)
Started working on the pipelined Montgomery modular multiplier. Currently can
do the "square" part of the multiplication, i.e. compute the twice larger intermediate product AB = A * B.
-rw-r--r--bench/tb_mmm_x8_dual.v327
-rw-r--r--bench/tb_square.v716
-rw-r--r--rtl/dev/temp.txt384
-rw-r--r--rtl/dsp/dsp_array.v111
-rw-r--r--rtl/dsp/dsp_slice.v125
-rw-r--r--rtl/modexpng_mac.v54
-rw-r--r--rtl/modexpng_mac_array.v116
-rw-r--r--rtl/modexpng_mem.v93
-rw-r--r--rtl/modexpng_mmm_col_index.v90
-rw-r--r--rtl/modexpng_mmm_din_addr.v167
-rw-r--r--rtl/modexpng_mmm_dout_addr.v167
-rw-r--r--rtl/modexpng_mmm_fsm.vh24
-rw-r--r--rtl/modexpng_mmm_pad.v153
-rw-r--r--rtl/modexpng_mmm_transporter.v157
-rw-r--r--rtl/modexpng_mmm_x8_dual.v550
-rw-r--r--rtl/modexpng_parameters.vh39
-rw-r--r--rtl/modexpng_parameters_x8.vh1
-rw-r--r--rtl/modexpng_part_recombinator.v623
-rw-r--r--rtl/modexpng_recombinator_block.v35
19 files changed, 3932 insertions, 0 deletions
diff --git a/bench/tb_mmm_x8_dual.v b/bench/tb_mmm_x8_dual.v
new file mode 100644
index 0000000..aa25900
--- /dev/null
+++ b/bench/tb_mmm_x8_dual.v
@@ -0,0 +1,327 @@
+`timescale 1ns / 1ps
+
+module tb_mmm_x8_dual;
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+
+
+ //
+ // Settings
+ //
+ localparam INDEX_WIDTH = 6;
+
+ wire [INDEX_WIDTH-1:0] index_last = 31; // 512 bits
+
+
+ //
+ // Clock
+ //
+ `define CLK_FREQUENCY_MHZ 100.0
+ `define CLK_PERIOD_NS (1000.0 / `CLK_FREQUENCY_MHZ)
+ `define CLK_PERIOD_HALF_NS (0.5 * `CLK_PERIOD_NS)
+
+ reg clk = 1'b0;
+
+ // Free-running clock. clk is declared with an initial value of 0, so toggling
+ // it every half period gives the same rise-then-fall sequence.
+ always
+     #`CLK_PERIOD_HALF_NS clk = ~clk;
+
+
+ //
+ // Reset
+ //
+ reg rst = 1'b1;
+ wire rst_n = ~rst;
+
+
+ //
+ // Control
+ //
+ reg ena = 1'b0;
+ wire rdy;
+
+ reg mode;
+ reg transfer;
+
+
+ //
+ // Interface
+ //
+
+
+ //
+ // Interface - Data Buses
+ //
+ wire [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+ wire [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+ wire [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+ wire [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+
+
+ //
+ // Interface - Address Buses
+ //
+ wire [INDEX_WIDTH-4:0] x_din_addr;
+ wire [INDEX_WIDTH-4:0] y_din_addr;
+ wire [INDEX_WIDTH-4:0] x_dout_addr;
+ wire [INDEX_WIDTH-4:0] y_dout_addr;
+
+
+ //
+ // Interface - Enable Buses
+ //
+ wire [ 1-1:0] x_din_ena;
+ wire [ 1-1:0] y_din_ena;
+ wire [ 1-1:0] x_din_reg_ena;
+ wire [ 1-1:0] y_din_reg_ena;
+ wire [NUM_MULTS-1:0] x_dout_ena;
+ wire [NUM_MULTS-1:0] y_dout_ena;
+
+
+ //
+ // Interface - Bank Buses
+ //
+ wire [3-1:0] x_din_bank;
+ wire [3-1:0] y_din_bank;
+ wire [3-1:0] x_dout_bank;
+ wire [3-1:0] y_dout_bank;
+
+
+ //
+ // Operands
+ //
+ reg [WORD_WIDTH-1:0] T1[0:2**INDEX_WIDTH-1];
+ reg [WORD_WIDTH-1:0] T2[0:2**INDEX_WIDTH-1];
+ reg [WORD_WIDTH-1:0] N[0:2**INDEX_WIDTH-1];
+ reg [WORD_WIDTH-1:0] N_COEFF[0:2**INDEX_WIDTH];
+
+
+ //
+ // Memories
+ //
+ genvar z;
+ generate for (z=0; z<NUM_MULTS; z=z+1)
+ // One X memory and one Y memory is instantiated per index z.
+ begin : gen_z_mem
+ // X memory: port A is written by the UUT, port B is read back by the UUT.
+ modexpng_mem /*bram_1wo_1ro_readfirst_ce*/ #
+ (
+ .MEM_WIDTH(WORD_WIDTH),
+ .MEM_ADDR_BITS(INDEX_WIDTH) // - clog2(NUM_MULTS) + clog2(NUM_BANKS)
+ )
+ gen_z_mem_x
+ (
+ .clk (clk),
+
+ .a_addr ({x_dout_bank, x_dout_addr}), // bank bits form the upper part of the address
+ .a_en (x_dout_ena[z]), // enable and write strobe share the per-z x_dout_ena bit
+ .a_wr (x_dout_ena[z]),
+ .a_in (x_dout[z*WORD_WIDTH+:WORD_WIDTH]), // slice z of the packed write bus
+ .a_out (), // unused
+
+ .b_addr ({x_din_bank, x_din_addr}),
+ .b_en (x_din_ena), // read enables are common to all z
+ .b_reg_en (x_din_reg_ena),
+ .b_out (x_din[z*WORD_WIDTH+:WORD_WIDTH]) // slice z of the packed read bus
+ );
+ // Y memory: same wiring as the X memory, using the y_* buses.
+ modexpng_mem /*bram_1wo_1ro_readfirst_ce*/ #
+ (
+ .MEM_WIDTH(WORD_WIDTH),
+ .MEM_ADDR_BITS(INDEX_WIDTH) // - clog2(NUM_MULTS) + clog2(NUM_BANKS)
+ )
+ gen_z_mem_y
+ (
+ .clk (clk),
+
+ .a_addr ({y_dout_bank, y_dout_addr}),
+ .a_en (y_dout_ena[z]),
+ .a_wr (y_dout_ena[z]),
+ .a_in (y_dout[z*WORD_WIDTH+:WORD_WIDTH]),
+ .a_out (), // unused
+
+ .b_addr ({y_din_bank, y_din_addr}),
+ .b_en (y_din_ena),
+ .b_reg_en (y_din_reg_ena),
+ .b_out (y_din[z*WORD_WIDTH+:WORD_WIDTH])
+ );
+ //
+ end
+ //
+ endgenerate
+
+
+ // T1 / T2
+ // N / N_COEFF
+ // AB_LSB
+ // AB_MSB
+ // M
+ // Q_LSB
+ // Q_MSB
+ // ?
+
+
+ //
+ // Operands - Values
+ //
+ initial begin
+ //
+ T1[ 0] = 18'h0b27b; T1[ 1] = 18'h0fc7d; T1[ 2] = 18'h0a214; T1[ 3] = 18'h08d2b;
+ T1[ 4] = 18'h1c80c; T1[ 5] = 18'h145f1; T1[ 6] = 18'h00db6; T1[ 7] = 18'h1cf0f;
+ T1[ 8] = 18'h19386; T1[ 9] = 18'h02ad9; T1[10] = 18'h1a8b5; T1[11] = 18'h1479b;
+ T1[12] = 18'h08b5f; T1[13] = 18'h14806; T1[14] = 18'h0e6f7; T1[15] = 18'h0ce9d;
+ T1[16] = 18'h0cbc2; T1[17] = 18'h16ef1; T1[18] = 18'h0e14e; T1[19] = 18'h1796f;
+ T1[20] = 18'h14901; T1[21] = 18'h06666; T1[22] = 18'h0cb9f; T1[23] = 18'h09ab4;
+ T1[24] = 18'h12ffc; T1[25] = 18'h0a86d; T1[26] = 18'h19d35; T1[27] = 18'h0cda9;
+ T1[28] = 18'h16a19; T1[29] = 18'h09a36; T1[30] = 18'h0b176; T1[31] = 18'h0e0dc;
+ //
+ T2[ 0] = 18'h0b21a; T2[ 1] = 18'h13e71; T2[ 2] = 18'h03459; T2[ 3] = 18'h1063f;
+ T2[ 4] = 18'h18cef; T2[ 5] = 18'h1b8a5; T2[ 6] = 18'h082d1; T2[ 7] = 18'h1b1be;
+ T2[ 8] = 18'h18979; T2[ 9] = 18'h1409a; T2[10] = 18'h1713c; T2[11] = 18'h0cda3;
+ T2[12] = 18'h11c7d; T2[13] = 18'h0c943; T2[14] = 18'h12d7c; T2[15] = 18'h1531e;
+ T2[16] = 18'h0a45a; T2[17] = 18'h1c637; T2[18] = 18'h0906a; T2[19] = 18'h1670e;
+ T2[20] = 18'h12f78; T2[21] = 18'h08ce6; T2[22] = 18'h1c5c7; T2[23] = 18'h1292d;
+ T2[24] = 18'h0fc4b; T2[25] = 18'h064fb; T2[26] = 18'h0cc3c; T2[27] = 18'h19b37;
+ T2[28] = 18'h1b721; T2[29] = 18'h0f424; T2[30] = 18'h0f608; T2[31] = 18'h03e9b;
+ //
+ N[ 0] = 18'h00a9d; N[ 1] = 18'h01175; N[ 2] = 18'h0254f; N[ 3] = 18'h0ee38;
+ N[ 4] = 18'h00a6a; N[ 5] = 18'h0c7bd; N[ 6] = 18'h0ddac; N[ 7] = 18'h069fe;
+ N[ 8] = 18'h0e9d6; N[ 9] = 18'h0b6bf; N[10] = 18'h09230; N[11] = 18'h04fc5;
+ N[12] = 18'h05c9f; N[13] = 18'h09502; N[14] = 18'h0cbc5; N[15] = 18'h03109;
+ N[16] = 18'h08029; N[17] = 18'h0b27c; N[18] = 18'h0eeb8; N[19] = 18'h0c191;
+ N[20] = 18'h0ff86; N[21] = 18'h027ab; N[22] = 18'h07d76; N[23] = 18'h0ff1a;
+ N[24] = 18'h02afc; N[25] = 18'h0b25a; N[26] = 18'h0d3c1; N[27] = 18'h05589;
+ N[28] = 18'h09f7c; N[29] = 18'h0ddd6; N[30] = 18'h0b4fc; N[31] = 18'h0e8e7;
+ //
+ N_COEFF[ 0] = 18'h0344b; N_COEFF[ 1] = 18'h0ca66; N_COEFF[ 2] = 18'h0d9e8; N_COEFF[ 3] = 18'h070d5;
+ N_COEFF[ 4] = 18'h0ce4b; N_COEFF[ 5] = 18'h049b2; N_COEFF[ 6] = 18'h0abb3; N_COEFF[ 7] = 18'h0c3b2;
+ N_COEFF[ 8] = 18'h0ad38; N_COEFF[ 9] = 18'h05672; N_COEFF[10] = 18'h0fd47; N_COEFF[11] = 18'h06671;
+ N_COEFF[12] = 18'h00b7f; N_COEFF[13] = 18'h0fa35; N_COEFF[14] = 18'h0d4ac; N_COEFF[15] = 18'h0f1ca;
+ N_COEFF[16] = 18'h08e0a; N_COEFF[17] = 18'h05858; N_COEFF[18] = 18'h02dc6; N_COEFF[19] = 18'h08cfc;
+ N_COEFF[20] = 18'h01941; N_COEFF[21] = 18'h0f855; N_COEFF[22] = 18'h01e43; N_COEFF[23] = 18'h053f0;
+ N_COEFF[24] = 18'h0a479; N_COEFF[25] = 18'h0ae7e; N_COEFF[26] = 18'h05c66; N_COEFF[27] = 18'h02413;
+ N_COEFF[28] = 18'h0b5f8; N_COEFF[29] = 18'h0eb06; N_COEFF[30] = 18'h0de5b; N_COEFF[31] = 18'h0a751;
+ N_COEFF[32] = 18'h0c1ec;
+ //
+ end
+
+
+ //
+ // Load Interface
+ //
+ wire load_phase;
+ wire [ INDEX_WIDTH:0] load_xy_addr;
+ wire load_xy_addr_vld;
+ wire load_xy_req;
+ reg [ WORD_WIDTH-1:0] load_x_din;
+ reg [ WORD_WIDTH-1:0] load_y_din;
+ reg [ WORD_WIDTH-1:0] load_x_pipe;
+ reg [ WORD_WIDTH-1:0] load_y_pipe;
+
+ always @(posedge clk)
+ // Operand lookup: while load_xy_addr_vld is high, fetch the X/Y words selected by load_xy_addr.
+ if (load_xy_addr_vld) begin
+
+ if (!load_phase) begin // load_phase low: T1 feeds the X pipe, T2 feeds the Y pipe
+ load_x_pipe <= T1[load_xy_addr];
+ load_y_pipe <= T2[load_xy_addr];
+ end else begin // load_phase high: N feeds the X pipe, N_COEFF feeds the Y pipe
+ load_x_pipe <= !load_xy_addr[INDEX_WIDTH] ? N[load_xy_addr] : {WORD_WIDTH{1'bX}}; // N is declared one entry shorter than N_COEFF; X is driven once the top address bit is set
+ load_y_pipe <= N_COEFF[load_xy_addr];
+ end
+ end
+
+ always @(posedge clk)
+ // Second load stage: forward the looked-up words to the UUT while load_xy_req is high.
+ if (load_xy_req)
+ {load_y_din, load_x_din} <= {load_y_pipe, load_x_pipe};
+ else
+ {load_y_din, load_x_din} <= {2*WORD_WIDTH{1'bX}}; // all-X when not requested (presumably to expose unrequested reads in simulation)
+
+
+ //
+ // UUT
+ //
+ modexpng_mmm_x8_dual #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ uut
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+
+ .ena (ena),
+ .rdy (rdy),
+
+ .mode (mode),
+ .transfer (transfer),
+
+ .index_last (index_last),
+
+ .x_din (x_din),
+ .y_din (y_din),
+ .x_dout (x_dout),
+ .y_dout (y_dout),
+
+ .x_din_addr (x_din_addr),
+ .y_din_addr (y_din_addr),
+ .x_dout_addr (x_dout_addr),
+ .y_dout_addr (y_dout_addr),
+
+ .x_din_ena (x_din_ena),
+ .y_din_ena (y_din_ena),
+ .x_dout_ena (x_dout_ena),
+ .y_dout_ena (y_dout_ena),
+
+ .x_din_reg_ena (x_din_reg_ena),
+ .y_din_reg_ena (y_din_reg_ena),
+
+ .x_din_bank (x_din_bank),
+ .y_din_bank (y_din_bank),
+ .x_dout_bank (x_dout_bank),
+ .y_dout_bank (y_dout_bank),
+
+ .load_phase (load_phase),
+ .load_xy_addr (load_xy_addr),
+ .load_xy_addr_vld (load_xy_addr_vld),
+ .load_xy_req (load_xy_req),
+ .load_x_din (load_x_din),
+ .load_y_din (load_y_din)
+ );
+
+
+ //
+ // Script
+ //
+ initial begin
+ #(100.0*`CLK_PERIOD_NS) rst = 1'b0; // release reset after 100 clock periods
+ #(100.0*`CLK_PERIOD_NS) ena = 1'b1; // first operation: one-period ena pulse with transfer=1, mode=0
+ transfer = 1'b1;
+ mode = 1'b0;
+ #( 1.0*`CLK_PERIOD_NS) ena = 1'b0;
+ transfer = 1'bX; // control inputs are driven X outside the ena pulse
+ mode = 1'bX;
+
+ while (!rdy) #`CLK_PERIOD_NS; // poll rdy once per clock period
+
+ #(100.0*`CLK_PERIOD_NS) ena = 1'b1; // second operation: transfer=0, mode=0
+ transfer = 1'b0;
+ mode = 1'b0;
+ #( 1.0*`CLK_PERIOD_NS) ena = 1'b0;
+ transfer = 1'bX;
+ mode = 1'bX;
+
+ while (!rdy) #`CLK_PERIOD_NS; // wait for the second operation to report rdy
+
+ end
+
+
+endmodule
+
diff --git a/bench/tb_square.v b/bench/tb_square.v
new file mode 100644
index 0000000..61e5d8a
--- /dev/null
+++ b/bench/tb_square.v
@@ -0,0 +1,716 @@
+`timescale 1ns / 1ps
+
+module tb_square;
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+ `include "../rtl/modexpng_mmm_fsm.vh"
+
+
+ //
+ // Clock
+ //
+ `define CLK_FREQUENCY_MHZ 100.0
+ `define CLK_PERIOD_NS (1000.0 / `CLK_FREQUENCY_MHZ)
+ `define CLK_PERIOD_HALF_NS (0.5 * `CLK_PERIOD_NS)
+
+ reg clk = 1'b0;
+
+ // Free-running clock. clk is declared with an initial value of 0, so toggling
+ // it every half period gives the same rise-then-fall sequence.
+ always
+     #`CLK_PERIOD_HALF_NS clk = ~clk;
+
+
+ //
+ // Reset
+ //
+ reg rst = 1'b1;
+
+
+
+ //
+ // T1, T2
+ //
+ reg [17:0] T1[0:31];
+ reg [17:0] T2[0:31];
+ reg [17:0] AB[0:63];
+
+
+ //
+ // Init
+ //
+ initial begin
+ //
+ T1[ 0] = 18'h0f13e; T1[ 1] = 18'h0daf6; T1[ 2] = 18'h0aaa9; T1[ 3] = 18'h0c2c2;
+ T1[ 4] = 18'h0fc5f; T1[ 5] = 18'h12164; T1[ 6] = 18'h14375; T1[ 7] = 18'h15615;
+ T1[ 8] = 18'h0d8e2; T1[ 9] = 18'h0ec15; T1[10] = 18'h17c46; T1[11] = 18'h0c922;
+ T1[12] = 18'h08f00; T1[13] = 18'h152f9; T1[14] = 18'h0b0b6; T1[15] = 18'h0ce87;
+ T1[16] = 18'h178f2; T1[17] = 18'h09efb; T1[18] = 18'h0409d; T1[19] = 18'h11104;
+ T1[20] = 18'h0b4a6; T1[21] = 18'h158a6; T1[22] = 18'h0514e; T1[23] = 18'h0ec55;
+ T1[24] = 18'h11e73; T1[25] = 18'h11ddd; T1[26] = 18'h07bd4; T1[27] = 18'h0638b;
+ T1[28] = 18'h0e805; T1[29] = 18'h11c4f; T1[30] = 18'h0a2eb; T1[31] = 18'h05454;
+ //
+ T2[ 0] = 18'h1a479; T2[ 1] = 18'h102f5; T2[ 2] = 18'h10e72; T2[ 3] = 18'h120b1;
+ T2[ 4] = 18'h169cd; T2[ 5] = 18'h1d0c4; T2[ 6] = 18'h11462; T2[ 7] = 18'h12015;
+ T2[ 8] = 18'h16fca; T2[ 9] = 18'h1044f; T2[10] = 18'h122b4; T2[11] = 18'h10a5a;
+ T2[12] = 18'h12620; T2[13] = 18'h0e01a; T2[14] = 18'h095cd; T2[15] = 18'h1278a;
+ T2[16] = 18'h10763; T2[17] = 18'h09fe7; T2[18] = 18'h0d35c; T2[19] = 18'h10e24;
+ T2[20] = 18'h1527d; T2[21] = 18'h115b3; T2[22] = 18'h05443; T2[23] = 18'h1190a;
+ T2[24] = 18'h0fcc3; T2[25] = 18'h115e2; T2[26] = 18'h0a398; T2[27] = 18'h0608d;
+ T2[28] = 18'h13075; T2[29] = 18'h0d816; T2[30] = 18'h0bb4c; T2[31] = 18'h04e8a;
+ //
+ AB[ 0] = 18'h0be4e; AB[ 1] = 18'h0fed7; AB[ 2] = 18'h09496; AB[ 3] = 18'h07181;
+ AB[ 4] = 18'h0ee73; AB[ 5] = 18'h04692; AB[ 6] = 18'h0141a; AB[ 7] = 18'h0078c;
+ AB[ 8] = 18'h030eb; AB[ 9] = 18'h0217c; AB[10] = 18'h0696f; AB[11] = 18'h0a165;
+ AB[12] = 18'h0b753; AB[13] = 18'h04af9; AB[14] = 18'h0ed7c; AB[15] = 18'h079ce;
+ AB[16] = 18'h0e863; AB[17] = 18'h097df; AB[18] = 18'h07984; AB[19] = 18'h048af;
+ AB[20] = 18'h0197f; AB[21] = 18'h0206a; AB[22] = 18'h027e7; AB[23] = 18'h04b3a;
+ AB[24] = 18'h03312; AB[25] = 18'h03b56; AB[26] = 18'h04487; AB[27] = 18'h0bd6a;
+ AB[28] = 18'h04e4b; AB[29] = 18'h069ca; AB[30] = 18'h0f994; AB[31] = 18'h0dd4e;
+ AB[32] = 18'h1b024; AB[33] = 18'h0127f; AB[34] = 18'h02631; AB[35] = 18'h0186b;
+ AB[36] = 18'h03adb; AB[37] = 18'h05368; AB[38] = 18'h059a5; AB[39] = 18'h002e0;
+ AB[40] = 18'h0b78a; AB[41] = 18'h016f3; AB[42] = 18'h0b58d; AB[43] = 18'h03ddb;
+ AB[44] = 18'h078b0; AB[45] = 18'h0073b; AB[46] = 18'h07337; AB[47] = 18'h0c7b0;
+ AB[48] = 18'h00668; AB[49] = 18'h0106d; AB[50] = 18'h01a44; AB[51] = 18'h05ee3;
+ AB[52] = 18'h0462d; AB[53] = 18'h0fdeb; AB[54] = 18'h05f85; AB[55] = 18'h02af9;
+ AB[56] = 18'h0e1c0; AB[57] = 18'h00989; AB[58] = 18'h01201; AB[59] = 18'h0e194;
+ AB[60] = 18'h07f93; AB[61] = 18'h0e739; AB[62] = 18'h07cf6; AB[63] = 18'h019df;
+ //
+ end
+
+
+ //
+ // BRAMs
+ //
+ reg tb_fat_bram_xy_ena = 1'b0;
+ reg [ 2:0] tb_fat_bram_xy_bank;
+ reg [ 7:0] tb_fat_bram_xy_addr;
+ reg [17:0] tb_fat_bram_x_din;
+ reg [17:0] tb_fat_bram_y_din;
+
+ reg mgr_fat_bram_xy_ena = 1'b0;
+ reg [ 2:0] mgr_fat_bram_xy_bank;
+ reg [ 7:0] mgr_fat_bram_xy_addr;
+ reg [17:0] mgr_fat_bram_x_din;
+ reg [17:0] mgr_fat_bram_y_din;
+
+ reg mac_fat_bram_xy_ena = 1'b0;
+ reg mac_fat_bram_xy_reg_ena = 1'b0;
+ reg [ 2:0] mac_fat_bram_xy_bank;
+ reg [ 7:0] mac_fat_bram_xy_addr[0:3];
+ wire [17:0] mac_fat_bram_x_dout[0:3];
+ wire [17:0] mac_fat_bram_y_dout[0:3];
+
+ reg tb_slim_bram_xy_ena = 1'b0;
+ reg [ 1:0] tb_slim_bram_xy_bank;
+ reg [ 7:0] tb_slim_bram_xy_addr;
+ reg [17:0] tb_slim_bram_x_din;
+ reg [17:0] tb_slim_bram_y_din;
+
+ reg mac_slim_bram_xy_ena = 1'b0;
+ reg mac_slim_bram_xy_reg_ena = 1'b0;
+ reg [ 1:0] mac_slim_bram_xy_bank;
+ reg [ 7:0] mac_slim_bram_xy_addr;
+ reg [ 7:0] mac_slim_bram_xy_addr_dly;
+ wire [17:0] mac_slim_bram_x_dout;
+ wire [17:0] mac_slim_bram_y_dout;
+
+ reg mac_slim_bram_xy_reg_ena_dly = 1'b0;
+ //
+ // One-cycle delayed copies of the slim BRAM read address and reg_ena strobe.
+ always @(posedge clk) begin
+     mac_slim_bram_xy_addr_dly    <= mac_slim_bram_xy_addr;
+     mac_slim_bram_xy_reg_ena_dly <= mac_slim_bram_xy_reg_ena;
+ end
+
+
+
+ genvar z;
+ generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+ begin : gen_fat_bram
+ //
+ ip_bram_36k fat_bram_x
+ (
+ .clka (clk),
+ .ena (mgr_fat_bram_xy_ena),
+ .wea (mgr_fat_bram_xy_ena),
+ .addra ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
+ .dina (mgr_fat_bram_x_din),
+
+ .clkb (clk),
+ .enb (mac_fat_bram_xy_ena),
+ .regceb (mac_fat_bram_xy_reg_ena),
+ .addrb ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}),
+ .doutb (mac_fat_bram_x_dout[z])
+ );
+ //
+ ip_bram_36k fat_bram_y
+ (
+ .clka (clk),
+ .ena (mgr_fat_bram_xy_ena),
+ .wea (mgr_fat_bram_xy_ena),
+ .addra ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
+ .dina (mgr_fat_bram_y_din),
+
+ .clkb (clk),
+ .enb (mac_fat_bram_xy_ena),
+ .regceb (mac_fat_bram_xy_reg_ena),
+ .addrb ({mac_fat_bram_xy_bank, mac_fat_bram_xy_addr[z]}),
+ .doutb (mac_fat_bram_y_dout[z])
+ );
+ //
+ end
+ endgenerate
+
+ ip_bram_18k slim_bram_x
+ (
+ .clka (clk),
+ .ena (tb_slim_bram_xy_ena),
+ .wea (tb_slim_bram_xy_ena),
+ .addra ({tb_slim_bram_xy_bank, tb_slim_bram_xy_addr}),
+ .dina (tb_slim_bram_x_din),
+
+ .clkb (clk),
+ .enb (mac_slim_bram_xy_ena),
+ .regceb (mac_slim_bram_xy_reg_ena),
+ .addrb ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
+ .doutb (mac_slim_bram_x_dout)
+ );
+
+ ip_bram_18k slim_bram_y
+ (
+ .clka (clk),
+ .ena (tb_slim_bram_xy_ena),
+ .wea (tb_slim_bram_xy_ena),
+ .addra ({tb_slim_bram_xy_bank, tb_slim_bram_xy_addr}),
+ .dina (tb_slim_bram_y_din),
+
+ .clkb (clk),
+ .enb (mac_slim_bram_xy_ena),
+ .regceb (mac_slim_bram_xy_reg_ena),
+ .addrb ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
+ .doutb (mac_slim_bram_y_dout)
+ );
+
+
+
+ //
+ // Enable, Ready
+ //
+ reg ena = 1'b0;
+
+ integer i;
+ initial begin
+ // Hold reset for 10 clock periods.
+ for (i=0; i<10; i=i+1)
+ wait_clock_tick;
+
+ rst = 1'b0;
+ // Idle for another 10 clock periods after releasing reset.
+ for (i=0; i<10; i=i+1)
+ wait_clock_tick;
+ // Open the testbench write ports of the fat and slim BRAMs.
+ tb_fat_bram_xy_ena = 1'b1;
+ tb_slim_bram_xy_ena = 1'b1;
+ // Write the 32 words of T1 (X side) and T2 (Y side), one word per clock period.
+ for (i=0; i<32; i=i+1) begin
+ tb_fat_bram_xy_bank = BANK_FAT_T1T2; // fat BRAM writes are forwarded through the mgr_fat_bram_* registers
+ tb_fat_bram_xy_addr = i[7:0];
+ tb_fat_bram_x_din = T1[i];
+ tb_fat_bram_y_din = T2[i];
+
+ tb_slim_bram_xy_bank = BANK_SLIM_T1T2; // slim BRAM write port is driven directly by these signals
+ tb_slim_bram_xy_addr = i[7:0];
+ tb_slim_bram_x_din = T1[i];
+ tb_slim_bram_y_din = T2[i];
+
+ wait_clock_tick;
+ end
+ // Close the write ports and drive X on the now unused buses.
+ tb_fat_bram_xy_ena = 1'b0;
+ tb_slim_bram_xy_ena = 1'b0;
+
+ tb_fat_bram_xy_bank = {3{1'bX}};
+ tb_fat_bram_xy_addr = {8{1'bX}};
+ tb_fat_bram_x_din = {18{1'bX}};
+ tb_fat_bram_y_din = {18{1'bX}};
+
+ tb_slim_bram_xy_bank = {2{1'bX}};
+ tb_slim_bram_xy_addr = {8{1'bX}};
+ tb_slim_bram_x_din = {18{1'bX}};
+ tb_slim_bram_y_din = {18{1'bX}};
+
+ for (i=0; i<10; i=i+1)
+ wait_clock_tick;
+ // Start the operation with a one-period ena pulse.
+ ena = 1'b1;
+ wait_clock_tick;
+ ena = 1'b0;
+ // Fixed wait of 10000 clock periods; completion is not polled.
+ for (i=0; i<10000; i=i+1)
+ wait_clock_tick;
+ // Compare the captured AB_READ words against the reference AB.
+ verify_ab;
+
+ end
+
+
+ //
+ // DSPs
+ //
+ reg dsp_x_ce_a;
+ reg dsp_x_ce_b;
+ reg dsp_x_ce_b_dly;
+ reg dsp_x_ce_m;
+ reg dsp_x_ce_p;
+ reg dsp_x_ce_mode;
+
+ reg [8 -1:0] dsp_x_mode_z = {8{1'b1}};
+
+ wire [4*18-1:0] dsp_x_a;
+ reg [1*17-1:0] dsp_x_b;
+ wire [8*47-1:0] dsp_x_p;
+
+ reg dsp_y_ce_a;
+ reg dsp_y_ce_b;
+ reg dsp_y_ce_b_dly;
+ reg dsp_y_ce_m;
+ reg dsp_y_ce_p;
+ reg dsp_y_ce_mode;
+
+ reg [8 -1:0] dsp_y_mode_z = {8{1'b1}};
+
+ wire [4*18-1:0] dsp_y_a;
+ reg [1*17-1:0] dsp_y_b;
+ wire [8*47-1:0] dsp_y_p;
+
+ generate for (z=0; z<(NUM_MULTS/2); z=z+1) // pack the fat BRAM read words into the DSP arrays' A buses
+ begin : gen_dsp_xy_a_split
+ assign dsp_x_a[18*z+:18] = mac_fat_bram_x_dout[z]; // fat X BRAM z drives 18-bit slice z of dsp_x_a
+ assign dsp_y_a[18*z+:18] = mac_fat_bram_y_dout[z]; // fat Y BRAM z drives 18-bit slice z of dsp_y_a
+ end
+ endgenerate
+
+ always @(posedge clk)
+ // One-cycle delayed copies of the ce_b strobes; they feed ce_m and ce_mode.
+ {dsp_y_ce_b_dly, dsp_x_ce_b_dly} <= {dsp_y_ce_b, dsp_x_ce_b};
+
+
+ reg [8 -1:0] dsp_xy_mode_z_adv1 = {8{1'b1}};
+ reg [8 -1:0] dsp_xy_mode_z_adv2 = {8{1'b1}};
+ reg [8 -1:0] dsp_xy_mode_z_adv3 = {8{1'b1}};
+ reg [8 -1:0] dsp_xy_mode_z_adv4 = {8{1'b1}};
+
+ dsp_array dsp_x
+ (
+ .clk (clk),
+
+ .ce_a (dsp_x_ce_a),
+ .ce_b (dsp_x_ce_b),
+ .ce_m (dsp_x_ce_m),
+ .ce_p (dsp_x_ce_p),
+ .ce_mode (dsp_x_ce_mode),
+
+ .mode_z (dsp_x_mode_z),
+
+ .a (dsp_x_a),
+ .b (dsp_x_b),
+ .p (dsp_x_p)
+ );
+
+ dsp_array dsp_y
+ (
+ .clk (clk),
+
+ .ce_a (dsp_y_ce_a),
+ .ce_b (dsp_y_ce_b),
+ .ce_m (dsp_y_ce_m),
+ .ce_p (dsp_y_ce_p),
+ .ce_mode (dsp_y_ce_mode),
+
+ .mode_z (dsp_y_mode_z),
+
+ .a (dsp_y_a),
+ .b (dsp_y_b),
+ .p (dsp_y_p)
+ );
+
+
+ //
+ // FSM State and Next States
+ //
+ reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+
+
+ always @(posedge clk)
+ // FSM state register with synchronous, active-high reset to IDLE.
+ if (rst) fsm_state <= FSM_STATE_IDLE;
+ else fsm_state <= fsm_state_next;
+
+
+ localparam [7:0] index_last = 8'd31;
+
+
+ wire mult_square_addr_almost_done_comb;
+ reg mult_square_addr_almost_done_flop;
+
+ wire mult_square_addr_surely_done_comb;
+ reg mult_square_addr_surely_done_flop;
+
+ assign mult_square_addr_almost_done_comb = mac_slim_bram_xy_addr == (index_last - 8'd1);
+ assign mult_square_addr_surely_done_comb = mac_slim_bram_xy_addr == index_last;
+
+ always @(posedge clk)
+ // Registered copies of the slim address comparators, valid only while the FSM is in a BUSY state.
+ case (fsm_state)
+
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <=
+ {mult_square_addr_surely_done_comb, mult_square_addr_almost_done_comb}; // address == index_last / address == index_last - 1
+
+ default:
+ {mult_square_addr_surely_done_flop, mult_square_addr_almost_done_flop} <= 2'b00; // both flags cleared outside BUSY
+
+ endcase
+
+
+ //
+ // Column
+ //
+ reg [4:0] col_index;
+ reg [4:0] col_index_prev;
+ reg [4:0] col_index_last;
+
+ always @(posedge clk)
+ //
+ col_index_prev <= col_index;
+
+ //
+ // FSM Transition Logic
+ //
+ wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+
+
+
+ always @(posedge clk)
+ // Slim BRAM read address, selected by the state the FSM is about to enter.
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_slim_bram_xy_addr <= 8'd0; // each column starts at word 0
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_addr <= !mult_square_addr_almost_done_flop ? mac_slim_bram_xy_addr + 1'b1 : 8'd0; // count up; return to 0 once the almost-done flag is set
+ default: mac_slim_bram_xy_addr <= 8'dX; // address is X outside the multiplication states
+ endcase
+
+ integer j;
+ always @(posedge clk)
+ // Fat BRAM read addresses, one counter per fat BRAM j.
+ for (j=0; j<(NUM_MULTS/2); j=j+1)
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT: mac_fat_bram_xy_addr[j] <= 1 + 2 * j; // first column: odd start addresses 1, 3, 5, ...
+ FSM_STATE_MULT_SQUARE_COL_N_INIT: mac_fat_bram_xy_addr[j] <= 8 * (col_index + 1) + 1 + 2 * j; // col_index is incremented by a nonblocking assignment on this same edge, so col_index + 1 is the column being started
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last); // count down, reloading index_last after reaching 0
+ default: mac_fat_bram_xy_addr[j] <= 8'dX; // address is X outside the multiplication states
+ endcase
+
+
+
+ always @(posedge clk)
+ // Slim BRAM read bank: the T1/T2 bank in every multiplication state, X otherwise.
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_bank <= BANK_SLIM_T1T2;
+ default: mac_slim_bram_xy_bank <= 2'bXX;
+ endcase
+
+ always @(posedge clk)
+ // Fat BRAM read bank: the T1/T2 bank in every multiplication state, X otherwise.
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_bank <= BANK_FAT_T1T2;
+ default: mac_fat_bram_xy_bank <= 3'bXXX;
+ endcase
+
+
+
+ always @(posedge clk)
+ // Slim BRAM read enable.
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_slim_bram_xy_ena <= 1'b1;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_slim_bram_xy_ena <= ~mult_square_addr_almost_done_flop; // dropped in the same cycle the slim address returns to 0
+ default: mac_slim_bram_xy_ena <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ // Fat BRAM read enable: high throughout the multiplication states, low otherwise.
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_INIT,
+ FSM_STATE_MULT_SQUARE_COL_N_INIT,
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_fat_bram_xy_ena <= 1'b1;
+ default: mac_fat_bram_xy_ena <= 1'b0;
+ endcase
+
+
+ //
+ // BRAM regceb strobes: each one trails its port enable by one clock.
+ //
+ always @(posedge clk) begin
+     mac_fat_bram_xy_reg_ena  <= mac_fat_bram_xy_ena;
+     mac_slim_bram_xy_reg_ena <= mac_slim_bram_xy_ena;
+ end
+
+
+ always @(posedge clk)
+ // B operands of both DSP arrays: low 17 bits of the slim BRAM read words, X when no read word is due.
+ if (mac_slim_bram_xy_reg_ena_dly)
+ {dsp_y_b, dsp_x_b} <= {mac_slim_bram_x_dout[16:0], mac_slim_bram_y_dout[16:0]}; // crossed: dsp_x_b takes the slim Y word, dsp_y_b the slim X word
+ else
+ {dsp_y_b, dsp_x_b} <= {2{{17{1'bX}}}}; // NOTE(review): the crossing above looks intentional - reference AB[0] equals the low 16 bits of T1[0]*T2[0]; confirm
+
+
+ // Next fat BRAM address: count down; reload the supplied last address after 0.
+ function [7:0] mac_fat_bram_xy_addr_next;
+     input [7:0] mac_fat_bram_xy_addr_current;
+     input [7:0] mac_fat_bram_xy_addr_last;
+     begin
+         mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last;
+         if (mac_fat_bram_xy_addr_current > 0)
+             mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1;
+     end
+ endfunction
+
+
+
+ //
+ // DSP clock-enable chain, each strobe registered once:
+ //   ce_a    <- slim BRAM reg_ena OR its one-cycle delayed copy
+ //   ce_b    <- delayed slim BRAM reg_ena
+ //   ce_m    <- ce_b_dly
+ //   ce_p    <- ce_m
+ //   ce_mode <- ce_b_dly
+ //
+ always @(posedge clk) begin
+     dsp_x_ce_a <= mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly;
+     dsp_y_ce_a <= mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly;
+     dsp_x_ce_b <= mac_slim_bram_xy_reg_ena_dly;
+     dsp_y_ce_b <= mac_slim_bram_xy_reg_ena_dly;
+     dsp_x_ce_m <= dsp_x_ce_b_dly;
+     dsp_y_ce_m <= dsp_y_ce_b_dly;
+     dsp_x_ce_p <= dsp_x_ce_m;
+     dsp_y_ce_p <= dsp_y_ce_m;
+     {dsp_y_ce_mode, dsp_x_ce_mode} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
+ end
+
+ task wait_clock_tick; // blocks the caller for one clock period
+ begin
+ #`CLK_PERIOD_NS; // plain time delay, not an @(posedge clk) wait
+ end
+ endtask
+
+ //
+ // Increment Logic
+ //
+ always @(posedge clk)
+ // Column counter, updated on entry to the INIT states; it holds its value in all other states.
+ case (fsm_state_next)
+ // First column: restart from 0 and latch the last column number.
+ FSM_STATE_MULT_SQUARE_COL_0_INIT: begin
+ col_index <= 5'd0;
+ col_index_last <= index_last[7:3]; // index_last divided by 8
+ end
+ // Every following column: advance by one.
+ FSM_STATE_MULT_SQUARE_COL_N_INIT:
+ col_index <= col_index + 1'b1;
+ //
+ endcase
+
+ assign fsm_state_after_mult_square = (col_index == col_index_last) ? FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT; // after the last column go to HOLDOFF, otherwise start the next column
+
+ always @(posedge clk)
+ // Head of the mode_z delay line (adv4 -> adv3 -> adv2 -> adv1 -> dsp_*_mode_z).
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: dsp_xy_mode_z_adv4 <= {8{1'b0}}; // all eight bits low at the start of a column
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly); // at most one bit low, chosen from the delayed column index and slim address
+ default: dsp_xy_mode_z_adv4 <= {8{1'b1}}; // all bits high outside TRIG/BUSY
+ endcase
+
+ // mode_z delay line: adv4 -> adv3 -> adv2 -> adv1 -> dsp_{x,y}_mode_z, one
+ // stage per clock; both DSP arrays receive the same word.
+ always @(posedge clk) begin
+     dsp_x_mode_z <= dsp_xy_mode_z_adv1;
+     dsp_y_mode_z <= dsp_xy_mode_z_adv1;
+     {dsp_xy_mode_z_adv1, dsp_xy_mode_z_adv2, dsp_xy_mode_z_adv3} <= {dsp_xy_mode_z_adv2, dsp_xy_mode_z_adv3, dsp_xy_mode_z_adv4};
+ end
+
+ function [NUM_MULTS-1:0] calc_mac_mode_z_square; // returns the mode_z word for one slim BRAM address
+ input [ 4:0] col_index_value; // column to compare against
+ input [ 7:0] mac_slim_bram_xy_addr_value; // slim address: bits [7:3] select the column, bits [2:0] the bit position
+ begin
+ if (mac_slim_bram_xy_addr_value[7:3] == col_index_value) // address lies inside the given column
+ case (mac_slim_bram_xy_addr_value[2:0]) // clear exactly the bit numbered by the low three address bits
+ 3'b000: calc_mac_mode_z_square = 8'b11111110;
+ 3'b001: calc_mac_mode_z_square = 8'b11111101;
+ 3'b010: calc_mac_mode_z_square = 8'b11111011;
+ 3'b011: calc_mac_mode_z_square = 8'b11110111;
+ 3'b100: calc_mac_mode_z_square = 8'b11101111;
+ 3'b101: calc_mac_mode_z_square = 8'b11011111;
+ 3'b110: calc_mac_mode_z_square = 8'b10111111;
+ 3'b111: calc_mac_mode_z_square = 8'b01111111; // the 8-bit literals assume NUM_MULTS is 8 - TODO confirm against modexpng_parameters_x8.vh
+ endcase
+ else
+ calc_mac_mode_z_square = {NUM_MULTS{1'b1}}; // other columns: every bit high
+ end
+ endfunction
+
+ reg recomb_x_ena = 1'b0;
+ reg recomb_y_ena = 1'b0;
+
+ // Recombinator enables: registered, high one clock after ce_a was high while
+ // ce_b, ce_m and ce_p of the same DSP array were all low.
+ always @(posedge clk) begin
+     recomb_x_ena <= dsp_x_ce_a && !(dsp_x_ce_b || dsp_x_ce_m || dsp_x_ce_p);
+     recomb_y_ena <= dsp_y_ce_a && !(dsp_y_ce_b || dsp_y_ce_m || dsp_y_ce_p);
+ end
+
+ wire [ 2:0] recomb_fat_bram_xy_bank;
+ wire [ 7:0] recomb_fat_bram_xy_addr;
+ wire [17:0] recomb_fat_bram_x_dout;
+ wire [17:0] recomb_fat_bram_y_dout;
+ wire recomb_fat_bram_xy_dout_valid;
+ wire recomb_rdy;
+
+ modexpng_part_recombinator recomb
+ (
+ .clk (clk),
+ .rdy (recomb_rdy),
+ .fsm_state_next (fsm_state_next),
+ .index_last (index_last),
+ .dsp_x_ce_p (dsp_x_ce_p),
+ .dsp_y_ce_p (dsp_y_ce_p),
+ .ena_x (recomb_x_ena),
+ .ena_y (recomb_y_ena),
+ .dsp_x_p (dsp_x_p),
+ .dsp_y_p (dsp_y_p),
+ .col_index (col_index),
+ .col_index_last (col_index_last),
+ .slim_bram_xy_addr (mac_slim_bram_xy_addr),
+ .fat_bram_xy_bank (recomb_fat_bram_xy_bank),
+ .fat_bram_xy_addr (recomb_fat_bram_xy_addr),
+ .fat_bram_x_dout (recomb_fat_bram_x_dout),
+ .fat_bram_y_dout (recomb_fat_bram_y_dout),
+ .fat_bram_xy_dout_valid (recomb_fat_bram_xy_dout_valid)
+ );
+
+ reg [17:0] AB_READ[0:63];
+
+ always @(posedge clk)
+ // Capture the recombinator's X output words into AB_READ for the final check.
+ if (recomb_fat_bram_xy_dout_valid)
+ // Only the X word is stored; writes to any other bank are not captured.
+ case (recomb_fat_bram_xy_bank)
+ 3'd1: AB_READ[recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout; // bank 1 fills entries 0 and up
+ 3'd2: AB_READ[32 + recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout; // bank 2 fills entries 32 and up
+ endcase
+
+
+ always @(posedge clk)
+ // Registered write-port mux for the fat BRAMs: testbench writes take priority over recombinator writes.
+ if (tb_fat_bram_xy_ena) begin
+ mgr_fat_bram_xy_ena <= 1'b1;
+ mgr_fat_bram_xy_bank <= tb_fat_bram_xy_bank;
+ mgr_fat_bram_xy_addr <= tb_fat_bram_xy_addr;
+ mgr_fat_bram_x_din <= tb_fat_bram_x_din;
+ mgr_fat_bram_y_din <= tb_fat_bram_y_din;
+ end else if (recomb_fat_bram_xy_dout_valid) begin // recombinator results are written back into the fat BRAMs
+ mgr_fat_bram_xy_ena <= 1'b1;
+ mgr_fat_bram_xy_bank <= recomb_fat_bram_xy_bank;
+ mgr_fat_bram_xy_addr <= recomb_fat_bram_xy_addr;
+ mgr_fat_bram_x_din <= recomb_fat_bram_x_dout;
+ mgr_fat_bram_y_din <= recomb_fat_bram_y_dout;
+ end else begin // no writer: port disabled, buses driven X
+ mgr_fat_bram_xy_ena <= 1'b0;
+ mgr_fat_bram_xy_bank <= 3'bXXX;
+ mgr_fat_bram_xy_addr <= 8'hXX;
+ mgr_fat_bram_x_din <= {18{1'bX}};
+ mgr_fat_bram_y_din <= {18{1'bX}};
+ end
+
+
+
+
+
+ // Compare every captured word in AB_READ against the reference AB, print one
+ // line per word (mismatches are tagged) and finish with an overall verdict.
+ task verify_ab;
+     reg verify_ab_ok;
+     begin
+         verify_ab_ok = 1;
+         for (i=0; i<64; i=i+1) begin
+             if (AB_READ[i] !== AB[i]) begin
+                 $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x <???>", i, AB[i], AB_READ[i]);
+                 verify_ab_ok = 0;
+             end else
+                 $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x", i, AB[i], AB_READ[i]);
+         end
+         if (!verify_ab_ok) $display("AB is WRONG!");
+         else               $display("AB is OK.");
+     end
+ endtask
+
+
+
+ always @* begin
+ // Combinational next-state logic.
+ fsm_state_next = FSM_STATE_IDLE;
+ //
+ case (fsm_state)
+ FSM_STATE_IDLE: fsm_state_next = ena ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE; // ena starts the first column
+
+ FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_TRIG ;
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY; // column 0 is always followed by another column
+
+ FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_TRIG ;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY; // next column, or HOLDOFF after the last one
+
+ FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = recomb_rdy ? FSM_STATE_IDLE : FSM_STATE_MULT_SQUARE_HOLDOFF; // stay here until the recombinator reports rdy
+
+ default: fsm_state_next = FSM_STATE_IDLE ;
+
+ endcase
+ //
+ end
+
+
+endmodule
+
diff --git a/rtl/dev/temp.txt b/rtl/dev/temp.txt
new file mode 100644
index 0000000..987bd86
--- /dev/null
+++ b/rtl/dev/temp.txt
@@ -0,0 +1,384 @@
+ //
+ // Helper Functions
+ //
+ /*
+ function [INDEX_WIDTH-1:0] calc_preset_a_index;
+ input [INDEX_WIDTH-4:0] col_in;
+ input integer x_in;
+ integer index_out;
+ begin
+ index_out = col_in * NUM_MULTS + x_in;
+ calc_preset_a_index = index_out[INDEX_WIDTH-1:0];
+ end
+ endfunction
+
+ function [INDEX_WIDTH-1:0] calc_rotate_a_index;
+ input [INDEX_WIDTH-1:0] current_index_in;
+ input [INDEX_WIDTH-1:0] last_index_in;
+ begin
+ if (current_index_in > {INDEX_WIDTH{1'b0}})
+ calc_rotate_a_index = current_index_in - 1'b1;
+ else
+ calc_rotate_a_index = last_index_in;
+ end
+ endfunction
+ */
+
+ /*
+ //
+ // Narrow Counters
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_narrow_reg;
+ reg [INDEX_WIDTH-1:0] din_addr_narrow_dly;
+ localparam [INDEX_WIDTH-1:0] din_addr_narrow_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] din_addr_narrow_next = (din_addr_narrow_reg < index_last) ?
+ din_addr_narrow_reg + 1'b1 : din_addr_narrow_zero;
+ wire din_addr_narrow_done = din_addr_narrow_reg == index_last;
+
+ assign din_addr_narrow = din_addr_narrow_reg;
+
+ always @(posedge clk)
+ //
+ din_addr_narrow_dly <= din_addr_narrow_reg;
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: din_addr_narrow_reg <= din_addr_narrow_next;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_narrow_reg <= din_addr_narrow_next;
+ endcase
+
+
+ //
+ // Helper Functions
+ //
+ function [NUM_MULTS-1:0] calc_mac_clear_bitmask;
+ input [2:0] t;
+ begin
+ case (t)
+ 3'd0: calc_mac_clear_bitmask = 8'b00000001;
+ 3'd1: calc_mac_clear_bitmask = 8'b00000010;
+ 3'd2: calc_mac_clear_bitmask = 8'b00000100;
+ 3'd3: calc_mac_clear_bitmask = 8'b00001000;
+ 3'd4: calc_mac_clear_bitmask = 8'b00010000;
+ 3'd5: calc_mac_clear_bitmask = 8'b00100000;
+ 3'd6: calc_mac_clear_bitmask = 8'b01000000;
+ 3'd7: calc_mac_clear_bitmask = 8'b10000000;
+ endcase
+ end
+ endfunction
+
+ function [NUM_MULTS:0] calc_mac_clear_square;
+ input [INDEX_WIDTH-4:0] current_col_index;
+ input [INDEX_WIDTH-1:0] b_addr_prev;
+ begin
+ if (b_addr_prev[INDEX_WIDTH-1:3] == current_col_index)
+ calc_mac_clear_square = {1'b0, calc_mac_clear_bitmask(b_addr_prev[2:0])};
+ else
+ calc_mac_clear_square = {1'b0, {NUM_MULTS{1'b0}}};
+ end
+ endfunction
+
+
+ //
+ // Wide Counters
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_wide_reg[0:NUM_MULTS-1];
+
+ integer xi;
+ always @(posedge clk)
+ //
+ for (xi=0; xi<NUM_MULTS; xi=xi+1)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(0, xi);
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_wide_reg[xi] <= calc_preset_a_index(col_index + 1'b1, xi);
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_wide_reg[xi] <= calc_rotate_a_index(din_addr_wide_reg[xi], index_last);
+ //
+ endcase
+
+
+ //
+ // Enables
+ //
+ reg din_ena_narrow_reg = 1'b0;
+ reg [NUM_MULTS-1:0] din_ena_wide_reg = {NUM_MULTS{1'b0}};
+
+ assign din_ena_narrow = din_ena_narrow_reg;
+ assign din_ena_wide = din_ena_wide_reg;
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) din_ena_narrow_reg <= 1'b0;
+ else case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_narrow_reg <= 1'b1;
+ default: din_ena_narrow_reg <= 1'b0;
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+ else case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_ena_wide_reg <= {NUM_MULTS{1'b1}};
+ default: din_ena_wide_reg <= {NUM_MULTS{1'b0}};
+ endcase
+
+
+ //
+ // Modes
+ //
+ reg [2-1:0] din_mode_wide_reg;
+ reg [2-1:0] din_mode_narrow_reg;
+ reg [2-1:0] dout_mode_wide_reg;
+ reg [2-1:0] dout_mode_narrow_reg;
+
+ assign din_mode_wide = din_mode_wide_reg;
+ assign din_mode_narrow = din_mode_narrow_reg;
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_wide_reg <= MODEXPNG_MODE_A;
+ default: din_mode_wide_reg <= 2'bXX;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_mode_narrow_reg <= MODEXPNG_MODE_B;
+ default: din_mode_narrow_reg <= 2'bXX;
+ endcase
+
+
+ //
+ // MAC Array
+ //
+ wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_a[0:NUM_MULTS];
+ wire [MODEXPNG_WORD_WIDTH-1:0] mac_din_b;
+ reg [ NUM_MULTS :0] mac_ce;
+ reg [ NUM_MULTS :0] mac_clr;
+ wire [ MODEXPNG_MAC_WIDTH-1:0] mac_p[0:NUM_MULTS];
+ reg [ NUM_MULTS :0] mac_rdy_lsb;
+ reg [ NUM_MULTS :0] mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1:0];
+
+ //reg [ NUM_MULTS :0] mac_ce_dly[MODEXPNG_MAC_LATENCY-1:0];
+ //wire [ NUM_MULTS :0] mac_rdy;
+
+
+
+
+
+ assign mac_din_b = din_narrow;
+
+
+ genvar x;
+ generate for (x=0; x<=NUM_MULTS; x=x+1)
+ begin : gen_macs
+ //
+ //assign mac_rdy[x] = mac_ce_dly[MODEXPNG_MAC_LATENCY-1][x];
+ //
+ modexpng_mac mac_inst
+ (
+ .clk (clk),
+ .ce (mac_ce[x]),
+ .clr (mac_clr[x]),
+ .a (mac_din_a[x]),
+ .b (mac_din_b),
+ .p (mac_p[x])
+ );
+ //
+ end
+ //
+ endgenerate
+
+ generate for (x=0; x<NUM_MULTS; x=x+1)
+ begin : gen_mac_din_a
+ //
+ assign mac_din_a[x] = din_wide[x*MODEXPNG_WORD_WIDTH+:MODEXPNG_WORD_WIDTH];
+ //
+ end
+ endgenerate
+
+ generate for (x=0; x<NUM_MULTS; x=x+1)
+ begin : gen_din_addr_wide
+ //
+ assign din_addr_wide[x*INDEX_WIDTH+:INDEX_WIDTH] = din_addr_wide_reg[x];
+ //
+ end
+ endgenerate
+
+
+ //
+ // MAC Clock Enable Logic
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0) mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+ else case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_ce <= {1'b0, {NUM_MULTS{1'b1}}};
+ default: mac_ce <= {1'b0, {NUM_MULTS{1'b0}}};
+ endcase
+
+
+ //
+ // MAC Valid Logic
+ //
+ integer y;
+
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_rdy_lsb_dly[0][xi] <= mac_rdy_lsb[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_rdy_lsb_dly[y][xi] <= mac_rdy_lsb_dly[y-1][xi];
+ end
+
+ always @(posedge clk) begin
+ //
+ fsm_state_dly[0] <= fsm_state;
+ for (y=1; y<=MODEXPNG_MAC_LATENCY; y=y+1)
+ fsm_state_dly[y] <= fsm_state_dly[y-1];
+ end
+
+ */
+
+ /*
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_ce_dly[0][xi] <= mac_ce[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_ce_dly[y][xi] <= mac_ce_dly[y-1][xi];
+ end
+ */
+ /*
+ always @(posedge clk)
+ //
+ for (xi=0; xi<=NUM_MULTS; xi=xi+1) begin
+ mac_clr_dly[0][xi] <= mac_clr[xi];
+ for (y=1; y<MODEXPNG_MAC_LATENCY; y=y+1)
+ mac_clr_dly[y][xi] <= mac_clr_dly[y-1][xi];
+ end
+ */
+
+ /*
+ //
+ // MAC Clear Logic
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_clr <= {1'b0, {NUM_MULTS{1'b1}}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_clr <= calc_mac_clear_square(col_index, din_addr_narrow_dly);
+ default: mac_clr <= {1'bX, {NUM_MULTS{1'bX}}};
+ endcase
+
+
+ //
+ // MAC Ready Logic
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_rdy_lsb <= calc_mac_clear_square(col_index, din_addr_narrow);
+ default: mac_rdy_lsb <= {1'bX, {NUM_MULTS{1'bX}}};
+ endcase
+
+
+
+ //
+ // Recombinators
+ //
+ reg rcmb_lsb_ce;
+ reg rcmb_lsb_clr;
+ reg [MODEXPNG_MAC_WIDTH-1: 0] rcmb_lsb_din;
+ wire [15: 0] rcmb_lsb_dout;
+
+ modexpng_part_recombinator recomb_lsb
+ (
+ .clk (clk),
+ .ce (rcmb_lsb_ce),
+ .clr (rcmb_lsb_clr),
+ .din (rcmb_lsb_din),
+ .dout (rcmb_lsb_dout)
+ );
+
+
+ reg calc_rcmb_lsb_ce;
+ always @*
+ //
+ calc_rcmb_lsb_ce = | mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0];
+
+ reg [MODEXPNG_MAC_WIDTH-1:0] calc_rcmb_lsb_din;
+
+ always @*
+ //
+ casez (mac_rdy_lsb_dly[MODEXPNG_MAC_LATENCY-1][NUM_MULTS-1:0])
+ 8'b00000001: calc_rcmb_lsb_din = mac_p[0];
+ 8'b00000010: calc_rcmb_lsb_din = mac_p[1];
+ 8'b00000100: calc_rcmb_lsb_din = mac_p[2];
+ 8'b00001000: calc_rcmb_lsb_din = mac_p[3];
+ 8'b00010000: calc_rcmb_lsb_din = mac_p[4];
+ 8'b00100000: calc_rcmb_lsb_din = mac_p[5];
+ 8'b01000000: calc_rcmb_lsb_din = mac_p[6];
+ 8'b10000000: calc_rcmb_lsb_din = mac_p[7];
+ default: calc_rcmb_lsb_din = {MODEXPNG_MAC_WIDTH{1'bX}};
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0)
+ rcmb_lsb_ce <= 1'b0;
+ else case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_ce <= calc_rcmb_lsb_ce;
+ default: rcmb_lsb_ce <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: rcmb_lsb_clr <= 1'b1;
+ default: rcmb_lsb_clr <= 1'b0;
+ endcase
+
+ always @(posedge clk)
+ //
+ case (fsm_state_dly[MODEXPNG_MAC_LATENCY])
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: rcmb_lsb_din <= calc_rcmb_lsb_din;
+ default: rcmb_lsb_din <= {MODEXPNG_MAC_WIDTH{1'bX}};
+ endcase
+
+
+
+*/
diff --git a/rtl/dsp/dsp_array.v b/rtl/dsp/dsp_array.v
new file mode 100644
index 0000000..178f87f
--- /dev/null
+++ b/rtl/dsp/dsp_array.v
@@ -0,0 +1,111 @@
+//
+// dsp_array: NUM_MULTS/2 pairs of DSP48E1 wrappers (dsp_slice).
+//
+// Each pair consists of
+//  * dsp_direct  - takes its A/B operands from the `a`/`b` ports, and
+//  * dsp_cascade - takes its A/B operands from the cascade outputs of the
+//                  direct slice of the same pair.
+// The cascade slice receives register clock enables delayed by one clock
+// relative to the direct slice (ce_a1/ce_a2, ce_b1 below).
+//
+// Ports:
+//  ce_a, ce_b - clock enables for the first A/B input register stage
+//  ce_m, ce_p - clock enables for the multiplier and product registers
+//  ce_mode    - clock enable for the OPMODE register
+//  mode_z     - one bit per slice, wired to OPMODE[5] (Z multiplexer);
+//               per Xilinx UG479, OPMODE[6:4]=3'b010 feeds P back into the
+//               adder and 3'b000 feeds zero
+//  a          - one 18-bit word per slice pair
+//  b          - single 17-bit word shared by all slices
+//  p          - 47 bits per slice; the direct slice of pair z drives word
+//               2*z, the cascade slice drives word 2*z+1
+//
+// NOTE(review): port widths are hard-coded for 4 pairs (8 slices) while the
+// generate loop is bounded by NUM_MULTS/2 -- assumes NUM_MULTS == 8.
+//
+module dsp_array
+(
+ input clk,
+
+ input ce_a,
+ input ce_b,
+ input ce_m,
+ input ce_p,
+ input ce_mode,
+
+ input [8 -1:0] mode_z,
+
+ input [4*18-1:0] a,
+ input [1*17-1:0] b,
+ output [8*47-1:0] p
+);
+
+ `include "../modexpng_parameters_x8.vh"
+
+ // cascade links from the direct slice to the cascade slice of each pair
+ wire [17:0] casc_a[0:3];
+ wire [16:0] casc_b[0:3];
+
+ // clock enable delay chains: *0 is the undelayed input, *1 and *2 are
+ // delayed by one and two clocks respectively
+ wire ce_a0 = ce_a;
+ reg ce_a1 = 1'b0;
+ reg ce_a2 = 1'b0;
+
+ wire ce_b0 = ce_b;
+ reg ce_b1 = 1'b0;
+
+ always @(posedge clk) begin
+ ce_a1 <= ce_a0;
+ ce_a2 <= ce_a1;
+ ce_b1 <= ce_b0;
+ end
+
+
+ genvar z;
+ generate for (z=0; z<(NUM_MULTS/2); z=z+1)
+ //
+ begin : DSP48E1
+ //
+ // first slice of the pair: operands come directly from the ports
+ dsp_slice #
+ (
+ .AB_INPUT("DIRECT"),
+ .B_REG(2)
+ )
+ dsp_direct
+ (
+ .clk (clk),
+
+ .ce_a1 (ce_a0),
+ .ce_b1 (ce_b0),
+ .ce_a2 (ce_a1),
+ .ce_b2 (ce_b1),
+ .ce_m (ce_m),
+ .ce_p (ce_p),
+ .ce_mode (ce_mode),
+
+ .a (a[z*18+:18]),
+ .b (b),
+ .p (p[47*2*z+:47]),
+
+ .inmode (5'b00000),
+ .opmode ({1'b0, mode_z[2*z], 1'b0, 2'b01, 2'b01}),
+ .alumode (4'b0000),
+
+ // unused cascade inputs are tied off at the full port widths
+ // (casc_a_in is 18 bits wide, casc_b_in is 17 bits wide)
+ .casc_a_in ({18{1'b0}}),
+ .casc_b_in ({17{1'b0}}),
+
+ .casc_a_out (casc_a[z]),
+ .casc_b_out (casc_b[z])
+ );
+ //
+ // second slice of the pair: operands arrive over the cascade links,
+ // the `a`/`b` connections are still made but AB_INPUT selects CASCADE
+ dsp_slice #
+ (
+ .AB_INPUT("CASCADE"),
+ .B_REG(1)
+ )
+ dsp_cascade
+ (
+ .clk (clk),
+
+ .ce_a1 (ce_a1),
+ .ce_b1 (1'b0),
+ .ce_a2 (ce_a2),
+ .ce_b2 (ce_b1),
+ .ce_m (ce_m),
+ .ce_p (ce_p),
+ .ce_mode (ce_mode),
+
+ .a (a[z*18+:18]),
+ .b (b),
+ .p (p[47*(2*z+1)+:47]),
+
+ .inmode (5'b00000),
+ .opmode ({1'b0, mode_z[2*z+1], 1'b0, 2'b01, 2'b01}),
+ .alumode (4'b0000),
+
+ .casc_a_in (casc_a[z]),
+ .casc_b_in (casc_b[z]),
+
+ // last slice of the pair: cascade outputs are left unconnected
+ .casc_a_out (),
+ .casc_b_out ()
+ );
+ //
+ end
+ //
+ endgenerate
+
+
+endmodule
diff --git a/rtl/dsp/dsp_slice.v b/rtl/dsp/dsp_slice.v
new file mode 100644
index 0000000..9f1298b
--- /dev/null
+++ b/rtl/dsp/dsp_slice.v
@@ -0,0 +1,125 @@
+//
+// dsp_slice: thin wrapper around one Xilinx DSP48E1 primitive.
+//
+// Parameters:
+//  AB_INPUT - "DIRECT" or "CASCADE", applied to both A_INPUT and B_INPUT
+//  B_REG    - number of B input register stages (BREG)
+//
+// The wrapper narrows the primitive's buses: `a` (18 bits) and `b` (17 bits)
+// are zero-extended to the 30-bit A and 18-bit B ports, the cascade buses
+// are handled the same way, and only the lower 47 bits of the 48-bit P
+// output are exposed. C, D, PCIN and the carry inputs are tied to zero and
+// all register resets are tied inactive.
+//
+module dsp_slice #
+(
+ // the `parameter` keyword is mandatory in a Verilog-2001 parameter port
+ // list; only SystemVerilog permits omitting it
+ parameter AB_INPUT = "DIRECT",
+ parameter B_REG = 2
+)
+(
+ input clk,
+ input ce_a1,
+ input ce_b1,
+ input ce_a2,
+ input ce_b2,
+ input ce_m,
+ input ce_p,
+ input ce_mode,
+ input [17:0] a,
+ input [16:0] b,
+ output [46:0] p,
+ input [ 4:0] inmode,
+ input [ 6:0] opmode,
+ input [ 3:0] alumode,
+ input [17:0] casc_a_in,
+ input [16:0] casc_b_in,
+ output [17:0] casc_a_out,
+ output [16:0] casc_b_out
+);
+
+ // sinks for the primitive's output bits that are not exposed
+ wire [30-18-1:0] casc_a_dummy;
+ wire [18-17-1:0] casc_b_dummy;
+ wire [48-47-1:0] p_dummy;
+
+ DSP48E1 #
+ (
+ .AREG (2),
+ .BREG (B_REG),
+ .CREG (0),
+ .DREG (0),
+ .ADREG (0),
+ .MREG (1),
+ .PREG (1),
+ .ACASCREG (1),
+ .BCASCREG (1),
+ .INMODEREG (0),
+ .OPMODEREG (1),
+ .ALUMODEREG (0),
+ .CARRYINREG (0),
+ .CARRYINSELREG (0),
+
+ .A_INPUT (AB_INPUT),
+ .B_INPUT (AB_INPUT),
+
+ .USE_DPORT ("FALSE"),
+ .USE_MULT ("DYNAMIC"),
+ .USE_SIMD ("ONE48"),
+
+ .MASK (48'h3fffffffffff),
+ .PATTERN (48'h000000000000),
+ .SEL_MASK ("MASK"),
+ .SEL_PATTERN ("PATTERN"),
+
+ .USE_PATTERN_DETECT ("NO_PATDET"),
+ .AUTORESET_PATDET ("NO_RESET")
+ )
+ DSP48E1_inst
+ (
+ .CLK (clk),
+
+ // clock enables: only the A/B, M, P and OPMODE registers are driven,
+ // everything else is permanently disabled
+ .CEA1 (ce_a1),
+ .CEB1 (ce_b1),
+ .CEA2 (ce_a2),
+ .CEB2 (ce_b2),
+ .CEAD (1'b0),
+ .CEC (1'b0),
+ .CED (1'b0),
+ .CEM (ce_m),
+ .CEP (ce_p),
+ .CEINMODE (1'b0),
+ .CECTRL (ce_mode),
+ .CEALUMODE (1'b0),
+ .CECARRYIN (1'b0),
+
+ // operands are zero-extended to the native port widths
+ .A ({{(30-18){1'b0}}, a}),
+ .B ({{(18-17){1'b0}}, b}),
+ .C ({48{1'b0}}),
+ .D ({25{1'b0}}),
+ .P ({p_dummy, p}),
+
+ .INMODE (inmode),
+ .OPMODE (opmode),
+ .ALUMODE (alumode),
+
+ .ACIN ({{(30-18){1'b0}}, casc_a_in}),
+ .BCIN ({{(18-17){1'b0}}, casc_b_in}),
+ .ACOUT ({casc_a_dummy, casc_a_out}),
+ .BCOUT ({casc_b_dummy, casc_b_out}),
+ .PCIN ({48{1'b0}}),
+ .PCOUT (),
+ .CARRYCASCIN (1'b0),
+ .CARRYCASCOUT (),
+
+ // no register is ever reset
+ .RSTA (1'b0),
+ .RSTB (1'b0),
+ .RSTC (1'b0),
+ .RSTD (1'b0),
+ .RSTM (1'b0),
+ .RSTP (1'b0),
+ .RSTINMODE (1'b0),
+ .RSTCTRL (1'b0),
+ .RSTALUMODE (1'b0),
+ .RSTALLCARRYIN (1'b0),
+
+ .UNDERFLOW (),
+ .OVERFLOW (),
+ .PATTERNDETECT (),
+ .PATTERNBDETECT (),
+
+ .CARRYIN (1'b0),
+ .CARRYOUT (),
+ .CARRYINSEL (3'b000),
+
+ .MULTSIGNIN (1'b0),
+ .MULTSIGNOUT ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mac.v b/rtl/modexpng_mac.v
new file mode 100644
index 0000000..9105dab
--- /dev/null
+++ b/rtl/modexpng_mac.v
@@ -0,0 +1,54 @@
+//
+// modexpng_mac: behavioural 17x17-bit multiply-accumulate unit with an
+// A-operand cascade path.
+//
+// Pipeline (each stage is gated by the clock enable of matching age):
+//  stage 0 (ce)      : capture b_in and either a_in or a_casc_in
+//  stage 1 (ce_dly1) : register the 34-bit product of the captured operands
+//  stage 2 (ce_dly2) : load (clr) or accumulate the product into p_out
+// The `clr` flag travels through the pipeline together with its operands.
+//
+module modexpng_mac
+(
+ clk,
+ ce, clr,
+ casc_a,
+ a_in, b_in, p_out,
+ a_casc_in, a_casc_out
+);
+
+ input clk;
+ input ce;
+ input clr;
+ input casc_a;
+ input [16:0] a_in;
+ input [16:0] b_in;
+ output [46:0] p_out;
+ input [16:0] a_casc_in;
+ output [16:0] a_casc_out;
+
+ //
+ // Pipeline Registers
+ //
+ reg [16:0] a_reg;
+ reg [16:0] b_reg;
+ reg ce_dly1;
+ reg ce_dly2;
+ reg clr_dly1;
+ reg clr_dly2;
+ reg [33:0] m_reg;
+ reg [46:0] p_reg;
+
+ //
+ // Output Mapping
+ //
+ assign a_casc_out = a_reg;
+ assign p_out = p_reg;
+
+ //
+ // Clock Enable Delay Line (free-running)
+ //
+ always @(posedge clk) begin
+ ce_dly1 <= ce;
+ ce_dly2 <= ce_dly1;
+ end
+
+ //
+ // Stage 0: operand capture, A comes either from the port or the cascade
+ //
+ wire [16:0] a_sel = casc_a ? a_casc_in : a_in;
+
+ always @(posedge clk)
+ //
+ if (ce) begin
+ a_reg <= a_sel;
+ b_reg <= b_in;
+ clr_dly1 <= clr;
+ end
+
+ //
+ // Stage 1: multiplier, operands are zero-extended to the product width
+ //
+ wire [33:0] a_reg_ext = {{17{1'b0}}, a_reg};
+ wire [33:0] b_reg_ext = {{17{1'b0}}, b_reg};
+
+ always @(posedge clk)
+ //
+ if (ce_dly1) begin
+ m_reg <= a_reg_ext * b_reg_ext;
+ clr_dly2 <= clr_dly1;
+ end
+
+ //
+ // Stage 2: accumulator, a delayed `clr` restarts the sum from the product
+ //
+ wire [46:0] m_reg_ext = {{13{1'b0}}, m_reg};
+ wire [46:0] p_next = clr_dly2 ? m_reg_ext : p_reg + m_reg_ext;
+
+ always @(posedge clk)
+ //
+ if (ce_dly2) p_reg <= p_next;
+
+endmodule
diff --git a/rtl/modexpng_mac_array.v b/rtl/modexpng_mac_array.v
new file mode 100644
index 0000000..067929e
--- /dev/null
+++ b/rtl/modexpng_mac_array.v
@@ -0,0 +1,116 @@
+//
+// modexpng_mac_array: NUM_MULTS modexpng_mac units plus one auxiliary unit.
+// All units share the same B operand (b_in). The units are chained through
+// their A-cascade ports in the order LSB -> INT[1..NUM_MULTS-2] -> MSB -> AUX,
+// so each unit can take its A operand either from its own slice of a_in or
+// from the A register of the preceding unit.
+//
+module modexpng_mac_array
+(
+ clk,
+ ce, clr,
+ ce_aux, clr_aux,
+ casc_a, casc_a_aux,
+ a_in, b_in, p_out,
+ a_in_aux, p_out_aux
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+
+
+ //
+ // Ports
+ //
+ // ce is common to the NUM_MULTS main units, clr is one bit per unit;
+ // casc_a has one bit per main unit except the LSB one (which never
+ // takes its operand from the cascade); the AUX unit has its own
+ // ce/clr/casc_a controls and its own A operand and product ports
+ //
+ input clk;
+ input ce;
+ input [NUM_MULTS -1:0] clr;
+ input ce_aux;
+ input clr_aux;
+ input [NUM_MULTS -2:0] casc_a;
+ input casc_a_aux;
+ input [NUM_MULTS * WORD_WIDTH -1:0] a_in;
+ input [ 1 * WORD_WIDTH -1:0] b_in;
+ output [NUM_MULTS * MAC_WIDTH -1:0] p_out;
+ input [ 1 * WORD_WIDTH -1:0] a_in_aux;
+ output [ 1 * MAC_WIDTH -1:0] p_out_aux;
+
+
+ //
+ // A-Cascade Paths
+ //
+ // a_casc_int[k] is driven by unit k and read by unit k+1;
+ // a_casc_int_aux links the MSB unit to the AUX unit
+ //
+ wire [WORD_WIDTH-1:0] a_casc_int[0:NUM_MULTS-2];
+ wire [WORD_WIDTH-1:0] a_casc_int_aux;
+
+
+ //
+ // LSB
+ //
+ // first unit of the chain: cascade input is tied off and never selected
+ //
+ modexpng_mac mac_lsb
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[0]),
+ .casc_a (1'b0),
+ .a_in (a_in[0+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[0+:MAC_WIDTH]),
+ .a_casc_in ({WORD_WIDTH{1'b0}}),
+ .a_casc_out (a_casc_int[0])
+ );
+
+
+ //
+ // INT
+ //
+ // intermediate units 1..NUM_MULTS-2, unit z is controlled by casc_a[z-1]
+ //
+ genvar z;
+ generate for (z=1; z<(NUM_MULTS-1); z=z+1)
+ begin : gen_modexpng_mac_int
+ modexpng_mac mac_int
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[z]),
+ .casc_a (casc_a[z-1]),
+ .a_in (a_in[z*WORD_WIDTH+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[z*MAC_WIDTH+:MAC_WIDTH]),
+ .a_casc_in (a_casc_int[z-1]),
+ .a_casc_out (a_casc_int[z])
+ );
+ end
+ endgenerate
+
+
+ //
+ // MSB
+ //
+ // last main unit: its cascade output feeds the AUX unit
+ //
+ modexpng_mac mac_msb
+ (
+ .clk (clk),
+ .ce (ce),
+ .clr (clr[NUM_MULTS-1]),
+ .casc_a (casc_a[NUM_MULTS-2]),
+ .a_in (a_in[(NUM_MULTS-1)*WORD_WIDTH+:WORD_WIDTH]),
+ .b_in (b_in),
+ .p_out (p_out[(NUM_MULTS-1)*MAC_WIDTH+:MAC_WIDTH]),
+ .a_casc_in (a_casc_int[NUM_MULTS-2]),
+ .a_casc_out (a_casc_int_aux)
+ );
+
+
+ //
+ // AUX
+ //
+ // auxiliary unit at the end of the chain: cascade output is unused
+ //
+ modexpng_mac mac_aux
+ (
+ .clk (clk),
+ .ce (ce_aux),
+ .clr (clr_aux),
+ .casc_a (casc_a_aux),
+ .a_in (a_in_aux),
+ .b_in (b_in),
+ .p_out (p_out_aux),
+ .a_casc_in (a_casc_int_aux),
+ .a_casc_out ()
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mem.v b/rtl/modexpng_mem.v
new file mode 100644
index 0000000..ca89214
--- /dev/null
+++ b/rtl/modexpng_mem.v
@@ -0,0 +1,93 @@
+//
+// TODO: Add license text!
+//
+
+//
+// modexpng_mem: simple dual-port memory with a write-only port A and a
+// read-only port B. Port B has two register stages: `b_en` loads the word
+// at b_addr into the first stage, `b_reg_en` moves it to the output stage.
+//
+module modexpng_mem #
+(
+ parameter MEM_WIDTH = 17,
+ parameter MEM_ADDR_BITS = 6
+)
+(
+ input clk,
+
+ input [MEM_ADDR_BITS-1:0] a_addr,
+ input a_en,
+ input a_wr,
+ input [MEM_WIDTH -1:0] a_in,
+ output [MEM_WIDTH -1:0] a_out,
+
+ input [MEM_ADDR_BITS-1:0] b_addr,
+ input b_en,
+ input b_reg_en,
+ output [MEM_WIDTH -1:0] b_out
+);
+
+
+ //
+ // BRAM
+ //
+ (* RAM_STYLE="BLOCK" *)
+ reg [MEM_WIDTH-1:0] bram[0:(2**MEM_ADDR_BITS)-1];
+
+
+ //
+ // Initialization for Simulation
+ //
+ /*
+ integer c;
+ initial begin
+ for (c=0; c<(2**MEM_ADDR_BITS); c=c+1)
+ bram[c] = {MEM_WIDTH{1'b0}};
+ end
+ */
+
+
+
+ //
+ // Output Registers
+ //
+ reg [MEM_WIDTH-1:0] bram_b;
+ reg [MEM_WIDTH-1:0] bram_b_reg;
+
+ // port A never reads the memory: a_out is driven with a fixed marker value
+ // NOTE(review): the 32-bit constant is truncated (or zero-extended) to
+ // MEM_WIDTH bits, with the default MEM_WIDTH = 17 only the low 17 bits remain
+ assign a_out = 32'hDEADCE11;
+ assign b_out = bram_b_reg;
+
+
+ //
+ // Note that when both ports access the same location, a conflict can
+ // potentially arise. See Xilinx UG473 (pages 19-20, "Conflict
+ // Avoidance") for more information. In our configuration to avoid that the
+ // write port must be coded to operate in READ_FIRST mode. If the write
+ // port is overwriting the same address the read port is accessing, the
+ // write port must read the previously stored data (not the data it is
+ // writing, as that would be WRITE_FIRST mode).
+ //
+
+
+ //
+ // Write-Only Port A
+ //
+ // a write happens only when both a_en and a_wr are asserted
+ //
+ always @(posedge clk)
+ //
+ if (a_en)
+ //
+ if (a_wr) bram[a_addr] <= a_in;
+
+
+ //
+ // Read-Only Port B
+ //
+ // first stage: synchronous read, second stage: output register
+ //
+ always @(posedge clk)
+ //
+ if (b_en)
+ //
+ bram_b <= bram[b_addr];
+
+ always @(posedge clk)
+ //
+ if (b_reg_en)
+ //
+ bram_b_reg <= bram_b;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_col_index.v b/rtl/modexpng_mmm_col_index.v
new file mode 100644
index 0000000..b904795
--- /dev/null
+++ b/rtl/modexpng_mmm_col_index.v
@@ -0,0 +1,90 @@
+//
+// modexpng_mmm_col_index: column counter of the multiplier.
+//
+// The counter is cleared when the FSM is about to enter COL_0_TRIG (the
+// index of the last column is latched from the upper bits of index_last at
+// the same time) and is incremented when the FSM is about to enter
+// COL_N_TRIG. In every other state the counter holds its value.
+//
+module modexpng_mmm_col_index
+(
+ clk,
+ index_last,
+ fsm_state_next,
+ col_index,
+ col_index_done,
+ col_index_zero,
+ col_index_next,
+ col_index_prev
+);
+
+
+ //
+ // Includes
+ //
+ //`include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ output [ INDEX_WIDTH-4:0] col_index;
+ output col_index_done;
+ output [ INDEX_WIDTH-4:0] col_index_zero;
+ output [ INDEX_WIDTH-4:0] col_index_next;
+ output [ INDEX_WIDTH-4:0] col_index_prev;
+
+
+ //
+ // State: current column, last column, one-clock-delayed current column
+ //
+ reg [INDEX_WIDTH-4:0] col_index_reg;
+ reg [INDEX_WIDTH-4:0] col_index_last;
+ reg [INDEX_WIDTH-4:0] col_index_dly;
+
+
+ //
+ // Decoded Triggers
+ //
+ wire trig_col_0 = (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_0_TRIG);
+ wire trig_col_n = (fsm_state_next == FSM_STATE_MULT_SQUARE_COL_N_TRIG);
+
+
+ //
+ // Outputs
+ //
+ assign col_index_zero = {(INDEX_WIDTH-3){1'b0}};
+ assign col_index = col_index_reg;
+ assign col_index_next = col_index_reg + 1'b1;
+ assign col_index_prev = col_index_dly;
+ assign col_index_done = (col_index_reg == col_index_last);
+
+
+ //
+ // Counter: clear (and latch the last column) on the first column trigger,
+ // increment on every subsequent column trigger, hold otherwise
+ //
+ always @(posedge clk)
+ //
+ if (trig_col_0) begin
+ col_index_last <= index_last[INDEX_WIDTH-1:3];
+ col_index_reg <= col_index_zero;
+ end else if (trig_col_n)
+ col_index_reg <= col_index_next;
+
+
+ //
+ // Previous Value
+ //
+ always @(posedge clk)
+ //
+ col_index_dly <= col_index_reg;
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_din_addr.v b/rtl/modexpng_mmm_din_addr.v
new file mode 100644
index 0000000..565c7e0
--- /dev/null
+++ b/rtl/modexpng_mmm_din_addr.v
@@ -0,0 +1,167 @@
+//
+// modexpng_mmm_din_addr: read address generator for the multiplier inputs.
+//
+// All registers are updated according to the state the FSM is about to
+// enter (fsm_state_next):
+//  * COL_0_TRIG - the address is set to the start of column zero, the cycle
+//                 counter is cleared and index_last is latched
+//  * COL_N_TRIG - the address is set to the start of the next column and
+//                 the cycle counter is cleared
+//  * *_BUSY     - the address steps down by one, wrapping from zero to the
+//                 latched last index; the cycle counter steps up by one
+// Only the upper bits of the address are exported (din_addr), the lower
+// three bits are not used outside of this module.
+//
+module modexpng_mmm_din_addr
+(
+ clk, rst_n,
+ index_last,
+ fsm_state_next,
+ col_index_zero, col_index_next,
+ din_addr, din_bank, din_ena, din_reg_ena,
+ din_addr_cnt, din_addr_cnt_last,
+ din_addr_cnt_lower_prev, din_addr_cnt_upper_prev
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ input [ INDEX_WIDTH-4:0] col_index_zero;
+ input [ INDEX_WIDTH-4:0] col_index_next;
+ output [ INDEX_WIDTH-4:0] din_addr;
+ output [ 3-1:0] din_bank;
+ output [ 1-1:0] din_ena;
+ output [ 1-1:0] din_reg_ena;
+ output [ INDEX_WIDTH-1:0] din_addr_cnt;
+ output [ INDEX_WIDTH-1:0] din_addr_cnt_last;
+ output [ 3-1:0] din_addr_cnt_lower_prev;
+ output [ INDEX_WIDTH-4:0] din_addr_cnt_upper_prev;
+
+
+ //
+ // Address
+ //
+ // din_addr_prev is the "step down with wrap-around" successor of the
+ // current address
+ //
+ reg [INDEX_WIDTH-1:0] din_addr_reg;
+ wire [INDEX_WIDTH-1:0] din_addr_zero = {INDEX_WIDTH{1'b0}};
+ reg [INDEX_WIDTH-1:0] din_addr_last;
+ wire [INDEX_WIDTH-1:0] din_addr_prev = (din_addr_reg == din_addr_zero) ? din_addr_last : din_addr_reg - 1'b1;
+
+ reg [INDEX_WIDTH-1:0] din_addr_cnt_reg;
+ wire [INDEX_WIDTH-1:0] din_addr_cnt_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] din_addr_cnt_next = din_addr_cnt_reg + 1'b1;
+ reg [INDEX_WIDTH-1:0] din_addr_cnt_last_reg;
+ wire [ 3-1:0] din_addr_cnt_lower = din_addr_cnt_reg[ 3-1:0];
+ wire [INDEX_WIDTH-4:0] din_addr_cnt_upper = din_addr_cnt_reg[INDEX_WIDTH-1:3];
+ reg [ 3-1:0] din_addr_cnt_lower_dly;
+ reg [INDEX_WIDTH-4:0] din_addr_cnt_upper_dly;
+
+ reg [ 3-1:0] din_bank_reg;
+
+
+ //
+ // Enables
+ //
+ // din_ena is active in the TRIG and BUSY states, din_reg_ena is din_ena
+ // delayed by one clock
+ //
+ reg din_ena_reg = 1'b0;
+ reg din_reg_ena_reg = 1'b0;
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (!rst_n)
+ din_ena_reg <= 1'b0;
+ else case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ din_ena_reg <= 1'b1;
+ //
+ default:
+ din_ena_reg <= 1'b0;
+ //
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ //
+ if (!rst_n)
+ din_reg_ena_reg <= 1'b0;
+ else
+ din_reg_ena_reg <= din_ena_reg;
+
+
+ //
+ // Address Mapping
+ //
+ assign din_addr = din_addr_reg[INDEX_WIDTH-1:3];
+
+ assign din_addr_cnt = din_addr_cnt_reg;
+ assign din_addr_cnt_last = din_addr_cnt_last_reg;
+ assign din_addr_cnt_lower_prev = din_addr_cnt_lower_dly;
+ assign din_addr_cnt_upper_prev = din_addr_cnt_upper_dly;
+
+ assign din_bank = din_bank_reg;
+
+
+ //
+ // Enable Mapping
+ //
+ assign din_ena = din_ena_reg;
+ assign din_reg_ena = din_reg_ena_reg;
+
+
+ //
+ // Delay
+ //
+ always @(posedge clk) begin
+ din_addr_cnt_lower_dly <= din_addr_cnt_lower;
+ din_addr_cnt_upper_dly <= din_addr_cnt_upper;
+ end
+
+
+ //
+ // Address and Counter Update
+ //
+ // no default branch: the registers hold their values in all other states
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: begin
+ din_addr_reg <= {col_index_zero, {3{1'b0}}};
+ din_addr_last <= index_last;
+ din_addr_cnt_reg <= din_addr_cnt_zero;
+ din_addr_cnt_last_reg <= index_last;
+ end
+ //
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: begin
+ din_addr_reg <= {col_index_next, {3{1'b0}}};
+ din_addr_cnt_reg <= din_addr_cnt_zero;
+ end
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+ din_addr_reg <= din_addr_prev;
+ din_addr_cnt_reg <= din_addr_cnt_next;
+ end
+ //
+ //default:
+ //
+ endcase
+
+
+ //
+ // Bank Select
+ //
+ // non-blocking assignments are required here: din_bank_reg is a clocked
+ // register whose value is sampled by other clocked processes, a blocking
+ // assignment would make the sampled value depend on simulator event order
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ din_bank_reg <= BANK_XY_T1T2;
+ //
+ default:
+ din_bank_reg <= BANK_XY_ANY;
+ //
+ endcase
+
+endmodule
diff --git a/rtl/modexpng_mmm_dout_addr.v b/rtl/modexpng_mmm_dout_addr.v
new file mode 100644
index 0000000..3749d82
--- /dev/null
+++ b/rtl/modexpng_mmm_dout_addr.v
@@ -0,0 +1,167 @@
+//
+// modexpng_mmm_dout_addr: write address, per-memory write enable and bank
+// select generator for the X and Y outputs during the LOAD_T1T2 and
+// LOAD_NN_COEFF phases. All outputs are registered and are derived from the
+// current FSM state (fsm_state).
+//
+module modexpng_mmm_dout_addr
+(
+ clk, rst_n,
+ //index_last,
+ fsm_state,
+ load_xy_addr,
+ load_addr_zero,
+ load_nn_coeff_addr_done,
+ /*
+
+ col_index_zero, col_index_next,*/
+ x_dout_addr, y_dout_addr,
+ x_dout_ena, y_dout_ena,
+ x_dout_bank, y_dout_bank
+
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ //input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+ input [INDEX_WIDTH:0] load_xy_addr; // address
+ input load_addr_zero;
+ input load_nn_coeff_addr_done;
+ //input [ INDEX_WIDTH-4:0] col_index_zero;
+ //input [ INDEX_WIDTH-4:0] col_index_next;
+ output [INDEX_WIDTH-4:0] x_dout_addr;
+ output [INDEX_WIDTH-4:0] y_dout_addr;
+
+ output [NUM_MULTS-1:0] x_dout_ena;
+ output [NUM_MULTS-1:0] y_dout_ena;
+
+ output [3-1:0] x_dout_bank;
+ output [3-1:0] y_dout_bank;
+
+
+ //
+ // Registers
+ //
+ reg [INDEX_WIDTH-4:0] x_dout_addr_reg; //clog2
+ reg [INDEX_WIDTH-4:0] y_dout_addr_reg; //clog2
+
+ reg [NUM_MULTS-1:0] x_dout_ena_reg = {NUM_MULTS{1'b0}};
+ reg [NUM_MULTS-1:0] y_dout_ena_reg = {NUM_MULTS{1'b0}};
+
+ // one-hot enable patterns, prepared one state ahead of the output registers
+ reg [NUM_MULTS-1:0] x_dout_ena_int;
+ reg [NUM_MULTS-1:0] y_dout_ena_int;
+
+ reg [3-1:0] x_dout_bank_reg;
+ reg [3-1:0] y_dout_bank_reg;
+
+
+ //
+ // Mapping
+ //
+ assign x_dout_addr = x_dout_addr_reg;
+ assign y_dout_addr = y_dout_addr_reg;
+
+ assign x_dout_ena = x_dout_ena_reg;
+ assign y_dout_ena = y_dout_ena_reg;
+
+ assign x_dout_bank = x_dout_bank_reg;
+ assign y_dout_bank = y_dout_bank_reg;
+
+
+ //
+ // Write Address
+ //
+ // the address is taken from bits [INDEX_WIDTH-1:3] of load_xy_addr;
+ // in the LOAD_NN_COEFF phase the final word (load_nn_coeff_addr_done)
+ // is redirected to the fixed address BANK_XY_AUX_ADDR_N_COEFF;
+ // outside of the two load states the address is don't-care
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ x_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
+ y_dout_addr_reg <= load_xy_addr[INDEX_WIDTH-1:3];
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
+ y_dout_addr_reg <= !load_nn_coeff_addr_done ? load_xy_addr[INDEX_WIDTH-1:3] : BANK_XY_AUX_ADDR_N_COEFF[INDEX_WIDTH-4:0];
+ end
+ //
+ default: begin
+ x_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
+ y_dout_addr_reg <= {INDEX_WIDTH-3{1'bX}};
+ end
+ //
+ endcase
+
+ // initial one-hot pattern: only bit 0 set
+ wire [NUM_MULTS-1:0] load_xy_ena_init = {{NUM_MULTS-1{1'b0}}, 1'b1};
+
+ //
+ // Enable Pattern
+ //
+ // the pattern is reset to bit 0 when the load address is zero and is
+ // rotated left by one position otherwise; no default branch, the
+ // pattern holds its value in all other states
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_2: begin
+ x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1]};
+ y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+ end
+ //
+ // only the X pattern masks the wrapped-around bit with
+ // ~load_nn_coeff_addr_done, the Y pattern rotates unconditionally
+ // NOTE(review): presumably the last word exists only on the Y side -- confirm
+ FSM_STATE_LOAD_NN_COEFF_2: begin
+ x_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {x_dout_ena_int[NUM_MULTS-2:0], x_dout_ena_int[NUM_MULTS-1] & ~load_nn_coeff_addr_done};
+ y_dout_ena_int <= load_addr_zero ? load_xy_ena_init : {y_dout_ena_int[NUM_MULTS-2:0], y_dout_ena_int[NUM_MULTS-1]};
+ end
+ //
+ endcase
+
+
+ //
+ // Enable Output Registers
+ //
+ // the prepared pattern is driven out only in the *_3 states,
+ // all enables are cleared otherwise and on reset
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (!rst_n) begin
+ x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ end else case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3,
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_ena_reg <= x_dout_ena_int;
+ y_dout_ena_reg <= y_dout_ena_int;
+ end
+ //
+ default: begin
+ x_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ y_dout_ena_reg <= {NUM_MULTS{1'b0}};
+ end
+ //
+ endcase
+
+
+ //
+ // Bank Select
+ //
+ // in the LOAD_NN_COEFF phase the final word goes to BANK_XY_AUX on
+ // both sides instead of the regular N / N_COEFF banks
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ x_dout_bank_reg <= BANK_X_T1;
+ y_dout_bank_reg <= BANK_Y_T2;
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ x_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_X_N : BANK_XY_AUX;
+ y_dout_bank_reg <= !load_nn_coeff_addr_done ? BANK_Y_N_COEFF : BANK_XY_AUX;
+ end
+ //
+ default: begin
+ x_dout_bank_reg <= BANK_XY_ANY;
+ y_dout_bank_reg <= BANK_XY_ANY;
+ end
+ //
+ endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_fsm.vh b/rtl/modexpng_mmm_fsm.vh
new file mode 100644
index 0000000..c237a0b
--- /dev/null
+++ b/rtl/modexpng_mmm_fsm.vh
@@ -0,0 +1,24 @@
+//
+// FSM state encodings shared by the modules that include this header.
+// States are plain binary-encoded values held in FSM_STATE_WIDTH-bit vectors.
+//
+localparam FSM_STATE_WIDTH = 32;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_IDLE = 0;
+
+// load phase for T1/T2, three steps per word
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_1 = 1;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_2 = 2;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_T1T2_3 = 3;
+
+// load phase for N/N_COEFF, three steps per word
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_1 = 4;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_2 = 5;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_LOAD_NN_COEFF_3 = 6;
+
+// "square" multiplication, first column (values 7..10 are not assigned)
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_INIT = 11;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_TRIG = 12;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_0_BUSY = 13;
+
+// "square" multiplication, subsequent columns
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_INIT = 14;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_TRIG = 15;
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_COL_N_BUSY = 16;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_MULT_SQUARE_HOLDOFF = 17;
+
+localparam [FSM_STATE_WIDTH-1:0] FSM_STATE_STOP = 999;
+ \ No newline at end of file
diff --git a/rtl/modexpng_mmm_pad.v b/rtl/modexpng_mmm_pad.v
new file mode 100644
index 0000000..a2a21ff
--- /dev/null
+++ b/rtl/modexpng_mmm_pad.v
@@ -0,0 +1,153 @@
+//
+// modexpng_mmm_pad: pair of scratchpad memories (X and Y) that are filled
+// with the incoming load_x_din/load_y_din words during the LOAD_T1T2_3
+// state and can be read back through the pad_*_rd_* ports.
+//
+// The write address, data and enable are all registered, so a word
+// presented while the FSM is in LOAD_T1T2_3 is written one clock later.
+//
+module modexpng_mmm_pad
+(
+ clk, rst_n,
+ fsm_state,
+ load_xy_addr_lsb,
+ pad_x_rd_addr, pad_y_rd_addr,
+ pad_x_rd_ena, pad_y_rd_ena,
+ pad_x_rd_dout, pad_y_rd_dout,
+ load_x_din, load_y_din
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+
+ input [INDEX_WIDTH-1:0] load_xy_addr_lsb;
+
+ input [WORD_WIDTH-1:0] load_x_din;
+ input [WORD_WIDTH-1:0] load_y_din;
+
+ input [INDEX_WIDTH-1:0] pad_x_rd_addr;
+ input [INDEX_WIDTH-1:0] pad_y_rd_addr;
+
+ input pad_x_rd_ena;
+ input pad_y_rd_ena;
+
+ output [WORD_WIDTH-1:0] pad_x_rd_dout;
+ output [WORD_WIDTH-1:0] pad_y_rd_dout;
+
+
+ //
+ // Write Port Registers
+ //
+ reg [INDEX_WIDTH-1:0] pad_x_wr_addr;
+ reg [INDEX_WIDTH-1:0] pad_y_wr_addr;
+ reg pad_x_wr_ena;
+ reg pad_y_wr_ena;
+ reg [ WORD_WIDTH-1:0] pad_x_wr_din;
+ reg [ WORD_WIDTH-1:0] pad_y_wr_din;
+
+
+ //
+ // Decoded State
+ //
+ wire pad_wr_state = (fsm_state == FSM_STATE_LOAD_T1T2_3);
+
+
+ //
+ // Write Address: valid only in the write state, don't-care otherwise
+ //
+ always @(posedge clk)
+ //
+ if (pad_wr_state) begin
+ pad_x_wr_addr <= load_xy_addr_lsb;
+ pad_y_wr_addr <= load_xy_addr_lsb;
+ end else begin
+ pad_x_wr_addr <= {INDEX_WIDTH{1'bX}};
+ pad_y_wr_addr <= {INDEX_WIDTH{1'bX}};
+ end
+
+
+ //
+ // Write Data: the input words are registered on every clock regardless
+ // of the FSM state, the write enable decides whether they are stored
+ //
+ always @(posedge clk) begin
+ pad_x_wr_din <= load_x_din;
+ pad_y_wr_din <= load_y_din;
+ end
+
+
+ //
+ // Write Enable: asserted for the write state only, cleared on reset
+ //
+ always @(posedge clk or negedge rst_n)
+ //
+ if (!rst_n) begin
+ pad_x_wr_ena <= 1'b0;
+ pad_y_wr_ena <= 1'b0;
+ end else if (pad_wr_state) begin
+ pad_x_wr_ena <= 1'b1;
+ pad_y_wr_ena <= 1'b1;
+ end else begin
+ pad_x_wr_ena <= 1'b0;
+ pad_y_wr_ena <= 1'b0;
+ end
+
+
+ //
+ // X Scratchpad
+ //
+ bram_1wo_1ro_readfirst_ce #
+ (
+ .MEM_WIDTH (WORD_WIDTH),
+ .MEM_ADDR_BITS (INDEX_WIDTH)
+ )
+ pad_x
+ (
+ .clk (clk),
+
+ .a_addr (pad_x_wr_addr),
+ .a_en (pad_x_wr_ena),
+ .a_wr (pad_x_wr_ena),
+ .a_in (pad_x_wr_din),
+ .a_out (), // unused
+
+ .b_addr (pad_x_rd_addr),
+ .b_en (pad_x_rd_ena),
+ .b_out (pad_x_rd_dout)
+ );
+
+
+ //
+ // Y Scratchpad
+ //
+ bram_1wo_1ro_readfirst_ce #
+ (
+ .MEM_WIDTH (WORD_WIDTH),
+ .MEM_ADDR_BITS (INDEX_WIDTH)
+ )
+ pad_y
+ (
+ .clk (clk),
+
+ .a_addr (pad_y_wr_addr),
+ .a_en (pad_y_wr_ena),
+ .a_wr (pad_y_wr_ena),
+ .a_in (pad_y_wr_din),
+ .a_out (), // unused
+
+ .b_addr (pad_y_rd_addr),
+ .b_en (pad_y_rd_ena),
+ .b_out (pad_y_rd_dout)
+ );
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_transporter.v b/rtl/modexpng_mmm_transporter.v
new file mode 100644
index 0000000..a8f309a
--- /dev/null
+++ b/rtl/modexpng_mmm_transporter.v
@@ -0,0 +1,157 @@
+module modexpng_mmm_transporter
+(
+ clk,
+ ena,
+ index_last,
+ fsm_state,
+ fsm_state_next,
+ load_phase,
+ load_xy_addr,
+ load_xy_addr_vld,
+ load_xy_req,
+ load_addr_zero,
+ load_t1t2_addr_done,
+ load_nn_coeff_addr_done
+);
+
+ //
+ // Load address generator: follows the parent FSM through its
+ // LOAD_T1T2_* and LOAD_NN_COEFF_* states and produces the load
+ // address, the "address valid" and "data request" strobes, the
+ // load phase bit and three address comparison flags.
+ //
+
+ //
+ // Includes
+ //
+ //`include "modexpng_parameters.vh"
+ //`include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input ena;
+ input [ INDEX_WIDTH-1:0] index_last;
+ input [FSM_STATE_WIDTH-1:0] fsm_state;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ output load_phase;
+ output [ INDEX_WIDTH:0] load_xy_addr;
+ output load_xy_addr_vld;
+ output load_xy_req;
+ output load_addr_zero;
+ output load_t1t2_addr_done;
+ output load_nn_coeff_addr_done;
+
+
+ //
+ // Output Registers
+ //
+ reg load_phase_reg;
+ reg [INDEX_WIDTH:0] load_xy_addr_reg;
+ reg load_xy_addr_vld_reg;
+ reg load_xy_req_reg;
+
+ assign load_phase = load_phase_reg;
+ assign load_xy_addr = load_xy_addr_reg;
+ assign load_xy_addr_vld = load_xy_addr_vld_reg;
+ assign load_xy_req = load_xy_req_reg;
+
+
+ //
+ // Address Constants and Successor
+ //
+ wire [INDEX_WIDTH:0] addr_all_zeros = {(INDEX_WIDTH+1){1'b0}};
+ wire [INDEX_WIDTH:0] addr_dont_care = {(INDEX_WIDTH+1){1'bX}};
+ wire [INDEX_WIDTH:0] addr_incremented = load_xy_addr_reg + 1'b1;
+
+
+ //
+ // Final Addresses (captured while ena is high in the idle state)
+ //
+ reg [INDEX_WIDTH:0] load_t1t2_addr_last;
+ reg [INDEX_WIDTH:0] load_nn_coeff_addr_last;
+
+ always @(posedge clk)
+ //
+ if (ena && (fsm_state == FSM_STATE_IDLE)) begin
+ load_t1t2_addr_last <= {1'b0, index_last};
+ load_nn_coeff_addr_last <= {1'b0, index_last} + 1'b1; // one word more than T1T2
+ end
+
+
+ //
+ // Comparison Flags
+ //
+ assign load_addr_zero = load_xy_addr_reg == addr_all_zeros;
+ assign load_t1t2_addr_done = load_xy_addr_reg == load_t1t2_addr_last;
+ assign load_nn_coeff_addr_done = load_xy_addr_reg == load_nn_coeff_addr_last;
+
+
+ //
+ // Phase, Address and Strobes
+ //
+ // All four registers are decoded from the upcoming FSM state. The
+ // address restarts from zero on entry to a *_1 state, unless that
+ // state is re-entered from the matching *_3 state, in which case it
+ // advances by one; it is left untouched in the *_2 and *_3 states.
+ //
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ FSM_STATE_LOAD_T1T2_1: begin
+ load_phase_reg <= 1'b0;
+ load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_T1T2_3) ? addr_incremented : addr_all_zeros;
+ load_xy_addr_vld_reg <= 1'b1;
+ load_xy_req_reg <= 1'b0;
+ end
+ //
+ FSM_STATE_LOAD_T1T2_2: begin
+ load_phase_reg <= 1'b0;
+ load_xy_addr_vld_reg <= 1'b0;
+ load_xy_req_reg <= 1'b1;
+ end
+ //
+ FSM_STATE_LOAD_T1T2_3: begin
+ load_phase_reg <= 1'b0;
+ load_xy_addr_vld_reg <= 1'b0;
+ load_xy_req_reg <= 1'b0;
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_1: begin
+ load_phase_reg <= 1'b1;
+ load_xy_addr_reg <= (fsm_state == FSM_STATE_LOAD_NN_COEFF_3) ? addr_incremented : addr_all_zeros;
+ load_xy_addr_vld_reg <= 1'b1;
+ load_xy_req_reg <= 1'b0;
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_2: begin
+ load_phase_reg <= 1'b1;
+ load_xy_addr_vld_reg <= 1'b0;
+ load_xy_req_reg <= 1'b1;
+ end
+ //
+ FSM_STATE_LOAD_NN_COEFF_3: begin
+ load_phase_reg <= 1'b1;
+ load_xy_addr_vld_reg <= 1'b0;
+ load_xy_req_reg <= 1'b0;
+ end
+ //
+ default: begin
+ load_phase_reg <= 1'bX;
+ load_xy_addr_reg <= addr_dont_care;
+ load_xy_addr_vld_reg <= 1'b0;
+ load_xy_req_reg <= 1'b0;
+ end
+ endcase
+
+
+endmodule
diff --git a/rtl/modexpng_mmm_x8_dual.v b/rtl/modexpng_mmm_x8_dual.v
new file mode 100644
index 0000000..99a37fa
--- /dev/null
+++ b/rtl/modexpng_mmm_x8_dual.v
@@ -0,0 +1,550 @@
+module modexpng_mmm_x8_dual
+(
+ clk, rst_n,
+ ena, rdy,
+ mode, transfer,
+ index_last,
+ x_din, y_din, x_dout, y_dout,
+ x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
+ x_din_ena, y_din_ena, x_dout_ena, y_dout_ena, x_din_reg_ena, y_din_reg_ena,
+ x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
+ load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
+ load_x_din, load_y_din
+);
+
+
+ //
+ // Includes
+ //
+ `include "modexpng_parameters.vh"
+ `include "modexpng_parameters_x8.vh"
+ `include "modexpng_mmm_fsm.vh"
+
+
+ //
+ // Parameters
+ //
+ parameter INDEX_WIDTH = 6;
+
+
+ //
+ // Ports
+ //
+ input clk;
+ input rst_n;
+
+ input ena;
+ output rdy;
+
+ input mode; // multiply: 0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
+ // load/unload: 0 = load, 1 = unload
+ input transfer; // 0 = multiply, 1 = load/unload
+
+ input [INDEX_WIDTH-1:0] index_last;
+
+ input [NUM_MULTS*WORD_WIDTH-1:0] x_din;
+ input [NUM_MULTS*WORD_WIDTH-1:0] y_din;
+ output [NUM_MULTS*WORD_WIDTH-1:0] x_dout;
+ output [NUM_MULTS*WORD_WIDTH-1:0] y_dout;
+
+ output [INDEX_WIDTH-4:0] x_din_addr;
+ output [INDEX_WIDTH-4:0] y_din_addr;
+ output [INDEX_WIDTH-4:0] x_dout_addr;
+ output [INDEX_WIDTH-4:0] y_dout_addr;
+
+ output [ 1-1:0] x_din_ena;
+ output [ 1-1:0] y_din_ena;
+ output [NUM_MULTS-1:0] x_dout_ena;
+ output [NUM_MULTS-1:0] y_dout_ena;
+ output [ 1-1:0] x_din_reg_ena;
+ output [ 1-1:0] y_din_reg_ena;
+
+ output [3-1:0] x_din_bank;
+ output [3-1:0] y_din_bank;
+ output [3-1:0] x_dout_bank;
+ output [3-1:0] y_dout_bank;
+
+ output load_phase; // 0 = T1, T2; 1 = N, N_COEFF
+ output [ INDEX_WIDTH:0] load_xy_addr; // address
+ output load_xy_addr_vld; // address valid
+ output load_xy_req; // data request
+
+ input [WORD_WIDTH-1:0] load_x_din; // data input
+ input [WORD_WIDTH-1:0] load_y_din; // data input
+
+
+ //
+ // FSM State and Next States
+ //
+ reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
+ reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
+
+
+ //
+ // FSM Idle Next State
+ //
+ always @*
+ // pick the state entered from idle; all four {transfer, mode} values are listed, so there is no default
+ case ({transfer, mode})
+ 2'b00,
+ 2'b01: fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
+ 2'b10: fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1;
+ 2'b11: fsm_state_after_idle = FSM_STATE_IDLE; // unload (per the port comments) currently just stays in idle
+ endcase
+
+
+ //
+ // Column Counter
+ //
+ wire [ INDEX_WIDTH-4:0] col_index;
+ wire col_index_done;
+ wire [ INDEX_WIDTH-4:0] col_index_zero;
+ wire [ INDEX_WIDTH-4:0] col_index_next;
+ wire [ INDEX_WIDTH-4:0] col_index_prev;
+
+ modexpng_mmm_col_index #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ mmm_col_index
+ (
+ .clk (clk),
+ .index_last (index_last),
+ .fsm_state_next (fsm_state_next),
+ .col_index (col_index),
+ .col_index_done (col_index_done),
+ .col_index_zero (col_index_zero),
+ .col_index_next (col_index_next),
+ .col_index_prev (col_index_prev)
+ );
+
+
+ //
+ // Load Address Generator
+ //
+ wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0];
+ wire load_addr_zero;
+ wire load_t1t2_addr_done;
+ wire load_nn_coeff_addr_done;
+
+ modexpng_mmm_transporter #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ transporter
+ (
+ .clk (clk),
+ .ena (ena),
+ .index_last (index_last),
+ .fsm_state (fsm_state),
+ .fsm_state_next (fsm_state_next),
+ .load_phase (load_phase),
+ .load_xy_addr (load_xy_addr),
+ .load_xy_addr_vld (load_xy_addr_vld),
+ .load_xy_req (load_xy_req),
+ .load_addr_zero (load_addr_zero),
+ .load_t1t2_addr_done (load_t1t2_addr_done),
+ .load_nn_coeff_addr_done (load_nn_coeff_addr_done)
+ );
+
+
+ //
+ // X, Y Address
+ //
+ wire [INDEX_WIDTH-1:0] x_din_addr_cnt;
+ wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last;
+ wire [ 3-1:0] x_din_addr_cnt_lower_prev;
+ wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev;
+
+ modexpng_mmm_din_addr #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ din_addr_x
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .index_last (index_last),
+ .fsm_state_next (fsm_state_next),
+ .col_index_zero (col_index_zero),
+ .col_index_next (col_index_next),
+ .din_addr (x_din_addr),
+ .din_bank (x_din_bank),
+ .din_ena (x_din_ena),
+ .din_reg_ena (x_din_reg_ena),
+ .din_addr_cnt (x_din_addr_cnt),
+ .din_addr_cnt_last (x_din_addr_cnt_last),
+ .din_addr_cnt_lower_prev (x_din_addr_cnt_lower_prev),
+ .din_addr_cnt_upper_prev (x_din_addr_cnt_upper_prev)
+ );
+
+ modexpng_mmm_dout_addr #
+ (
+ .INDEX_WIDTH(INDEX_WIDTH)
+ )
+ dout_addr_xy
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .fsm_state (fsm_state),
+ .load_xy_addr (load_xy_addr),
+ .load_addr_zero (load_addr_zero),
+ .load_nn_coeff_addr_done (load_nn_coeff_addr_done),
+ .x_dout_addr (x_dout_addr),
+ .y_dout_addr (y_dout_addr),
+ .x_dout_ena (x_dout_ena),
+ .y_dout_ena (y_dout_ena),
+ .x_dout_bank (x_dout_bank),
+ .y_dout_bank (y_dout_bank)
+ );
+
+
+ //
+ // Helper Memories ("Scratchpad")
+ //
+ reg [INDEX_WIDTH-1:0] pad_xy_rd_addr;
+ reg pad_xy_rd_ena = 1'b0;
+ wire [ WORD_WIDTH-1:0] pad_x_rd_dout;
+ wire [ WORD_WIDTH-1:0] pad_y_rd_dout;
+
+ wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}};
+ wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1;
+
+ modexpng_mmm_pad pad
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+ .fsm_state (fsm_state),
+ .load_xy_addr_lsb (load_xy_addr_lsb),
+ .load_x_din (load_x_din),
+ .load_y_din (load_y_din),
+ .pad_x_rd_addr (pad_xy_rd_addr),
+ .pad_y_rd_addr (pad_xy_rd_addr),
+ .pad_x_rd_ena (pad_xy_rd_ena),
+ .pad_y_rd_ena (pad_xy_rd_ena),
+ .pad_x_rd_dout (pad_x_rd_dout),
+ .pad_y_rd_dout (pad_y_rd_dout)
+ );
+
+
+ always @(posedge clk or negedge rst_n)
+ // scratchpad read enable: set whenever the upcoming state is one of the four MULT_SQUARE states
+ if (!rst_n) begin
+ pad_xy_rd_ena <= 1'b0;
+ end else case (fsm_state_next)
+
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ pad_xy_rd_ena <= 1'b1;
+
+ default:
+ pad_xy_rd_ena <= 1'b0;
+
+ endcase
+
+ always @(posedge clk)
+ // scratchpad read address: restart from zero on a TRIG state, count up in a BUSY state, don't care otherwise
+ case (fsm_state_next)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG:
+ pad_xy_rd_addr <= pad_xy_rd_addr_zero;
+
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY:
+ pad_xy_rd_addr <= pad_xy_rd_addr_next;
+
+ default:
+ pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};
+
+ endcase
+
+
+
+
+ //
+ // Flags
+ //
+
+ wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;
+
+ always @*
+ // state entered when a COL_N pass ends: FSM_STATE_STOP once col_index_done is set, otherwise another COL_N pass
+ fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;
+
+
+ //
+ // MAC Arrays
+ //
+ reg mac_x_ce = 1'b0;
+ reg mac_x_ce_aux = 1'b0;
+ reg [NUM_MULTS -1:0] mac_x_clr;
+ reg mac_x_clr_aux;
+ reg [NUM_MULTS -2:0] mac_x_casc_a;
+ reg mac_x_casc_a_aux;
+ wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
+ reg [ 1 * WORD_WIDTH -1:0] mac_x_a_aux;
+ //wire [ 1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1];
+ reg [ 1 * WORD_WIDTH -1:0] mac_x_b;
+ wire [NUM_MULTS * MAC_WIDTH -1:0] mac_x_p;
+ wire [ 1 * MAC_WIDTH -1:0] mac_x_p_aux;
+
+ reg mac_y_ce = 1'b0;
+ reg mac_y_ce_aux = 1'b0;
+ reg [NUM_MULTS -1:0] mac_y_clr;
+ reg mac_y_clr_aux;
+ reg [NUM_MULTS -2:0] mac_y_casc_a;
+ reg mac_y_casc_a_aux;
+ wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
+ reg [ 1 * WORD_WIDTH -1:0] mac_y_a_aux;
+ //wire [ 1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1];
+ reg [ 1 * WORD_WIDTH -1:0] mac_y_b;
+ wire [NUM_MULTS * MAC_WIDTH -1:0] mac_y_p;
+ wire [ 1 * MAC_WIDTH -1:0] mac_y_p_aux;
+
+ modexpng_mac_array mac_array_x
+ (
+ .clk (clk),
+ .ce (mac_x_ce),
+ .ce_aux (mac_x_ce_aux),
+ .clr (mac_x_clr),
+ .clr_aux (mac_x_clr_aux),
+ .casc_a (mac_x_casc_a),
+ .casc_a_aux (mac_x_casc_a_aux),
+ .a_in (mac_x_a),
+ .a_in_aux (mac_x_a_aux),
+ .b_in (mac_x_b),
+ .p_out (mac_x_p),
+ .p_out_aux (mac_x_p_aux)
+ );
+
+ modexpng_mac_array mac_array_y
+ (
+ .clk (clk),
+ .ce (mac_y_ce),
+ .ce_aux (mac_y_ce_aux),
+ .clr (mac_y_clr),
+ .clr_aux (mac_y_clr_aux),
+ .casc_a (mac_y_casc_a),
+ .casc_a_aux (mac_y_casc_a_aux),
+ .a_in (mac_y_a),
+ .a_in_aux (mac_y_a_aux),
+ .b_in (mac_y_b),
+ .p_out (mac_y_p),
+ .p_out_aux (mac_y_p_aux)
+ );
+
+ genvar gen_z;
+ // NOTE(review): only mac_x_a is driven here; mac_y_a has no driver anywhere in this module - confirm this is intentional
+ generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+ begin : gen_xy_din
+ //
+ // word gen_z of the flat x_din bus is passed unchanged to
+ // word gen_z of mac_x_a (the .a_in port of mac_array_x)
+ assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
+
+ //assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+ //assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+ end
+ endgenerate
+
+
+ //
+ // MAC Clock Enable Logic
+ //
+ reg mac_xy_ce_adv = 1'b0;
+
+ always @(posedge clk or negedge rst_n)
+ // first stage: set while the FSM is in any of the four MULT_SQUARE states
+ if (rst_n == 1'b0) mac_xy_ce_adv <= 1'b0;
+ else case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_ce_adv <= 1'b1;
+ default: mac_xy_ce_adv <= 1'b0;
+ endcase
+
+ always @(posedge clk or negedge rst_n)
+ // second stage: both arrays get the same enable, one clock after mac_xy_ce_adv
+ if (rst_n == 1'b0) {mac_y_ce, mac_x_ce} <= 2'b00;
+ else {mac_y_ce, mac_x_ce} <= {2{mac_xy_ce_adv}};
+
+
+ //
+ // MAC Clear Logic
+ //
+ wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
+ calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev);
+
+ reg [NUM_MULTS-1:0] mac_xy_clr_adv;
+
+ always @(posedge clk)
+ // first stage: clear every multiplier in a TRIG state, use the computed mask in a BUSY state
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_clr_adv <= calc_mac_x_clear_square_value;
+ default: mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
+ endcase
+
+ always @(posedge clk)
+ // second stage: the same mask is registered for both arrays
+ {mac_y_clr, mac_x_clr} <= {2{mac_xy_clr_adv}};
+
+
+ //
+ // MAC Cascade Logic
+ //
+ reg [NUM_MULTS-2:0] mac_xy_casc_a_adv;
+
+ always @(posedge clk)
+ // first stage: cascade select is all zeros in a TRIG state and all ones in a BUSY state
+ case (fsm_state)
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
+ default: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
+ endcase
+
+ always @(posedge clk)
+ // second stage: the same select vector is registered for both arrays
+ {mac_y_casc_a, mac_x_casc_a} <= {2{mac_xy_casc_a_adv}};
+
+
+
+ //
+ // DOUT Registers (declared before the generate block that reads them)
+ //
+ reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
+ reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];
+
+
+ //
+ // DOUT Mapping: word gen_z of each flat output bus is driven by register gen_z
+ //
+ generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
+ begin : gen_xy_dout
+ assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
+ assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
+ end
+ endgenerate
+
+
+
+
+ integer int_z;
+ always @(posedge clk)
+ // in the third step of either load phase every output word takes the incoming load word; don't care otherwise
+ case (fsm_state)
+ //
+ FSM_STATE_LOAD_T1T2_3,
+ FSM_STATE_LOAD_NN_COEFF_3:
+ for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
+ x_dout_reg[int_z] <= load_x_din;
+ y_dout_reg[int_z] <= load_y_din;
+ end
+ //
+ default:
+ for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
+ x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+ y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
+ end
+ //
+ endcase
+
+
+
+ //
+ // FSM Process
+ //
+ always @(posedge clk or negedge rst_n)
+ // state register: asynchronous active-low reset to idle, otherwise follow fsm_state_next
+ if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
+ else fsm_state <= fsm_state_next;
+
+
+ //
+ // FSM Transition Logic
+ //
+ always @* begin
+ // fall back to idle for any state that has no entry in the case below
+ fsm_state_next = FSM_STATE_IDLE;
+ //
+ case (fsm_state)
+ FSM_STATE_IDLE: fsm_state_next = ena ? fsm_state_after_idle : FSM_STATE_IDLE;
+
+ FSM_STATE_LOAD_T1T2_1: fsm_state_next = FSM_STATE_LOAD_T1T2_2 ;
+ FSM_STATE_LOAD_T1T2_2: fsm_state_next = FSM_STATE_LOAD_T1T2_3 ;
+ FSM_STATE_LOAD_T1T2_3: fsm_state_next = load_t1t2_addr_done ? FSM_STATE_LOAD_NN_COEFF_1 : FSM_STATE_LOAD_T1T2_1;
+
+ FSM_STATE_LOAD_NN_COEFF_1: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2 ;
+ FSM_STATE_LOAD_NN_COEFF_2: fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3 ;
+ FSM_STATE_LOAD_NN_COEFF_3: fsm_state_next = load_nn_coeff_addr_done ? FSM_STATE_STOP : FSM_STATE_LOAD_NN_COEFF_1;
+
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_done ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY;
+
+ /*
+ FSM_STATE_TRIANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_0_BUSY ;
+ FSM_STATE_TRIANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG : FSM_STATE_TRIANGLE_COL_0_BUSY;
+ FSM_STATE_TRIANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_TRIANGLE_COL_N_BUSY ;
+ FSM_STATE_TRIANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle : FSM_STATE_TRIANGLE_COL_N_BUSY;
+
+ FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_0_BUSY ;
+ FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY;
+ FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_N_BUSY ;
+ FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_rectangle : FSM_STATE_RECTANGLE_COL_N_BUSY;
+ */
+
+ FSM_STATE_STOP: fsm_state_next = FSM_STATE_IDLE ;
+
+ endcase
+ //
+ end
+
+
+ //
+ // Ready Output
+ //
+ reg rdy_reg = 1'b1;
+ assign rdy = rdy_reg;
+
+ always @(posedge clk or negedge rst_n)
+ // rdy drops when ena is seen in idle and is raised again in the stop state; it holds in every other state
+ if (rst_n == 1'b0) rdy_reg <= 1'b1;
+ else case (fsm_state)
+ FSM_STATE_IDLE: if (ena) rdy_reg <= 1'b0;
+ FSM_STATE_STOP: rdy_reg <= 1'b1;
+ endcase
+
+ //
+ // One-hot MAC clear mask for the "square" phase: while the upper part
+ // of the operand address equals the column index, only the bit chosen
+ // by the lower three address bits is set; otherwise no bit is set.
+ // The mask is built NUM_MULTS bits wide instead of from 8-bit literals.
+ //
+ function [ NUM_MULTS-1:0] calc_mac_clear_square;
+ input [INDEX_WIDTH-4:0] col_index_delayed;
+ input [ 3-1:0] x_din_addr_cnt_lower_delayed;
+ input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed;
+ reg [ NUM_MULTS-1:0] lowest_bit_set;
+ begin
+ lowest_bit_set = {{(NUM_MULTS-1){1'b0}}, 1'b1};
+ if (x_din_addr_cnt_upper_delayed == col_index_delayed)
+ // shifting bit 0 left by the lower address bits gives the same one-hot codes
+ calc_mac_clear_square = lowest_bit_set << x_din_addr_cnt_lower_delayed;
+ else
+ calc_mac_clear_square = {NUM_MULTS{1'b0}};
+ end
+ endfunction
+
+
+endmodule
diff --git a/rtl/modexpng_parameters.vh b/rtl/modexpng_parameters.vh
new file mode 100644
index 0000000..f846119
--- /dev/null
+++ b/rtl/modexpng_parameters.vh
@@ -0,0 +1,39 @@
+//localparam WORD_WIDTH = 17;
+//localparam MAC_WIDTH = 47;
+
+//localparam BANK_ADDR_WIDTH = 3; // TODO: Replace everywhere!
+
+localparam [2:0] BANK_FAT_T1T2 = 3'd0; // BANK_FAT_*: 3-bit bank selectors, one value per bank
+localparam [2:0] BANK_FAT_ABL = 3'd1;
+localparam [2:0] BANK_FAT_ABH = 3'd2;
+localparam [2:0] BANK_FAT_Q = 3'd3;
+localparam [2:0] BANK_FAT_Q_EXT = 3'd4;
+localparam [2:0] BANK_FAT_ML = 3'd5;
+localparam [2:0] BANK_FAT_MH = 3'd6;
+localparam [2:0] BANK_FAT_MH_EXT = 3'd7;
+// BANK_SLIM_*: 2-bit bank selectors, one value per bank
+localparam [1:0] BANK_SLIM_T1T2 = 2'd0;
+localparam [1:0] BANK_SLIM_N = 2'd1;
+localparam [1:0] BANK_SLIM_N_COEFF = 2'd2;
+localparam [1:0] BANK_SLIM_N_COEFF_EXT = 2'd3;
+
+
+//localparam BANK_Y_T2 = 3'd0;
+//localparam BANK_XY_T1T2 = 3'd0;
+
+//localparam BANK_XY_AB_LSB = 3'd1;
+//localparam BANK_XY_AB_MSB = 3'd2;
+
+//localparam BANK_X_N = 3'd3;
+//localparam BANK_Y_N_COEFF = 3'd3;
+
+//localparam BANK_XY_M = 3'd4;
+
+//localparam BANK_XY_Q_LSB = 3'd5;
+//localparam BANK_XY_Q_MSB = 3'd6;
+
+//localparam BANK_XY_AUX = 3'd7;
+
+//localparam BANK_XY_ANY = 3'bXXX;
+
+//localparam BANK_XY_AUX_ADDR_N_COEFF = 0;
diff --git a/rtl/modexpng_parameters_x8.vh b/rtl/modexpng_parameters_x8.vh
new file mode 100644
index 0000000..8734354
--- /dev/null
+++ b/rtl/modexpng_parameters_x8.vh
@@ -0,0 +1 @@
+localparam NUM_MULTS = 8; // word count of the flat x/y data buses and width of the per-multiplier MAC control vectors
diff --git a/rtl/modexpng_part_recombinator.v b/rtl/modexpng_part_recombinator.v
new file mode 100644
index 0000000..db4774b
--- /dev/null
+++ b/rtl/modexpng_part_recombinator.v
@@ -0,0 +1,623 @@
+module modexpng_part_recombinator
+(
+ clk,
+ rdy,
+ fsm_state_next,
+ index_last,
+ dsp_x_ce_p, dsp_y_ce_p,
+ ena_x, ena_y,
+ dsp_x_p, dsp_y_p,
+ col_index, col_index_last, slim_bram_xy_addr,
+ fat_bram_xy_bank, fat_bram_xy_addr, fat_bram_x_dout, fat_bram_y_dout, fat_bram_xy_dout_valid
+);
+
+
+ //
+ // Headers
+ //
+ `include "../rtl/modexpng_mmm_fsm.vh"
+ `include "../rtl/modexpng_parameters.vh"
+ `include "../rtl/modexpng_parameters_x8.vh"
+
+
+ input clk;
+ output rdy;
+ input [FSM_STATE_WIDTH-1:0] fsm_state_next;
+ input [7:0] index_last;
+ input dsp_x_ce_p;
+ input dsp_y_ce_p;
+ input ena_x;
+ input ena_y;
+ input [8*47-1:0] dsp_x_p;
+ input [8*47-1:0] dsp_y_p;
+ input [ 4:0] col_index;
+ input [ 4:0] col_index_last;
+ input [ 7:0] slim_bram_xy_addr;
+
+ output [ 2:0] fat_bram_xy_bank;
+ output [ 7:0] fat_bram_xy_addr;
+ output [ 17:0] fat_bram_x_dout;
+ output [ 17:0] fat_bram_y_dout;
+ output fat_bram_xy_dout_valid;
+
+
+ //
+ // Latches
+ //
+ reg [1*47-1:0] dsp_x_p_latch[0:7];
+ reg [1*47-1:0] dsp_y_p_latch[0:7];
+
+
+ //
+ // Mapping
+ //
+ wire [46:0] dsp_x_p_split[0:7];
+ wire [46:0] dsp_y_p_split[0:7];
+
+ genvar z;
+ generate for (z=0; z<NUM_MULTS; z=z+1)
+ begin : gen_dsp_xy_p_split
+ assign dsp_x_p_split[z] = dsp_x_p[47*z+:47];
+ assign dsp_y_p_split[z] = dsp_y_p[47*z+:47];
+ end
+ endgenerate
+
+
+ //
+ // Delays
+ //
+ reg dsp_y_ce_p_dly1 = 1'b0;
+ reg dsp_x_ce_p_dly1 = 1'b0;
+
+ always @(posedge clk) begin
+ //
+ {dsp_y_ce_p_dly1, dsp_x_ce_p_dly1} <= {dsp_y_ce_p, dsp_x_ce_p};
+ //
+ end
+
+
+ //
+ // Registers
+ //
+
+ // valid
+ reg x_valid_lsb = 1'b0;
+ reg y_valid_lsb = 1'b0;
+ reg x_valid_msb = 1'b0;
+ reg y_valid_msb = 1'b0;
+
+ // bitmap
+ reg [7:0] x_bitmap_lsb = {8{1'b0}};
+ reg [7:0] y_bitmap_lsb = {8{1'b0}};
+ reg [7:0] x_bitmap_msb = {8{1'b0}};
+ reg [7:0] y_bitmap_msb = {8{1'b0}};
+
+ // index
+ reg [2:0] x_index_lsb = 3'dX;
+ reg [2:0] y_index_lsb = 3'dX;
+
+ // purge
+ reg x_purge_lsb = 1'b0;
+ reg y_purge_lsb = 1'b0;
+ reg x_purge_msb = 1'b0;
+ reg y_purge_msb = 1'b0;
+
+ // valid - latch
+ reg x_valid_latch_lsb = 1'b0;
+ reg y_valid_latch_lsb = 1'b0;
+
+ // bitmap - latch
+ reg [7:0] x_bitmap_latch_lsb = {8{1'b0}};
+ reg [7:0] y_bitmap_latch_lsb = {8{1'b0}};
+ reg [7:0] x_bitmap_latch_msb = {8{1'b0}};
+ reg [7:0] y_bitmap_latch_msb = {8{1'b0}};
+
+ // index - latch
+ reg [2:0] x_index_latch_lsb = 3'dX;
+ reg [2:0] y_index_latch_lsb = 3'dX;
+
+ // purge - index
+ reg x_purge_latch_lsb = 1'b0;
+ reg y_purge_latch_lsb = 1'b0;
+ reg x_purge_latch_msb = 1'b0;
+ reg y_purge_latch_msb = 1'b0;
+
+ //
+ reg xy_valid_lsb_adv[1:6];
+ reg xy_valid_msb_adv[1:6];
+ reg [7:0] xy_bitmap_lsb_adv[1:6];
+ reg [7:0] xy_bitmap_msb_adv[1:6];
+ reg [2:0] xy_index_lsb_adv[1:6];
+ reg [2:0] xy_index_msb_adv[1:6];
+ reg xy_purge_lsb_adv[1:6];
+ reg xy_purge_msb_adv[1:6];
+
+
+ integer i;
+ initial for (i=1; i<=6; i=i+1) begin // cover the whole [1:6] range, so stage 6 does not shift an X into stage 5 on the first clock
+ xy_valid_lsb_adv[i] = 1'b0;
+ xy_valid_msb_adv[i] = 1'b0;
+ xy_bitmap_lsb_adv[i] = {8{1'b0}};
+ xy_bitmap_msb_adv[i] = {8{1'b0}};
+ xy_index_lsb_adv[i] = 3'dX;
+ xy_index_msb_adv[i] = 3'dX;
+ xy_purge_lsb_adv[i] = 1'b0;
+ xy_purge_msb_adv[i] = 1'b0;
+ end
+
+ function [0:0] calc_square_valid_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value; // not used by this function
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ // 1 when the upper five address bits equal the column index, 0 otherwise
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ calc_square_valid_lsb = 1'b1;
+ else
+ calc_square_valid_lsb = 1'b0;
+ //
+ end
+ endfunction
+
+ function [7:0] calc_square_bitmap_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value; // not used by this function
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ // when the upper five address bits equal the column index ...
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ // ... return a one-hot byte whose set bit is selected by the lower three address bits
+ case (slim_bram_xy_addr_value[2:0])
+ 3'b000: calc_square_bitmap_lsb = 8'b00000001;
+ 3'b001: calc_square_bitmap_lsb = 8'b00000010;
+ 3'b010: calc_square_bitmap_lsb = 8'b00000100;
+ 3'b011: calc_square_bitmap_lsb = 8'b00001000;
+ 3'b100: calc_square_bitmap_lsb = 8'b00010000;
+ 3'b101: calc_square_bitmap_lsb = 8'b00100000;
+ 3'b110: calc_square_bitmap_lsb = 8'b01000000;
+ 3'b111: calc_square_bitmap_lsb = 8'b10000000;
+ endcase
+ // otherwise no bit is set
+ else
+ calc_square_bitmap_lsb = {8{1'b0}};
+ //
+ end
+ endfunction
+
+ function [2:0] calc_square_index_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value; // not used by this function
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ // when the upper five address bits equal the column index ...
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ // ... return the lower three address bits as the index
+ case (slim_bram_xy_addr_value[2:0])
+ 3'b000: calc_square_index_lsb = 3'd0;
+ 3'b001: calc_square_index_lsb = 3'd1;
+ 3'b010: calc_square_index_lsb = 3'd2;
+ 3'b011: calc_square_index_lsb = 3'd3;
+ 3'b100: calc_square_index_lsb = 3'd4;
+ 3'b101: calc_square_index_lsb = 3'd5;
+ 3'b110: calc_square_index_lsb = 3'd6;
+ 3'b111: calc_square_index_lsb = 3'd7;
+ endcase
+ // otherwise the index is a don't care
+ else
+ calc_square_index_lsb = 3'dX;
+ //
+ end
+ endfunction
+
+ function calc_square_purge_lsb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ begin
+ // set only when the upper five address bits equal both the column index and the last column index
+ if (slim_bram_xy_addr_value[7:3] == col_index_value)
+ calc_square_purge_lsb = slim_bram_xy_addr_value[7:3] == col_index_last_value;
+ else
+ calc_square_purge_lsb = 1'b0;
+ //
+ end
+ endfunction
+
+ function calc_square_valid_msb;
+ input [4:0] col_index_value; // not used by this function
+ input [4:0] col_index_last_value; // not used by this function
+ input [7:0] slim_bram_xy_addr_value;
+ input [7:0] index_last_value;
+ begin
+ // 1 when the full address equals index_last, 0 otherwise
+ if (slim_bram_xy_addr_value == index_last_value)
+ calc_square_valid_msb = 1'b1;
+ else
+ calc_square_valid_msb = 1'b0;
+ //
+ end
+ endfunction
+
+ function [7:0] calc_square_bitmap_msb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ input [7:0] index_last_value;
+ begin
+ // at the last address bits 6:0 are all set and bit 7 is set unless this is the last column; zero elsewhere
+ if (slim_bram_xy_addr_value == index_last_value) begin
+ calc_square_bitmap_msb[7] = col_index_value != col_index_last_value;
+ calc_square_bitmap_msb[6:0] = 7'b1111111;
+ end else
+ calc_square_bitmap_msb[7:0] = 8'b00000000;
+ //
+ end
+ endfunction
+
+ function calc_square_purge_msb;
+ input [4:0] col_index_value;
+ input [4:0] col_index_last_value;
+ input [7:0] slim_bram_xy_addr_value;
+ input [7:0] index_last_value;
+ begin
+ // set only at the last address of the last column
+ if (slim_bram_xy_addr_value == index_last_value)
+ calc_square_purge_msb = col_index_value == col_index_last_value;
+ else
+ calc_square_purge_msb = 1'b0;
+ //
+ end
+ endfunction
+
+
+ reg recomb_lsb_ce = 1'b0;
+ reg [ 2:0] recomb_lsb_ce_purge = 3'b000;
+ wire recomb_lsb_ce_combined = recomb_lsb_ce | recomb_lsb_ce_purge[0];
+ reg recomb_lsb_clr;
+ reg recomb_lsb_vld = 1'b0;
+
+ reg [46:0] recomb_lsb_din;
+ wire [15:0] recomb_lsb_dout;
+
+ reg recomb_msb_ce = 1'b0;
+ reg [ 1:0] recomb_msb_ce_purge = 2'b00;
+ wire recomb_msb_ce_combined = recomb_msb_ce | recomb_msb_ce_purge[0];
+ reg recomb_msb_clr;
+ reg recomb_msb_vld = 1'b0;
+
+ always @(posedge clk)
+ //
+ {recomb_msb_vld, recomb_lsb_vld} <= {recomb_msb_ce_combined, recomb_lsb_ce_combined};
+
+ reg [46:0] recomb_msb_din;
+ wire [15:0] recomb_msb_dout;
+
+ modexpng_recombinator_block recomb_x_lsb
+ (
+ .clk (clk),
+ .ce (recomb_lsb_ce_combined),
+ .clr (recomb_lsb_clr),
+ .din (recomb_lsb_din),
+ .dout (recomb_lsb_dout)
+ );
+
+ modexpng_recombinator_block recomb_x_msb
+ (
+ .clk (clk),
+ .ce (recomb_msb_ce_combined),
+ .clr (recomb_msb_clr),
+ .din (recomb_msb_din),
+ .dout (recomb_msb_dout)
+ );
+
+ always @(posedge clk) begin
+ // regular clock enables, registered from the latched valid flag and bit 0 of the latched MSB bitmap
+ recomb_lsb_ce <= x_valid_latch_lsb;
+ recomb_msb_ce <= x_bitmap_latch_msb[0];
+ // LSB purge: load three extra enable bits, then shift one out per clock
+ if (x_purge_latch_lsb)
+ recomb_lsb_ce_purge <= 3'b111;
+ else
+ recomb_lsb_ce_purge <= {1'b0, recomb_lsb_ce_purge[2:1]};
+ // MSB purge: load two extra enable bits when bit 0 is the only one left of bits 1:0; nonblocking, like every other assignment here
+ if (x_purge_latch_msb && x_bitmap_latch_msb[0] && !x_bitmap_latch_msb[1])
+ recomb_msb_ce_purge <= 2'b11;
+ else
+ recomb_msb_ce_purge <= {1'b0, recomb_msb_ce_purge[1]};
+ //
+ end
+
+
+ always @(posedge clk)
+ //
+ if (ena_x & ena_y) begin
+ recomb_lsb_clr <= 1'b1;
+ recomb_msb_clr <= 1'b1;
+ end else begin
+ if (recomb_lsb_ce) recomb_lsb_clr <= 1'b0;
+ if (recomb_msb_ce) recomb_msb_clr <= 1'b0;
+ end
+
+ always @(posedge clk)
+ //
+ if (x_valid_latch_lsb)
+ recomb_lsb_din <= dsp_x_p_latch[x_index_latch_lsb];
+ else
+ recomb_lsb_din <= {47{1'b0}};
+
+ always @(posedge clk)
+ //
+ if (x_bitmap_latch_msb[0])
+ recomb_msb_din <= dsp_x_p_latch[0];
+ else
+ recomb_msb_din <= {47{1'b0}};
+
+
+ always @(posedge clk)
+ //
+ case (fsm_state_next)
+ //
+ FSM_STATE_MULT_SQUARE_COL_0_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_N_TRIG,
+ FSM_STATE_MULT_SQUARE_COL_0_BUSY,
+ FSM_STATE_MULT_SQUARE_COL_N_BUSY: begin
+ //
+ xy_valid_lsb_adv [6] <= calc_square_valid_lsb (col_index, col_index_last, slim_bram_xy_addr);
+ xy_bitmap_lsb_adv[6] <= calc_square_bitmap_lsb(col_index, col_index_last, slim_bram_xy_addr);
+ xy_index_lsb_adv [6] <= calc_square_index_lsb (col_index, col_index_last, slim_bram_xy_addr);
+ xy_purge_lsb_adv [6] <= calc_square_purge_lsb (col_index, col_index_last, slim_bram_xy_addr);
+ //
+ xy_valid_msb_adv [6] <= calc_square_valid_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+ xy_bitmap_msb_adv[6] <= calc_square_bitmap_msb(col_index, col_index_last, slim_bram_xy_addr, index_last);
+ xy_purge_msb_adv [6] <= calc_square_purge_msb (col_index, col_index_last, slim_bram_xy_addr, index_last);
+ //
+ end
+ //
+ default: begin
+ //
+ xy_valid_lsb_adv [6] <= 1'b0;
+ xy_bitmap_lsb_adv[6] <= {8{1'b0}};
+ xy_index_lsb_adv [6] <= 3'dX;
+ xy_purge_lsb_adv [6] <= 1'b0;
+ //
+ xy_valid_msb_adv [6] <= 1'b0;
+ xy_bitmap_msb_adv[6] <= {8{1'b0}};
+ xy_purge_msb_adv [6] <= 1'b0;
+ //
+ end
+ //
+ endcase
+
+
+ always @(posedge clk) begin
+ // LSB flags: stage 1 of the advance pipeline is copied to both the X and Y registers
+ {y_valid_lsb, x_valid_lsb} <= {2{xy_valid_lsb_adv [1]}};
+ {y_bitmap_lsb, x_bitmap_lsb} <= {2{xy_bitmap_lsb_adv[1]}};
+ {y_index_lsb, x_index_lsb} <= {2{xy_index_lsb_adv [1]}};
+ {y_purge_lsb, x_purge_lsb} <= {2{xy_purge_lsb_adv [1]}};
+ // LSB latches: the same flags, one clock later
+ {y_valid_latch_lsb, x_valid_latch_lsb} <= {y_valid_lsb, x_valid_lsb};
+ {y_bitmap_latch_lsb, x_bitmap_latch_lsb} <= {y_bitmap_lsb, x_bitmap_lsb};
+ {y_index_latch_lsb, x_index_latch_lsb} <= {y_index_lsb, x_index_lsb};
+ {y_purge_latch_lsb, x_purge_latch_lsb} <= {y_purge_lsb, x_purge_lsb};
+ // MSB flags: stage 1 of the advance pipeline is copied to both the X and Y registers
+ {y_valid_msb, x_valid_msb} <= {2{xy_valid_msb_adv[1]}};
+ {y_bitmap_msb, x_bitmap_msb} <= {2{xy_bitmap_msb_adv[1]}};
+ {y_purge_msb, x_purge_msb} <= {2{xy_purge_msb_adv[1]}};
+ // MSB latches (X only): loaded while x_valid_msb is set, otherwise the bitmap shifts right by one bit per clock
+ if (x_valid_msb) begin
+ x_bitmap_latch_msb <= x_bitmap_msb;
+ x_purge_latch_msb <= x_purge_msb;
+ end else begin
+ x_bitmap_latch_msb <= {1'b0, x_bitmap_latch_msb[7:1]};
+ end
+ //
+ // advance pipeline: stage i takes the value of stage i+1 for i = 1..5 (stage 6 is written elsewhere in this module)
+ for (i=1; i<6; i=i+1) begin
+ xy_valid_lsb_adv [i] <= xy_valid_lsb_adv [i+1];
+ xy_bitmap_lsb_adv[i] <= xy_bitmap_lsb_adv[i+1];
+ xy_index_lsb_adv [i] <= xy_index_lsb_adv [i+1];
+ xy_purge_lsb_adv [i] <= xy_purge_lsb_adv [i+1];
+ //
+ xy_valid_msb_adv [i] <= xy_valid_msb_adv [i+1];
+ xy_bitmap_msb_adv[i] <= xy_bitmap_msb_adv[i+1];
+ xy_purge_msb_adv [i] <= xy_purge_msb_adv [i+1];
+ end
+ //
+ end
+
+ always @(posedge clk)
+ // NOTE(review): only the X product latches are written; dsp_y_p_latch has no writer in this module - confirm
+ if (x_bitmap_latch_msb[1]) // only shift 7 times
+ // shift mode: entry i takes entry i+1 and the top entry becomes a don't care
+ for (i=0; i<8; i=i+1)
+ if (i < 7)
+ dsp_x_p_latch[i] <= dsp_x_p_latch[i+1];
+ else
+ dsp_x_p_latch[i] <= {47{1'bX}};
+ //
+ else if (dsp_x_ce_p_dly1)
+ // capture mode: store product word i when the LSB bitmap, or the MSB bitmap while x_valid_msb is set, selects it
+ for (i=0; i<8; i=i+1)
+ //
+ if (x_bitmap_lsb[i])
+ dsp_x_p_latch[i] <= dsp_x_p_split[i];
+ else if (x_valid_msb && x_bitmap_msb[i])
+ dsp_x_p_latch[i] <= dsp_x_p_split[i];
+
+ reg recomb_x_lsb_dout_valid = 1'b0;
+ reg recomb_x_msb_dout_valid = 1'b0;
+
+ always @(posedge clk) begin
+ recomb_x_lsb_dout_valid <= recomb_lsb_ce_combined;
+ recomb_x_msb_dout_valid <= recomb_msb_ce_combined;
+ end
+
+
+
+ reg [ 2:0] fat_bram_xy_bank_reg;
+ reg [ 7:0] fat_bram_xy_addr_reg;
+ reg [ 7:0] fat_bram_xy_cnt_lsb;
+ reg [ 7:0] fat_bram_xy_cnt_msb;
+ reg [17:0] fat_bram_x_dout_reg;
+ reg [17:0] fat_bram_y_dout_reg;
+ reg fat_bram_xy_dout_valid_reg = 1'b0;
+
+ reg [15:0] recomb_msb_dout_carry_0;
+ reg [15:0] recomb_msb_dout_carry_1;
+
+ reg [15:0] recomb_msb_dout_delay_0;
+ reg [15:0] recomb_msb_dout_delay_1;
+ reg [15:0] recomb_msb_dout_delay_2;
+
+ reg [ 7:0] recomb_msb_cnt_delay_0 = 8'd0;
+ reg [ 7:0] recomb_msb_cnt_delay_1 = 8'd0;
+ reg [ 7:0] recomb_msb_cnt_delay_2 = 8'd0;
+
+ assign fat_bram_xy_bank = fat_bram_xy_bank_reg;
+ assign fat_bram_xy_addr = fat_bram_xy_addr_reg;
+ assign fat_bram_x_dout = fat_bram_x_dout_reg;
+ assign fat_bram_y_dout = fat_bram_y_dout_reg;
+ assign fat_bram_xy_dout_valid = fat_bram_xy_dout_valid_reg;
+
+ reg rdy_reg = 1'b1;
+ reg rdy_adv = 1'b1;
+
+ assign rdy = rdy_reg;
+
+
+ always @(posedge clk)
+ //
+ if (ena_x & ena_y)
+ rdy_reg <= 1'b0;
+ else
+ rdy_reg <= rdy_adv;
+
+ always @(posedge clk)
+ // a new operation (ena_x and ena_y both high) drops rdy_adv and restarts both word counters
+ if (ena_x & ena_y) begin
+ rdy_adv <= 1'b0;
+ fat_bram_xy_cnt_lsb <= 8'd0;
+ fat_bram_xy_cnt_msb <= 8'd0;
+ end else begin
+ // otherwise act on which recombinator outputs are valid in this cycle
+ case ({recomb_x_msb_dout_valid, recomb_x_lsb_dout_valid})
+ //
+ 2'b00: begin
+ // neither output valid: flush the delayed MSB words while the oldest delayed count is non-zero
+ if (recomb_msb_cnt_delay_2 > 8'd0) begin
+ // rdy_adv is raised once the next delayed count is zero
+ rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0;
+ // advance the data delay line, feeding a don't care of the register's own width
+ recomb_msb_dout_delay_0 <= {16{1'bX}};
+ recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+ recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+ // advance the count delay line, feeding zero
+ recomb_msb_cnt_delay_0 <= 8'd0;
+ recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+ recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+ // write the oldest delayed word to bank ABH at its delayed count, zero-extended like the other branches
+ fat_bram_xy_bank_reg <= BANK_FAT_ABH;
+ fat_bram_xy_addr_reg <= recomb_msb_cnt_delay_2;
+ fat_bram_x_dout_reg <= {2'b00, recomb_msb_dout_delay_2};
+// fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ end else begin
+ // nothing to write in this cycle
+ fat_bram_xy_bank_reg <= 3'bXXX;
+ fat_bram_xy_addr_reg <= 8'hXX;
+ fat_bram_x_dout_reg <= {18{1'bX}};
+ fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b0;
+ //
+ end
+ //
+ end
+ //
+ 2'b01: begin
+ // only the LSB output is valid: write it to bank ABL at the LSB counter, then count up
+ fat_bram_xy_bank_reg <= BANK_FAT_ABL;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb;
+ fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout};
+// fat_bram_y_dout_reg
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+ //
+ end
+ //
+ 2'b10: begin
+ // only the MSB output is valid: the first two words go into the carry registers and are not written
+ if (fat_bram_xy_cnt_msb < 8'd2) begin
+ //
+ recomb_msb_dout_carry_0 <= recomb_msb_dout;
+ recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+ //
+ fat_bram_xy_bank_reg <= 3'bXXX;
+ fat_bram_xy_addr_reg <= 8'hXX;
+ fat_bram_x_dout_reg <= {18{1'bX}};
+ // fat_bram_y_dout_reg
+ fat_bram_xy_dout_valid_reg <= 1'b0;
+ //
+ end else begin
+ // later words are written to bank ABH at the MSB counter
+ fat_bram_xy_bank_reg <= BANK_FAT_ABH;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_msb;
+ fat_bram_x_dout_reg <= {2'b00, recomb_msb_dout};
+ // fat_bram_y_dout_reg
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ end
+ //
+ fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+ //
+ end
+ //
+ 2'b11: begin
+ // both outputs valid: at LSB count == index_last the LSB word goes to bank ABL and the LSB counter restarts
+ if (fat_bram_xy_cnt_lsb == index_last) begin
+ //
+ fat_bram_xy_bank_reg <= BANK_FAT_ABL;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb;
+ fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout};
+// fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ fat_bram_xy_cnt_lsb <= 8'd0;
+ //
+ end else begin
+ // otherwise the LSB word plus the older stashed carry word (17-bit sum) goes to bank ABH at the LSB counter
+ fat_bram_xy_bank_reg <= BANK_FAT_ABH;
+ fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb;
+ fat_bram_x_dout_reg <= {1'b0, {1'b0, recomb_lsb_dout} + {1'b0, recomb_msb_dout_carry_1}};
+// fat_bram_y_dout_reg <= {18{1'bX}};
+ fat_bram_xy_dout_valid_reg <= 1'b1;
+ //
+ fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1;
+ //
+ recomb_msb_dout_carry_0 <= {16{1'bX}};
+ recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0;
+ //
+ end
+ // in both cases the MSB word and its counter value enter the three-stage delay lines
+ recomb_msb_dout_delay_0 <= recomb_msb_dout;
+ recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0;
+ recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1;
+ //
+ recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb;
+ recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0;
+ recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1;
+ //
+ fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1;
+ //
+ end
+ //
+ endcase
+ //
+ end
+
+
+
+
+endmodule
diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v
new file mode 100644
index 0000000..efe0ac5
--- /dev/null
+++ b/rtl/modexpng_recombinator_block.v
@@ -0,0 +1,35 @@
+module modexpng_recombinator_block
+(
+ clk,
+ ce, clr,
+ din, dout
+);
+
+ input clk;
+ input ce;
+ input clr;
+ input [46:0] din;
+ output [15:0] dout;
+
+ // din split into a 16-bit low, a 16-bit middle and a 15-bit high field
+ wire [15:0] din_lo = din[15: 0];
+ wire [15:0] din_mid = din[31:16];
+ wire [14:0] din_hi = din[46:32]; // TODO: maybe determine more precise bound here
+
+ reg [14:0] z; // high field of the previously accepted din
+ reg [16:0] y; // middle field, plus the previous z unless clr
+ reg [17:0] x; // low field, plus the previous y and x[17:16] unless clr
+
+ assign dout = x[15:0];
+
+ wire [16:0] y_fresh = {1'b0, din_mid}; // zero-extended; also the value loaded on clr
+ wire [17:0] x_fresh = {2'b00, din_lo}; // zero-extended; also the value loaded on clr
+
+ always @(posedge clk)
+ if (ce) begin // all three registers hold while ce is low
+ z <= din_hi;
+ y <= clr ? y_fresh : y_fresh + {2'b00, z};
+ x <= clr ? x_fresh : x_fresh + {1'b0, y} + {{16{1'b0}}, x[17:16]};
+ end
+
+endmodule