From 29fb6afd018c601a2e0c7376656d5e37beb565d6 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Tue, 1 Oct 2019 15:01:43 +0300 Subject: Started working on the pipelined Montgomery modular multiplier. Currently can do the "square" part of the multiplication, i.e. compute the twice larger intermediate product AB = A * B. --- bench/tb_mmm_x8_dual.v | 327 +++++++++++++++++ bench/tb_square.v | 716 ++++++++++++++++++++++++++++++++++++++ rtl/dev/temp.txt | 384 ++++++++++++++++++++ rtl/dsp/dsp_array.v | 111 ++++++ rtl/dsp/dsp_slice.v | 125 +++++++ rtl/modexpng_mac.v | 54 +++ rtl/modexpng_mac_array.v | 116 ++++++ rtl/modexpng_mem.v | 93 +++++ rtl/modexpng_mmm_col_index.v | 90 +++++ rtl/modexpng_mmm_din_addr.v | 167 +++++++++ rtl/modexpng_mmm_dout_addr.v | 167 +++++++++ rtl/modexpng_mmm_fsm.vh | 24 ++ rtl/modexpng_mmm_pad.v | 153 ++++++++ rtl/modexpng_mmm_transporter.v | 157 +++++++++ rtl/modexpng_mmm_x8_dual.v | 550 +++++++++++++++++++++++++++++ rtl/modexpng_parameters.vh | 39 +++ rtl/modexpng_parameters_x8.vh | 1 + rtl/modexpng_part_recombinator.v | 623 +++++++++++++++++++++++++++++++++ rtl/modexpng_recombinator_block.v | 35 ++ 19 files changed, 3932 insertions(+) create mode 100644 bench/tb_mmm_x8_dual.v create mode 100644 bench/tb_square.v create mode 100644 rtl/dev/temp.txt create mode 100644 rtl/dsp/dsp_array.v create mode 100644 rtl/dsp/dsp_slice.v create mode 100644 rtl/modexpng_mac.v create mode 100644 rtl/modexpng_mac_array.v create mode 100644 rtl/modexpng_mem.v create mode 100644 rtl/modexpng_mmm_col_index.v create mode 100644 rtl/modexpng_mmm_din_addr.v create mode 100644 rtl/modexpng_mmm_dout_addr.v create mode 100644 rtl/modexpng_mmm_fsm.vh create mode 100644 rtl/modexpng_mmm_pad.v create mode 100644 rtl/modexpng_mmm_transporter.v create mode 100644 rtl/modexpng_mmm_x8_dual.v create mode 100644 rtl/modexpng_parameters.vh create mode 100644 rtl/modexpng_parameters_x8.vh create mode 100644 rtl/modexpng_part_recombinator.v create mode 100644 
rtl/modexpng_recombinator_block.v diff --git a/bench/tb_mmm_x8_dual.v b/bench/tb_mmm_x8_dual.v new file mode 100644 index 0000000..aa25900 --- /dev/null +++ b/bench/tb_mmm_x8_dual.v @@ -0,0 +1,327 @@ +`timescale 1ns / 1ps + +module tb_mmm_x8_dual; + + + // + // Headers + // + `include "../rtl/modexpng_parameters.vh" + `include "../rtl/modexpng_parameters_x8.vh" + + + // + // Settings + // + localparam INDEX_WIDTH = 6; + + wire [INDEX_WIDTH-1:0] index_last = 31; // 512 bits + + + // + // Clock + // + `define CLK_FREQUENCY_MHZ 100.0 + `define CLK_PERIOD_NS (1000.0 / `CLK_FREQUENCY_MHZ) + `define CLK_PERIOD_HALF_NS (0.5 * `CLK_PERIOD_NS) + + reg clk = 1'b0; + + always begin + #`CLK_PERIOD_HALF_NS clk = 1'b1; + #`CLK_PERIOD_HALF_NS clk = 1'b0; + end + + + // + // Reset + // + reg rst = 1'b1; + wire rst_n = ~rst; + + + // + // Control + // + reg ena = 1'b0; + wire rdy; + + reg mode; + reg transfer; + + + // + // Interface + // + + + // + // Interface - Data Buses + // + wire [NUM_MULTS*WORD_WIDTH-1:0] x_din; + wire [NUM_MULTS*WORD_WIDTH-1:0] y_din; + wire [NUM_MULTS*WORD_WIDTH-1:0] x_dout; + wire [NUM_MULTS*WORD_WIDTH-1:0] y_dout; + + + // + // Interface - Address Buses + // + wire [INDEX_WIDTH-4:0] x_din_addr; + wire [INDEX_WIDTH-4:0] y_din_addr; + wire [INDEX_WIDTH-4:0] x_dout_addr; + wire [INDEX_WIDTH-4:0] y_dout_addr; + + + // + // Interface - Enable Buses + // + wire [ 1-1:0] x_din_ena; + wire [ 1-1:0] y_din_ena; + wire [ 1-1:0] x_din_reg_ena; + wire [ 1-1:0] y_din_reg_ena; + wire [NUM_MULTS-1:0] x_dout_ena; + wire [NUM_MULTS-1:0] y_dout_ena; + + + // + // Interface - Bank Buses + // + wire [3-1:0] x_din_bank; + wire [3-1:0] y_din_bank; + wire [3-1:0] x_dout_bank; + wire [3-1:0] y_dout_bank; + + + // + // Operands + // + reg [WORD_WIDTH-1:0] T1[0:2**INDEX_WIDTH-1]; + reg [WORD_WIDTH-1:0] T2[0:2**INDEX_WIDTH-1]; + reg [WORD_WIDTH-1:0] N[0:2**INDEX_WIDTH-1]; + reg [WORD_WIDTH-1:0] N_COEFF[0:2**INDEX_WIDTH]; + + + // + // Memories + // + genvar z; + generate for (z=0; 
z 0) + mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1; + else + mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last; + end + endfunction + + + + always @(posedge clk) + // ce_a for both X and Y: registered OR of mac_slim_bram_xy_reg_ena and mac_slim_bram_xy_reg_ena_dly. + {dsp_y_ce_a, dsp_x_ce_a} <= {2{mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly}}; + + always @(posedge clk) + // ce_b for both X and Y: registered copy of mac_slim_bram_xy_reg_ena_dly alone. + {dsp_y_ce_b, dsp_x_ce_b} <= {2{mac_slim_bram_xy_reg_ena_dly}}; + + always @(posedge clk) + // ce_m: registered copy of the ce_b_dly signals (declared outside this view). + {dsp_y_ce_m, dsp_x_ce_m} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly}; + + always @(posedge clk) + // ce_p: ce_m delayed by one more clock. + {dsp_y_ce_p, dsp_x_ce_p} <= {dsp_y_ce_m, dsp_x_ce_m}; + + always @(posedge clk) + // ce_mode: registered from the same ce_b_dly signals as ce_m, so both change on the same clock. + {dsp_y_ce_mode, dsp_x_ce_mode} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly}; + // Testbench helper: suspends the caller for one CLK_PERIOD_NS delay. + task wait_clock_tick; + begin + #`CLK_PERIOD_NS; + end + endtask + + // + // Increment Logic + // + always @(posedge clk) + // Column counter, keyed on fsm_state_next rather than on fsm_state. + case (fsm_state_next) + // COL_0_INIT: restart at column 0 and latch the final column number from index_last[7:3]. + FSM_STATE_MULT_SQUARE_COL_0_INIT: begin + col_index <= 5'd0; + col_index_last <= index_last[7:3]; + end + // COL_N_INIT: step to the next column. + FSM_STATE_MULT_SQUARE_COL_N_INIT: + col_index <= col_index + 1'b1; + // + endcase + + assign fsm_state_after_mult_square = (col_index == col_index_last) ? 
FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT; + + always @(posedge clk) + // Source of the mode word: all zeros when entering a TRIG state, the per-address mask from calc_mac_mode_z_square when entering a BUSY state, all ones otherwise. + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG, + FSM_STATE_MULT_SQUARE_COL_N_TRIG: dsp_xy_mode_z_adv4 <= {8{1'b0}}; + FSM_STATE_MULT_SQUARE_COL_0_BUSY, + FSM_STATE_MULT_SQUARE_COL_N_BUSY: dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly); + default: dsp_xy_mode_z_adv4 <= {8{1'b1}}; + endcase + + always @(posedge clk) begin + {dsp_y_mode_z, dsp_x_mode_z} <= {2{dsp_xy_mode_z_adv1}}; + // Delay chain adv4 -> adv3 -> adv2 -> adv1; adv1 is then copied to both the X and Y mode words. + dsp_xy_mode_z_adv1 <= {dsp_xy_mode_z_adv2}; + dsp_xy_mode_z_adv2 <= {dsp_xy_mode_z_adv3}; + dsp_xy_mode_z_adv3 <= {dsp_xy_mode_z_adv4}; + end + // Returns all ones, except that when address bits [7:3] equal col_index_value the single bit selected by address bits [2:0] is cleared. NOTE(review): the masks are 8-bit literals, so this presumably relies on NUM_MULTS being 8; confirm. + function [NUM_MULTS-1:0] calc_mac_mode_z_square; + input [ 4:0] col_index_value; + input [ 7:0] mac_slim_bram_xy_addr_value; + begin + if (mac_slim_bram_xy_addr_value[7:3] == col_index_value) + case (mac_slim_bram_xy_addr_value[2:0]) + 3'b000: calc_mac_mode_z_square = 8'b11111110; + 3'b001: calc_mac_mode_z_square = 8'b11111101; + 3'b010: calc_mac_mode_z_square = 8'b11111011; + 3'b011: calc_mac_mode_z_square = 8'b11110111; + 3'b100: calc_mac_mode_z_square = 8'b11101111; + 3'b101: calc_mac_mode_z_square = 8'b11011111; + 3'b110: calc_mac_mode_z_square = 8'b10111111; + 3'b111: calc_mac_mode_z_square = 8'b01111111; + endcase + else + calc_mac_mode_z_square = {NUM_MULTS{1'b1}}; + end + endfunction + + reg recomb_x_ena = 1'b0; + reg recomb_y_ena = 1'b0; + + always @(posedge clk) begin + // Recombinator enables: set on the clock after ce_a is high while ce_b, ce_m and ce_p are all low. + recomb_x_ena <= dsp_x_ce_a && !dsp_x_ce_b && !dsp_x_ce_m && !dsp_x_ce_p; + recomb_y_ena <= dsp_y_ce_a && !dsp_y_ce_b && !dsp_y_ce_m && !dsp_y_ce_p; + // + end + + wire [ 2:0] recomb_fat_bram_xy_bank; + wire [ 7:0] recomb_fat_bram_xy_addr; + wire [17:0] recomb_fat_bram_x_dout; + wire [17:0] recomb_fat_bram_y_dout; + wire recomb_fat_bram_xy_dout_valid; + wire recomb_rdy; + + modexpng_part_recombinator recomb + ( + .clk (clk), + .rdy (recomb_rdy), + .fsm_state_next (fsm_state_next), + .index_last (index_last), + 
.dsp_x_ce_p (dsp_x_ce_p), + .dsp_y_ce_p (dsp_y_ce_p), + .ena_x (recomb_x_ena), + .ena_y (recomb_y_ena), + .dsp_x_p (dsp_x_p), + .dsp_y_p (dsp_y_p), + .col_index (col_index), + .col_index_last (col_index_last), + .slim_bram_xy_addr (mac_slim_bram_xy_addr), + .fat_bram_xy_bank (recomb_fat_bram_xy_bank), + .fat_bram_xy_addr (recomb_fat_bram_xy_addr), + .fat_bram_x_dout (recomb_fat_bram_x_dout), + .fat_bram_y_dout (recomb_fat_bram_y_dout), + .fat_bram_xy_dout_valid (recomb_fat_bram_xy_dout_valid) + ); + + reg [17:0] AB_READ[0:63]; + + always @(posedge clk) + // Capture valid recombinator X output for later checking: bank 3'd1 writes AB_READ[addr], bank 3'd2 writes AB_READ[32 plus addr]; words for any other bank are not stored. + if (recomb_fat_bram_xy_dout_valid) + // + case (recomb_fat_bram_xy_bank) + 3'd1: AB_READ[recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout; + 3'd2: AB_READ[32 + recomb_fat_bram_xy_addr] <= recomb_fat_bram_x_dout; + endcase + + + always @(posedge clk) + // Source select for the mgr_fat_bram_* registers: a testbench write takes priority over recombinator output; with neither active the enable is cleared and the other fields are driven to X. + if (tb_fat_bram_xy_ena) begin + mgr_fat_bram_xy_ena <= 1'b1; + mgr_fat_bram_xy_bank <= tb_fat_bram_xy_bank; + mgr_fat_bram_xy_addr <= tb_fat_bram_xy_addr; + mgr_fat_bram_x_din <= tb_fat_bram_x_din; + mgr_fat_bram_y_din <= tb_fat_bram_y_din; + end else if (recomb_fat_bram_xy_dout_valid) begin + mgr_fat_bram_xy_ena <= 1'b1; + mgr_fat_bram_xy_bank <= recomb_fat_bram_xy_bank; + mgr_fat_bram_xy_addr <= recomb_fat_bram_xy_addr; + mgr_fat_bram_x_din <= recomb_fat_bram_x_dout; + mgr_fat_bram_y_din <= recomb_fat_bram_y_dout; + end else begin + mgr_fat_bram_xy_ena <= 1'b0; + mgr_fat_bram_xy_bank <= 3'bXXX; + mgr_fat_bram_xy_addr <= 8'hXX; + mgr_fat_bram_x_din <= {18{1'bX}}; + mgr_fat_bram_y_din <= {18{1'bX}}; + end + + + + + // Compares AB_READ[0..63] with AB[0..63] using ===, prints every pair, then prints an overall verdict. Uses the loop variable i, which is not declared inside this task. NOTE(review): the match and mismatch messages differ only by a trailing space, so a failing word is hard to spot; a mismatch marker may have been lost from the second string, confirm against the upstream file. + task verify_ab; + reg verify_ab_ok; + begin + verify_ab_ok = 1; + for (i=0; i<64; i=i+1) + if (AB_READ[i] === AB[i]) + $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x", i, AB[i], AB_READ[i]); + else begin + $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x ", i, AB[i], AB_READ[i]); + verify_ab_ok = 0; + end + if (verify_ab_ok) + $display("AB is OK."); + else + $display("AB is WRONG!"); + end + endtask + + + + always @* begin + // + 
fsm_state_next = FSM_STATE_IDLE; + // Next-state logic. IDLE waits for ena. Each column runs INIT, TRIG, BUSY, and BUSY holds until mult_square_addr_surely_done_flop is set. After column 0 the COL_N sequence repeats until fsm_state_after_mult_square selects HOLDOFF, which waits for recomb_rdy before returning to IDLE. + case (fsm_state) + FSM_STATE_IDLE: fsm_state_next = ena ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE; + + FSM_STATE_MULT_SQUARE_COL_0_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_TRIG ; + FSM_STATE_MULT_SQUARE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY ; + FSM_STATE_MULT_SQUARE_COL_0_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? FSM_STATE_MULT_SQUARE_COL_N_INIT : FSM_STATE_MULT_SQUARE_COL_0_BUSY; + + FSM_STATE_MULT_SQUARE_COL_N_INIT: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_TRIG ; + FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = mult_square_addr_surely_done_flop ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY; + + FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = recomb_rdy ? FSM_STATE_IDLE : FSM_STATE_MULT_SQUARE_HOLDOFF; + + default: fsm_state_next = FSM_STATE_IDLE ; + + endcase + // + end + + +endmodule + diff --git a/rtl/dev/temp.txt b/rtl/dev/temp.txt new file mode 100644 index 0000000..987bd86 --- /dev/null +++ b/rtl/dev/temp.txt @@ -0,0 +1,384 @@ + // + // Helper Functions + // + /* + function [INDEX_WIDTH-1:0] calc_preset_a_index; + input [INDEX_WIDTH-4:0] col_in; + input integer x_in; + integer index_out; + begin + index_out = col_in * NUM_MULTS + x_in; + calc_preset_a_index = index_out[INDEX_WIDTH-1:0]; + end + endfunction + + function [INDEX_WIDTH-1:0] calc_rotate_a_index; + input [INDEX_WIDTH-1:0] current_index_in; + input [INDEX_WIDTH-1:0] last_index_in; + begin + if (current_index_in > {INDEX_WIDTH{1'b0}}) + calc_rotate_a_index = current_index_in - 1'b1; + else + calc_rotate_a_index = last_index_in; + end + endfunction + */ + + /* + // + // Narrow Counters + // + reg [INDEX_WIDTH-1:0] din_addr_narrow_reg; + reg [INDEX_WIDTH-1:0] din_addr_narrow_dly; + localparam [INDEX_WIDTH-1:0] din_addr_narrow_zero = {INDEX_WIDTH{1'b0}}; + wire [INDEX_WIDTH-1:0] 
din_addr_narrow_next = (din_addr_narrow_reg < index_last) ? + din_addr_narrow_reg + 1'b1 : din_addr_narrow_zero; + wire din_addr_narrow_done = din_addr_narrow_reg == index_last; + + assign din_addr_narrow = din_addr_narrow_reg; + + always @(posedge clk) + // + din_addr_narrow_dly <= din_addr_narrow_reg; + + always @(posedge clk) + // + case (fsm_state_next) + FSM_STATE_MULT_SQUARE_COL_0_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero; + FSM_STATE_MULT_SQUARE_COL_0_BUSY: din_addr_narrow_reg <= din_addr_narrow_next; + FSM_STATE_MULT_SQUARE_COL_N_TRIG: din_addr_narrow_reg <= din_addr_narrow_zero; + FSM_STATE_MULT_SQUARE_COL_N_BUSY: din_addr_narrow_reg <= din_addr_narrow_next; + endcase + + + // + // Helper Functions + // + function [NUM_MULTS-1:0] calc_mac_clear_bitmask; + input [2:0] t; + begin + case (t) + 3'd0: calc_mac_clear_bitmask = 8'b00000001; + 3'd1: calc_mac_clear_bitmask = 8'b00000010; + 3'd2: calc_mac_clear_bitmask = 8'b00000100; + 3'd3: calc_mac_clear_bitmask = 8'b00001000; + 3'd4: calc_mac_clear_bitmask = 8'b00010000; + 3'd5: calc_mac_clear_bitmask = 8'b00100000; + 3'd6: calc_mac_clear_bitmask = 8'b01000000; + 3'd7: calc_mac_clear_bitmask = 8'b10000000; + endcase + end + endfunction + + function [NUM_MULTS:0] calc_mac_clear_square; + input [INDEX_WIDTH-4:0] current_col_index; + input [INDEX_WIDTH-1:0] b_addr_prev; + begin + if (b_addr_prev[INDEX_WIDTH-1:3] == current_col_index) + calc_mac_clear_square = {1'b0, calc_mac_clear_bitmask(b_addr_prev[2:0])}; + else + calc_mac_clear_square = {1'b0, {NUM_MULTS{1'b0}}}; + end + endfunction + + + // + // Wide Counters + // + reg [INDEX_WIDTH-1:0] din_addr_wide_reg[0:NUM_MULTS-1]; + + integer xi; + always @(posedge clk) + // + for (xi=0; xi 8'd0) begin + // + rdy_adv <= recomb_msb_cnt_delay_1 == 8'd0; + // + recomb_msb_dout_delay_0 <= {18{1'bX}}; + recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0; + recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1; + // + recomb_msb_cnt_delay_0 <= 8'd0; + 
recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0; + recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1; + // + fat_bram_xy_bank_reg <= BANK_FAT_ABH; + fat_bram_xy_addr_reg <= recomb_msb_cnt_delay_2; + fat_bram_x_dout_reg <= recomb_msb_dout_delay_2; +// fat_bram_y_dout_reg <= {18{1'bX}}; + fat_bram_xy_dout_valid_reg <= 1'b1; + // + end else begin + // + fat_bram_xy_bank_reg <= 3'bXXX; + fat_bram_xy_addr_reg <= 8'hXX; + fat_bram_x_dout_reg <= {18{1'bX}}; + fat_bram_y_dout_reg <= {18{1'bX}}; + fat_bram_xy_dout_valid_reg <= 1'b0; + // + end + // + end + // + 2'b01: begin + // + fat_bram_xy_bank_reg <= BANK_FAT_ABL; + fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb; + fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout}; +// fat_bram_y_dout_reg + fat_bram_xy_dout_valid_reg <= 1'b1; + // + fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1; + // + end + // + 2'b10: begin + // + if (fat_bram_xy_cnt_msb < 8'd2) begin + // + recomb_msb_dout_carry_0 <= recomb_msb_dout; + recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0; + // + fat_bram_xy_bank_reg <= 3'bXXX; + fat_bram_xy_addr_reg <= 8'hXX; + fat_bram_x_dout_reg <= {18{1'bX}}; + // fat_bram_y_dout_reg + fat_bram_xy_dout_valid_reg <= 1'b0; + // + end else begin + // + fat_bram_xy_bank_reg <= BANK_FAT_ABH; + fat_bram_xy_addr_reg <= fat_bram_xy_cnt_msb; + fat_bram_x_dout_reg <= {2'b00, recomb_msb_dout}; + // fat_bram_y_dout_reg + fat_bram_xy_dout_valid_reg <= 1'b1; + // + end + // + fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1; + // + end + // + 2'b11: begin + // + if (fat_bram_xy_cnt_lsb == index_last) begin + // + fat_bram_xy_bank_reg <= BANK_FAT_ABL; + fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb; + fat_bram_x_dout_reg <= {2'b00, recomb_lsb_dout}; +// fat_bram_y_dout_reg <= {18{1'bX}}; + fat_bram_xy_dout_valid_reg <= 1'b1; + // + fat_bram_xy_cnt_lsb <= 8'd0; + // + end else begin + // + fat_bram_xy_bank_reg <= BANK_FAT_ABH; + fat_bram_xy_addr_reg <= fat_bram_xy_cnt_lsb; + fat_bram_x_dout_reg <= {1'b0, {1'b0, recomb_lsb_dout} + 
{1'b0, recomb_msb_dout_carry_1}}; +// fat_bram_y_dout_reg <= {18{1'bX}}; + fat_bram_xy_dout_valid_reg <= 1'b1; + // + fat_bram_xy_cnt_lsb <= fat_bram_xy_cnt_lsb + 1'b1; + // + recomb_msb_dout_carry_0 <= {16{1'bX}}; + recomb_msb_dout_carry_1 <= recomb_msb_dout_carry_0; + // + end + // + recomb_msb_dout_delay_0 <= recomb_msb_dout; + recomb_msb_dout_delay_1 <= recomb_msb_dout_delay_0; + recomb_msb_dout_delay_2 <= recomb_msb_dout_delay_1; + // + recomb_msb_cnt_delay_0 <= fat_bram_xy_cnt_msb; + recomb_msb_cnt_delay_1 <= recomb_msb_cnt_delay_0; + recomb_msb_cnt_delay_2 <= recomb_msb_cnt_delay_1; + // + fat_bram_xy_cnt_msb <= fat_bram_xy_cnt_msb + 1'b1; + // + end + // + endcase + // + end + + + + +endmodule diff --git a/rtl/modexpng_recombinator_block.v b/rtl/modexpng_recombinator_block.v new file mode 100644 index 0000000..efe0ac5 --- /dev/null +++ b/rtl/modexpng_recombinator_block.v @@ -0,0 +1,35 @@ +module modexpng_recombinator_block +( + clk, + ce, clr, + din, dout +); + // Splits the 47-bit din into three fields (din_x = din[15:0], din_y = din[31:16], din_z = din[46:32]) and combines them across clocks in the z, y and x registers; dout is the low 16 bits of x. + input clk; + input ce; + input clr; + input [46:0] din; + output [15:0] dout; + // Working registers: y is one bit and x is two bits wider than the 16-bit din fields they load, leaving room for carries out of the additions below. + reg [14:0] z; + reg [16:0] y; + reg [17:0] x; + //reg [15:0] w; + + //assign dout = w; + assign dout = x[15:0]; + + wire [14:0] din_z = din[46:32]; // TODO: maybe determine more precise bound here + wire [15:0] din_y = din[31:16]; + wire [15:0] din_x = din[15: 0]; + + always @(posedge clk) + // Registers change only when ce is high. z always loads din_z. With clr high, y and x load their zero-extended din fields; otherwise y also adds the previous z, and x also adds the previous y and the previous upper two bits of x. + if (ce) begin + z <= din_z; + y <= clr ? {1'b0, din_y} : {1'b0, din_y} + {2'b00, z}; + x <= clr ? {2'b00, din_x} : {2'b00, din_x} + {1'b0, y} + {{16{1'b0}}, x[17:16]}; + //w <= clr ? {16{1'bX}} : x[15:0]; + end + +endmodule -- cgit v1.2.3