diff options
Diffstat (limited to 'rtl/modexpng_mmm_x8_dual.v')
-rw-r--r-- | rtl/modexpng_mmm_x8_dual.v | 550 |
1 files changed, 550 insertions, 0 deletions
//
// File: rtl/modexpng_mmm_x8_dual.v
//
// modexpng_mmm_x8_dual
//
// Control wrapper built around one FSM (states come from "modexpng_mmm_fsm.vh").
// The FSM sequences two kinds of operation, selected by {transfer, mode} when
// `ena` is sampled in the IDLE state:
//
//   * load     : LOAD_T1T2_1..3 repeated until load_t1t2_addr_done, then
//                LOAD_NN_COEFF_1..3 repeated until load_nn_coeff_addr_done,
//                then STOP.
//   * multiply : MULT_SQUARE_COL_0_TRIG/BUSY, then MULT_SQUARE_COL_N_TRIG/BUSY
//                repeated until col_index_done, then STOP.
//
// The module instantiates a column counter, a load address generator
// ("transporter"), input/output address generators, a scratchpad and two MAC
// arrays (X and Y) that share the same clock-enable / clear / cascade controls.
//
// NOTE(review): presumably this is the Montgomery modular multiplier of the
// ModExpNG core (judging by the module name only) -- confirm.
//
// NOTE(review): the following signals have no driver or no consumer inside
// this module as written, which suggests the datapath is still incomplete:
//   - undriven outputs : y_din_addr, y_din_bank, y_din_ena, y_din_reg_ena
//   - unused input     : y_din
//   - undriven MAC pins: mac_y_a, mac_x_b, mac_y_b, mac_x_a_aux, mac_y_a_aux,
//                        mac_*_clr_aux, mac_*_casc_a_aux
//                        (mac_*_ce_aux only keep their 1'b0 initial value)
//   - unused results   : mac_x_p, mac_y_p, mac_x_p_aux, mac_y_p_aux,
//                        pad_x_rd_dout, pad_y_rd_dout
//
module modexpng_mmm_x8_dual
(
    clk, rst_n,
    ena, rdy,
    mode, transfer,
    index_last,
    x_din, y_din, x_dout, y_dout,
    x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
    x_din_ena, y_din_ena, x_dout_ena, y_dout_ena, x_din_reg_ena, y_din_reg_ena,
    x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
    load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
    load_x_din, load_y_din
);


    //
    // Includes
    //
    // These headers must stay inside the module and ahead of the port
    // declarations: the port widths below use NUM_MULTS and WORD_WIDTH, and
    // the FSM uses FSM_STATE_* / FSM_STATE_WIDTH, none of which are defined
    // in this file.
    //
    `include "modexpng_parameters.vh"
    `include "modexpng_parameters_x8.vh"
    `include "modexpng_mmm_fsm.vh"


    //
    // Parameters
    //
    parameter INDEX_WIDTH = 6;


    //
    // Ports
    //
    input                               clk;
    input                               rst_n;              // active-low, asynchronous (see the always blocks)

    input                               ena;                // sampled in FSM_STATE_IDLE to start an operation
    output                              rdy;                // low while an operation is in progress

    input                               mode;               // multiply: 0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
                                                            // load/unload: 0 = load, 1 = unload
    input                               transfer;           // 0 = multiply, 1 = load/unload

    input  [INDEX_WIDTH-1:0]            index_last;

    input  [NUM_MULTS*WORD_WIDTH-1:0]   x_din;
    input  [NUM_MULTS*WORD_WIDTH-1:0]   y_din;
    output [NUM_MULTS*WORD_WIDTH-1:0]   x_dout;
    output [NUM_MULTS*WORD_WIDTH-1:0]   y_dout;

    output [INDEX_WIDTH-4:0]            x_din_addr;
    output [INDEX_WIDTH-4:0]            y_din_addr;
    output [INDEX_WIDTH-4:0]            x_dout_addr;
    output [INDEX_WIDTH-4:0]            y_dout_addr;

    output [        1-1:0]              x_din_ena;
    output [        1-1:0]              y_din_ena;
    output [NUM_MULTS-1:0]              x_dout_ena;
    output [NUM_MULTS-1:0]              y_dout_ena;
    output [        1-1:0]              x_din_reg_ena;
    output [        1-1:0]              y_din_reg_ena;

    output [3-1:0]                      x_din_bank;
    output [3-1:0]                      y_din_bank;
    output [3-1:0]                      x_dout_bank;
    output [3-1:0]                      y_dout_bank;

    output                              load_phase;         // 0 = T1, T2; 1 = N, N_COEFF
    output [INDEX_WIDTH:0]              load_xy_addr;       // address
    output                              load_xy_addr_vld;   // address valid
    output                              load_xy_req;        // data request

    input  [WORD_WIDTH-1:0]             load_x_din;         // data input
    input  [WORD_WIDTH-1:0]             load_y_din;         // data input


    //
    // FSM State and Next States
    //
    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
    reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
    reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;


    //
    // FSM Idle Next State
    //
    // Decodes the requested operation. All four {transfer, mode} values are
    // listed, so this combinational block infers no latch.
    //
    always @*
        //
        case ({transfer, mode})
            2'b00,
            2'b01:  fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
            2'b10:  fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1;
            2'b11:  fsm_state_after_idle = FSM_STATE_IDLE; //unload? (not implemented: the FSM stays idle)
        endcase


    //
    // Column Counter
    //
    wire [INDEX_WIDTH-4:0] col_index;
    wire                   col_index_done;
    wire [INDEX_WIDTH-4:0] col_index_zero;
    wire [INDEX_WIDTH-4:0] col_index_next;
    wire [INDEX_WIDTH-4:0] col_index_prev;

    modexpng_mmm_col_index #
    (
        .INDEX_WIDTH(INDEX_WIDTH)
    )
    mmm_col_index
    (
        .clk            (clk),
        .index_last     (index_last),
        .fsm_state_next (fsm_state_next),
        .col_index      (col_index),
        .col_index_done (col_index_done),
        .col_index_zero (col_index_zero),
        .col_index_next (col_index_next),
        .col_index_prev (col_index_prev)
    );


    //
    // Load Address Generator
    //
    // load_xy_addr is one bit wider than INDEX_WIDTH; only its lower
    // INDEX_WIDTH bits are forwarded to the scratchpad.
    //
    wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0];
    wire                   load_addr_zero;
    wire                   load_t1t2_addr_done;
    wire                   load_nn_coeff_addr_done;

    modexpng_mmm_transporter #
    (
        .INDEX_WIDTH(INDEX_WIDTH)
    )
    transporter
    (
        .clk                     (clk),
        .ena                     (ena),
        .index_last              (index_last),
        .fsm_state               (fsm_state),
        .fsm_state_next          (fsm_state_next),
        .load_phase              (load_phase),
        .load_xy_addr            (load_xy_addr),
        .load_xy_addr_vld        (load_xy_addr_vld),
        .load_xy_req             (load_xy_req),
        .load_addr_zero          (load_addr_zero),
        .load_t1t2_addr_done     (load_t1t2_addr_done),
        .load_nn_coeff_addr_done (load_nn_coeff_addr_done)
    );


    //
    // X, Y Address
    //
    // Only the X input address generator is instantiated here; see the
    // NOTE(review) in the file header about the undriven y_din_* outputs.
    //
    wire [INDEX_WIDTH-1:0] x_din_addr_cnt;
    wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last;
    wire [          3-1:0] x_din_addr_cnt_lower_prev;
    wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev;

    modexpng_mmm_din_addr #
    (
        .INDEX_WIDTH(INDEX_WIDTH)
    )
    din_addr_x
    (
        .clk                     (clk),
        .rst_n                   (rst_n),
        .index_last              (index_last),
        .fsm_state_next          (fsm_state_next),
        .col_index_zero          (col_index_zero),
        .col_index_next          (col_index_next),
        .din_addr                (x_din_addr),
        .din_bank                (x_din_bank),
        .din_ena                 (x_din_ena),
        .din_reg_ena             (x_din_reg_ena),
        .din_addr_cnt            (x_din_addr_cnt),
        .din_addr_cnt_last       (x_din_addr_cnt_last),
        .din_addr_cnt_lower_prev (x_din_addr_cnt_lower_prev),
        .din_addr_cnt_upper_prev (x_din_addr_cnt_upper_prev)
    );

    modexpng_mmm_dout_addr #
    (
        .INDEX_WIDTH(INDEX_WIDTH)
    )
    dout_addr_xy
    (
        .clk                     (clk),
        .rst_n                   (rst_n),
        .fsm_state               (fsm_state),
        .load_xy_addr            (load_xy_addr),
        .load_addr_zero          (load_addr_zero),
        .load_nn_coeff_addr_done (load_nn_coeff_addr_done),
        .x_dout_addr             (x_dout_addr),
        .y_dout_addr             (y_dout_addr),
        .x_dout_ena              (x_dout_ena),
        .y_dout_ena              (y_dout_ena),
        .x_dout_bank             (x_dout_bank),
        .y_dout_bank             (y_dout_bank)
    );


    //
    // Helper Memories ("Scratchpad")
    //
    // The X and Y read ports share one address and one enable.
    //
    reg  [INDEX_WIDTH-1:0] pad_xy_rd_addr;
    reg                    pad_xy_rd_ena = 1'b0;
    wire [ WORD_WIDTH-1:0] pad_x_rd_dout;
    wire [ WORD_WIDTH-1:0] pad_y_rd_dout;

    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}};
    wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1;

    modexpng_mmm_pad pad
    (
        .clk              (clk),
        .rst_n            (rst_n),
        .fsm_state        (fsm_state),
        .load_xy_addr_lsb (load_xy_addr_lsb),
        .load_x_din       (load_x_din),
        .load_y_din       (load_y_din),
        .pad_x_rd_addr    (pad_xy_rd_addr),
        .pad_y_rd_addr    (pad_xy_rd_addr),
        .pad_x_rd_ena     (pad_xy_rd_ena),
        .pad_y_rd_ena     (pad_xy_rd_ena),
        .pad_x_rd_dout    (pad_x_rd_dout),
        .pad_y_rd_dout    (pad_y_rd_dout)
    );

    // Read enable: decoded from fsm_state_next, so the registered enable
    // changes on the same clock edge as fsm_state itself.
    always @(posedge clk or negedge rst_n)
        //
        if (!rst_n) begin
            pad_xy_rd_ena <= 1'b0;
        end else case (fsm_state_next)

            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                pad_xy_rd_ena <= 1'b1;

            default:
                pad_xy_rd_ena <= 1'b0;

        endcase

    // Read address: restarts from zero on every *_TRIG state and increments
    // on every *_BUSY state. Outside the multiply states the address is a
    // don't-care (driven to X in simulation) because the read enable is low.
    always @(posedge clk)
        //
        case (fsm_state_next)

            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG:
                pad_xy_rd_addr <= pad_xy_rd_addr_zero;

            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                pad_xy_rd_addr <= pad_xy_rd_addr_next;

            default:
                pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};

        endcase


    //
    // Flags
    //
    wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;

    // After the last column the FSM currently goes straight to STOP; the
    // commented-out state name marks where a later phase is meant to hook in.
    // (The original line ended in ";;" -- the stray null item is removed.)
    always @*
        //
        fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;


    //
    // MAC Arrays
    //
    reg                                mac_x_ce = 1'b0;
    reg                                mac_x_ce_aux = 1'b0;
    reg  [NUM_MULTS              -1:0] mac_x_clr;
    reg                                mac_x_clr_aux;
    reg  [NUM_MULTS              -2:0] mac_x_casc_a;
    reg                                mac_x_casc_a_aux;
    wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
    reg  [        1 * WORD_WIDTH -1:0] mac_x_a_aux;
    reg  [        1 * WORD_WIDTH -1:0] mac_x_b;
    wire [NUM_MULTS *  MAC_WIDTH -1:0] mac_x_p;
    wire [        1 *  MAC_WIDTH -1:0] mac_x_p_aux;

    reg                                mac_y_ce = 1'b0;
    reg                                mac_y_ce_aux = 1'b0;
    reg  [NUM_MULTS              -1:0] mac_y_clr;
    reg                                mac_y_clr_aux;
    reg  [NUM_MULTS              -2:0] mac_y_casc_a;
    reg                                mac_y_casc_a_aux;
    wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
    reg  [        1 * WORD_WIDTH -1:0] mac_y_a_aux;
    reg  [        1 * WORD_WIDTH -1:0] mac_y_b;
    wire [NUM_MULTS *  MAC_WIDTH -1:0] mac_y_p;
    wire [        1 *  MAC_WIDTH -1:0] mac_y_p_aux;

    modexpng_mac_array mac_array_x
    (
        .clk        (clk),
        .ce         (mac_x_ce),
        .ce_aux     (mac_x_ce_aux),
        .clr        (mac_x_clr),
        .clr_aux    (mac_x_clr_aux),
        .casc_a     (mac_x_casc_a),
        .casc_a_aux (mac_x_casc_a_aux),
        .a_in       (mac_x_a),
        .a_in_aux   (mac_x_a_aux),
        .b_in       (mac_x_b),
        .p_out      (mac_x_p),
        .p_out_aux  (mac_x_p_aux)
    );

    modexpng_mac_array mac_array_y
    (
        .clk        (clk),
        .ce         (mac_y_ce),
        .ce_aux     (mac_y_ce_aux),
        .clr        (mac_y_clr),
        .clr_aux    (mac_y_clr_aux),
        .casc_a     (mac_y_casc_a),
        .casc_a_aux (mac_y_casc_a_aux),
        .a_in       (mac_y_a),
        .a_in_aux   (mac_y_a_aux),
        .b_in       (mac_y_b),
        .p_out      (mac_y_p),
        .p_out_aux  (mac_y_p_aux)
    );

    genvar gen_z;

    // Word-wise pass-through of x_din to the A inputs of the X MAC array.
    // NOTE(review): there is no matching assignment for mac_y_a, so the A
    // inputs of mac_array_y are left floating -- confirm the intended source
    // (y_din looks likely but is not established by this file).
    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
        begin : gen_xy_din
            assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
        end
    endgenerate


    //
    // MAC Clock Enable Logic
    //
    // Two register stages: fsm_state -> mac_xy_ce_adv -> mac_x_ce / mac_y_ce.
    // Both arrays receive the same enable.
    //
    reg mac_xy_ce_adv = 1'b0;

    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)                          mac_xy_ce_adv <= 1'b0;
        else case (fsm_state)
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:       mac_xy_ce_adv <= 1'b1;
            default:                                mac_xy_ce_adv <= 1'b0;
        endcase

    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  {mac_y_ce, mac_x_ce} <= 2'b00;
        else                {mac_y_ce, mac_x_ce} <= {2{mac_xy_ce_adv}};


    //
    // MAC Clear Logic
    //
    // Same two-stage pipeline as the clock enable. Every accumulator is
    // cleared in a *_TRIG state; in a *_BUSY state the per-lane clear mask
    // comes from calc_mac_clear_square() (defined at the end of the module).
    //
    wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
        calc_mac_clear_square(col_index_prev, x_din_addr_cnt_lower_prev, x_din_addr_cnt_upper_prev);

    reg [NUM_MULTS-1:0] mac_xy_clr_adv;

    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_xy_clr_adv <= calc_mac_x_clear_square_value;
            default:                            mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
        endcase

    always @(posedge clk)
        //
        {mac_y_clr, mac_x_clr} <= {2{mac_xy_clr_adv}};


    //
    // MAC Cascade Logic
    //
    // Cascade selects are all-zero in a *_TRIG state and all-one in a *_BUSY
    // state; don't-care elsewhere.
    //
    reg [NUM_MULTS-2:0] mac_xy_casc_a_adv;

    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
            default:                            mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
        endcase

    always @(posedge clk)
        //
        {mac_y_casc_a, mac_x_casc_a} <= {2{mac_xy_casc_a_adv}};


    //
    // DOUT
    //
    // Declared ahead of the mapping generate block below: the original
    // declared these arrays after their first use, which tools enforcing
    // declare-before-use reject.
    //
    reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
    reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];


    //
    // DOUT Mapping
    //
    // Flattens the per-lane registers onto the packed output buses.
    //
    generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
        begin : gen_xy_dout
            assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
            assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
        end
    endgenerate


    //
    // DOUT Registers
    //
    // In the third step of either load sequence the incoming load word is
    // copied to every lane; in all other states the outputs are don't-care.
    //
    integer int_z;
    always @(posedge clk)
        //
        case (fsm_state)
            //
            FSM_STATE_LOAD_T1T2_3,
            FSM_STATE_LOAD_NN_COEFF_3:
                for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
                    x_dout_reg[int_z] <= load_x_din;
                    y_dout_reg[int_z] <= load_y_din;
                end
            //
            default:
                for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
                    x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
                    y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
                end
            //
        endcase


    //
    // FSM Process
    //
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  fsm_state <= FSM_STATE_IDLE;
        else                fsm_state <= fsm_state_next;


    //
    // FSM Transition Logic
    //
    // The default assignment in front of the case keeps this block purely
    // combinational: any state that is not listed falls back to IDLE.
    //
    always @* begin
        //
        fsm_state_next = FSM_STATE_IDLE;
        //
        case (fsm_state)
            FSM_STATE_IDLE:                     fsm_state_next = ena                     ? fsm_state_after_idle             : FSM_STATE_IDLE;

            FSM_STATE_LOAD_T1T2_1:              fsm_state_next =                           FSM_STATE_LOAD_T1T2_2 ;
            FSM_STATE_LOAD_T1T2_2:              fsm_state_next =                           FSM_STATE_LOAD_T1T2_3 ;
            FSM_STATE_LOAD_T1T2_3:              fsm_state_next = load_t1t2_addr_done     ? FSM_STATE_LOAD_NN_COEFF_1        : FSM_STATE_LOAD_T1T2_1;

            FSM_STATE_LOAD_NN_COEFF_1:          fsm_state_next =                           FSM_STATE_LOAD_NN_COEFF_2 ;
            FSM_STATE_LOAD_NN_COEFF_2:          fsm_state_next =                           FSM_STATE_LOAD_NN_COEFF_3 ;
            FSM_STATE_LOAD_NN_COEFF_3:          fsm_state_next = load_nn_coeff_addr_done ? FSM_STATE_STOP                   : FSM_STATE_LOAD_NN_COEFF_1;

            FSM_STATE_MULT_SQUARE_COL_0_TRIG:   fsm_state_next =                           FSM_STATE_MULT_SQUARE_COL_0_BUSY ;
            FSM_STATE_MULT_SQUARE_COL_0_BUSY:   fsm_state_next = mult_square_addr_done   ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;
            FSM_STATE_MULT_SQUARE_COL_N_TRIG:   fsm_state_next =                           FSM_STATE_MULT_SQUARE_COL_N_BUSY ;
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:   fsm_state_next = mult_square_addr_done   ? fsm_state_after_mult_square      : FSM_STATE_MULT_SQUARE_COL_N_BUSY;

            /*
            FSM_STATE_TRIANGLE_COL_0_TRIG:      fsm_state_next =                           FSM_STATE_TRIANGLE_COL_0_BUSY ;
            FSM_STATE_TRIANGLE_COL_0_BUSY:      fsm_state_next = din_addr_narrow_done    ? FSM_STATE_TRIANGLE_COL_N_TRIG    : FSM_STATE_TRIANGLE_COL_0_BUSY;
            FSM_STATE_TRIANGLE_COL_N_TRIG:      fsm_state_next =                           FSM_STATE_TRIANGLE_COL_N_BUSY ;
            FSM_STATE_TRIANGLE_COL_N_BUSY:      fsm_state_next = din_addr_narrow_done    ? fsm_state_after_triangle         : FSM_STATE_TRIANGLE_COL_N_BUSY;

            FSM_STATE_RECTANGLE_COL_0_TRIG:     fsm_state_next =                           FSM_STATE_RECTANGLE_COL_0_BUSY ;
            FSM_STATE_RECTANGLE_COL_0_BUSY:     fsm_state_next = din_addr_narrow_done    ? FSM_STATE_RECTANGLE_COL_N_TRIG   : FSM_STATE_RECTANGLE_COL_0_BUSY;
            FSM_STATE_RECTANGLE_COL_N_TRIG:     fsm_state_next =                           FSM_STATE_RECTANGLE_COL_N_BUSY ;
            FSM_STATE_RECTANGLE_COL_N_BUSY:     fsm_state_next = din_addr_narrow_done    ? fsm_state_after_rectangle        : FSM_STATE_RECTANGLE_COL_N_BUSY;
            */

            FSM_STATE_STOP:                     fsm_state_next =                           FSM_STATE_IDLE ;

        endcase
        //
    end


    //
    // Ready Output
    //
    // rdy drops when an operation starts and rises again in FSM_STATE_STOP.
    //
    // The original cleared rdy on every `ena` seen in IDLE. For the
    // unimplemented {transfer, mode} == 2'b11 request the FSM never leaves
    // IDLE and so never reaches STOP, which left rdy stuck low forever.
    // rdy is now cleared only when the FSM really departs from IDLE
    // (in IDLE, fsm_state_next != IDLE implies that ena is set).
    //
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)      rdy_reg <= 1'b1;
        else case (fsm_state)
            FSM_STATE_IDLE:     if (fsm_state_next != FSM_STATE_IDLE) rdy_reg <= 1'b0;
            FSM_STATE_STOP:     rdy_reg <= 1'b1;
        endcase


    //
    // calc_mac_clear_square
    //
    // Returns the per-lane accumulator clear mask for the "square" phase.
    //
    //   col_index_delayed            - column index to compare against
    //   x_din_addr_cnt_lower_delayed - lower 3 bits of the input address counter
    //   x_din_addr_cnt_upper_delayed - remaining upper bits of that counter
    //
    // When the upper counter bits equal the column index, the result is
    // one-hot with the bit selected by the lower 3 bits set; otherwise no lane
    // is cleared. The one-hot literals are 8 bits wide, i.e. they match
    // NUM_MULTS == 8 (NOTE(review): NUM_MULTS is defined in an included
    // header -- confirm it is 8 for this "x8" variant).
    //
    // The default branch is unreachable for known 2-state selectors (all eight
    // values are listed); it only makes X/Z selectors yield X in simulation
    // instead of the value left over from the previous call.
    //
    function [NUM_MULTS-1:0] calc_mac_clear_square;
        input [INDEX_WIDTH-4:0] col_index_delayed;
        input [          3-1:0] x_din_addr_cnt_lower_delayed;
        input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed;
        begin
            if (x_din_addr_cnt_upper_delayed == col_index_delayed)
                case (x_din_addr_cnt_lower_delayed)
                    3'b000:  calc_mac_clear_square = 8'b00000001;
                    3'b001:  calc_mac_clear_square = 8'b00000010;
                    3'b010:  calc_mac_clear_square = 8'b00000100;
                    3'b011:  calc_mac_clear_square = 8'b00001000;
                    3'b100:  calc_mac_clear_square = 8'b00010000;
                    3'b101:  calc_mac_clear_square = 8'b00100000;
                    3'b110:  calc_mac_clear_square = 8'b01000000;
                    3'b111:  calc_mac_clear_square = 8'b10000000;
                    default: calc_mac_clear_square = {NUM_MULTS{1'bX}};
                endcase
            else
                calc_mac_clear_square = {NUM_MULTS{1'b0}};
        end
    endfunction


endmodule