//
// modexpng_mmm_x8_dual
//
// Ties together a control FSM, a column counter, a load address generator
// ("transporter"), input/output address generators, a scratchpad memory and
// two MAC arrays (one for the X channel, one for the Y channel).
// The wide data buses carry NUM_MULTS words of WORD_WIDTH bits each.
//
// NOTE(review): "mmm" presumably stands for Montgomery modular
// multiplication -- confirm against the project documentation.
//
module modexpng_mmm_x8_dual
(
clk, rst_n,
ena, rdy,
mode, transfer,
index_last,
x_din, y_din, x_dout, y_dout,
x_din_addr, y_din_addr, x_dout_addr, y_dout_addr,
x_din_ena, y_din_ena, x_dout_ena, y_dout_ena, x_din_reg_ena, y_din_reg_ena,
x_din_bank, y_din_bank, x_dout_bank, y_dout_bank,
load_phase, load_xy_addr, load_xy_addr_vld, load_xy_req,
load_x_din, load_y_din
);
//
// Includes
//
// These headers are expected to supply NUM_MULTS, WORD_WIDTH, MAC_WIDTH,
// FSM_STATE_WIDTH and the FSM_STATE_* encodings used below; they must be
// included before the port declarations because the port widths use them.
//
`include "modexpng_parameters.vh"
`include "modexpng_parameters_x8.vh"
`include "modexpng_mmm_fsm.vh"
//
// Parameters
//
// INDEX_WIDTH: width of index_last; the bank-local address buses are three
// bits narrower (INDEX_WIDTH-3 bits), load_xy_addr is one bit wider.
//
parameter INDEX_WIDTH = 6;
//
// Ports
//
input clk; // clock; every register in this module is clocked on the rising edge
input rst_n; // asynchronous reset, active low
input ena; // sampled while the FSM is idle; starts the operation selected by {transfer, mode}
output rdy; // cleared when ena is accepted in the idle state, set again in the stop state
input mode; // multiply: 0 = T1:T1*T1, T2:T2*T1, 1 = T1:T1*T2, T2:T2*T2
// load/unload: 0 = load, 1 = unload
input transfer; // 0 = multiply, 1 = load/unload
input [INDEX_WIDTH-1:0] index_last; // forwarded to the column counter, transporter and din address generator
input [NUM_MULTS*WORD_WIDTH-1:0] x_din; // X channel input data, NUM_MULTS words
input [NUM_MULTS*WORD_WIDTH-1:0] y_din; // Y channel input data, NUM_MULTS words
output [NUM_MULTS*WORD_WIDTH-1:0] x_dout; // X channel output data, one word per output register
output [NUM_MULTS*WORD_WIDTH-1:0] y_dout; // Y channel output data, one word per output register
output [INDEX_WIDTH-4:0] x_din_addr; // X channel input address
output [INDEX_WIDTH-4:0] y_din_addr; // Y channel input address
output [INDEX_WIDTH-4:0] x_dout_addr; // X channel output address
output [INDEX_WIDTH-4:0] y_dout_addr; // Y channel output address
output [ 1-1:0] x_din_ena; // X channel input enable
output [ 1-1:0] y_din_ena; // Y channel input enable
output [NUM_MULTS-1:0] x_dout_ena; // X channel output enables, one bit per word
output [NUM_MULTS-1:0] y_dout_ena; // Y channel output enables, one bit per word
output [ 1-1:0] x_din_reg_ena; // X channel input register enable
output [ 1-1:0] y_din_reg_ena; // Y channel input register enable
output [3-1:0] x_din_bank; // X channel input bank select
output [3-1:0] y_din_bank; // Y channel input bank select
output [3-1:0] x_dout_bank; // X channel output bank select
output [3-1:0] y_dout_bank; // Y channel output bank select
output load_phase; // 0 = T1, T2; 1 = N, N_COEFF
output [ INDEX_WIDTH:0] load_xy_addr; // load address (INDEX_WIDTH+1 bits)
output load_xy_addr_vld; // address valid
output load_xy_req; // data request
input [WORD_WIDTH-1:0] load_x_din; // data input, X channel (one word)
input [WORD_WIDTH-1:0] load_y_din; // data input, Y channel (one word)
//
// FSM state register, its combinational next-state value, and the two
// pre-computed branch targets consumed by the transition logic.
//
reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;
reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
reg [FSM_STATE_WIDTH-1:0] fsm_state_after_idle;
reg [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;

//
// State to enter when leaving IDLE, selected by {transfer, mode}:
//   transfer = 1, mode = 1 : unload, not implemented -> stay in IDLE
//   transfer = 1, mode = 0 : load                    -> first load state
//   transfer = 0, mode = x : multiply                -> first column trigger
//
always @* begin
    case ({transfer, mode})
        2'b11:   fsm_state_after_idle = FSM_STATE_IDLE; //unload?
        2'b10:   fsm_state_after_idle = FSM_STATE_LOAD_T1T2_1;
        2'b01,
        2'b00:   fsm_state_after_idle = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
    endcase
end
//
// Column Counter
//
// All column index signals are INDEX_WIDTH-3 bits wide and are produced by
// the mmm_col_index instance, which follows fsm_state_next.
//
wire [ INDEX_WIDTH-4:0] col_index; // current column index (only connected to the instance in this module)
wire col_index_done; // selects the state entered after a column completes
wire [ INDEX_WIDTH-4:0] col_index_zero; // forwarded to the din address generator
wire [ INDEX_WIDTH-4:0] col_index_next; // forwarded to the din address generator
wire [ INDEX_WIDTH-4:0] col_index_prev; // used by the MAC clear logic (calc_mac_clear_square)
modexpng_mmm_col_index #
(
.INDEX_WIDTH(INDEX_WIDTH)
)
mmm_col_index
(
.clk (clk),
.index_last (index_last),
.fsm_state_next (fsm_state_next),
.col_index (col_index),
.col_index_done (col_index_done),
.col_index_zero (col_index_zero),
.col_index_next (col_index_next),
.col_index_prev (col_index_prev)
);
//
// Load Address Generator
//
// The transporter instance drives the load_* module outputs directly and
// reports address-related flags back to the FSM and to the dout address
// generator.
//
wire [INDEX_WIDTH-1:0] load_xy_addr_lsb = load_xy_addr[INDEX_WIDTH-1:0]; // load address without its top bit, fed to the scratchpad
wire load_addr_zero; // forwarded to the dout address generator
wire load_t1t2_addr_done; // ends the LOAD_T1T2 loop in the FSM
wire load_nn_coeff_addr_done; // ends the LOAD_NN_COEFF loop in the FSM
modexpng_mmm_transporter #
(
.INDEX_WIDTH(INDEX_WIDTH)
)
transporter
(
.clk (clk),
.ena (ena),
.index_last (index_last),
.fsm_state (fsm_state),
.fsm_state_next (fsm_state_next),
.load_phase (load_phase),
.load_xy_addr (load_xy_addr),
.load_xy_addr_vld (load_xy_addr_vld),
.load_xy_req (load_xy_req),
.load_addr_zero (load_addr_zero),
.load_t1t2_addr_done (load_t1t2_addr_done),
.load_nn_coeff_addr_done (load_nn_coeff_addr_done)
);
//
// X, Y Address
//
// din_addr_x drives the X channel input address, bank and enable outputs.
// It also outputs din_addr_cnt / din_addr_cnt_last, which are compared to
// form mult_square_addr_done, and the *_prev counter parts used by the MAC
// clear logic.
//
// NOTE(review): only an X channel din address generator is instantiated;
// the y_din_addr, y_din_bank, y_din_ena and y_din_reg_ena outputs have no
// driver in this module as shown -- confirm whether that is intended.
//
wire [INDEX_WIDTH-1:0] x_din_addr_cnt; // running din address counter
wire [INDEX_WIDTH-1:0] x_din_addr_cnt_last; // value of the counter that ends a column
wire [ 3-1:0] x_din_addr_cnt_lower_prev; // 3-bit lower part, "prev" version
wire [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_prev; // upper part, "prev" version
modexpng_mmm_din_addr #
(
.INDEX_WIDTH(INDEX_WIDTH)
)
din_addr_x
(
.clk (clk),
.rst_n (rst_n),
.index_last (index_last),
.fsm_state_next (fsm_state_next),
.col_index_zero (col_index_zero),
.col_index_next (col_index_next),
.din_addr (x_din_addr),
.din_bank (x_din_bank),
.din_ena (x_din_ena),
.din_reg_ena (x_din_reg_ena),
.din_addr_cnt (x_din_addr_cnt),
.din_addr_cnt_last (x_din_addr_cnt_last),
.din_addr_cnt_lower_prev (x_din_addr_cnt_lower_prev),
.din_addr_cnt_upper_prev (x_din_addr_cnt_upper_prev)
);
// dout_addr_xy drives the output address, enable and bank ports of both
// channels from the current FSM state and the load address.
modexpng_mmm_dout_addr #
(
.INDEX_WIDTH(INDEX_WIDTH)
)
dout_addr_xy
(
.clk (clk),
.rst_n (rst_n),
.fsm_state (fsm_state),
.load_xy_addr (load_xy_addr),
.load_addr_zero (load_addr_zero),
.load_nn_coeff_addr_done (load_nn_coeff_addr_done),
.x_dout_addr (x_dout_addr),
.y_dout_addr (y_dout_addr),
.x_dout_ena (x_dout_ena),
.y_dout_ena (y_dout_ena),
.x_dout_bank (x_dout_bank),
.y_dout_bank (y_dout_bank)
);
//
// Helper Memories ("Scratchpad")
//
// The X and Y read ports of the scratchpad share one read address and one
// read enable.  The write side is handled inside the pad instance from
// fsm_state, load_xy_addr_lsb and the load_*_din inputs.
//
// NOTE(review): pad_x_rd_dout and pad_y_rd_dout are not consumed anywhere
// in this module as shown.
//
reg [INDEX_WIDTH-1:0] pad_xy_rd_addr; // shared read address
reg pad_xy_rd_ena = 1'b0; // shared read enable
wire [ WORD_WIDTH-1:0] pad_x_rd_dout; // X read data
wire [ WORD_WIDTH-1:0] pad_y_rd_dout; // Y read data
wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_zero = {INDEX_WIDTH{1'b0}}; // read address restart value
wire [INDEX_WIDTH-1:0] pad_xy_rd_addr_next = pad_xy_rd_addr + 1'b1; // read address incremented by one
modexpng_mmm_pad pad
(
.clk (clk),
.rst_n (rst_n),
.fsm_state (fsm_state),
.load_xy_addr_lsb (load_xy_addr_lsb),
.load_x_din (load_x_din),
.load_y_din (load_y_din),
.pad_x_rd_addr (pad_xy_rd_addr),
.pad_y_rd_addr (pad_xy_rd_addr),
.pad_x_rd_ena (pad_xy_rd_ena),
.pad_y_rd_ena (pad_xy_rd_ena),
.pad_x_rd_dout (pad_x_rd_dout),
.pad_y_rd_dout (pad_y_rd_dout)
);
// Scratchpad read enable: asserted whenever the FSM is about to be in any
// of the four "square" multiplication states, deasserted otherwise and on
// reset.
always @(posedge clk or negedge rst_n) begin
    if (rst_n == 1'b0)
        pad_xy_rd_ena <= 1'b0;
    else begin
        case (fsm_state_next)
            FSM_STATE_MULT_SQUARE_COL_N_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_0_TRIG: pad_xy_rd_ena <= 1'b1;
            default:                          pad_xy_rd_ena <= 1'b0;
        endcase
    end
end
// Scratchpad read address: restarts from zero when a column is triggered,
// advances by one while the column is busy, and is a don't-care (X) in all
// other states.
always @(posedge clk) begin
    case (fsm_state_next)
        FSM_STATE_MULT_SQUARE_COL_N_BUSY,
        FSM_STATE_MULT_SQUARE_COL_0_BUSY: pad_xy_rd_addr <= pad_xy_rd_addr_next;
        FSM_STATE_MULT_SQUARE_COL_N_TRIG,
        FSM_STATE_MULT_SQUARE_COL_0_TRIG: pad_xy_rd_addr <= pad_xy_rd_addr_zero;
        default:                          pad_xy_rd_addr <= {INDEX_WIDTH{1'bX}};
    endcase
end
//
// Flags
//
// mult_square_addr_done: the din address counter has reached the value that
// ends the current column.
//
wire mult_square_addr_done = x_din_addr_cnt == x_din_addr_cnt_last;

//
// State entered when a non-first column completes: STOP after the last
// column (the triangle phase is not wired in yet), otherwise the trigger
// state of the next column.
//
always @*
    //
    fsm_state_after_mult_square = col_index_done ? /*FSM_STATE_MULT_TRIANGLE_TRIG*/FSM_STATE_STOP : FSM_STATE_MULT_SQUARE_COL_N_TRIG;
//
// MAC Arrays
//
// One modexpng_mac_array instance per channel.  For each channel:
//   *_ce, *_clr, *_casc_a are registered control signals driven by the
//   clock enable, clear and cascade logic further below;
//   *_a is the NUM_MULTS-word wide operand bus;
//   *_b is a single-word operand;
//   *_p / *_p_aux are the MAC outputs.
//
// NOTE(review): the *_aux control/data registers (ce_aux, clr_aux,
// casc_a_aux, a_aux) and mac_x_b / mac_y_b are declared here but not
// assigned anywhere in this module as shown; mac_*_p and mac_*_p_aux are
// not read.  Presumably this is work in progress -- confirm.
//
reg mac_x_ce = 1'b0;
reg mac_x_ce_aux = 1'b0;
reg [NUM_MULTS -1:0] mac_x_clr;
reg mac_x_clr_aux;
reg [NUM_MULTS -2:0] mac_x_casc_a;
reg mac_x_casc_a_aux;
wire [NUM_MULTS * WORD_WIDTH -1:0] mac_x_a;
reg [ 1 * WORD_WIDTH -1:0] mac_x_a_aux;
//wire [ 1 * WORD_WIDTH -1:0] mac_x_a_split[0:NUM_MULTS-1];
reg [ 1 * WORD_WIDTH -1:0] mac_x_b;
wire [NUM_MULTS * MAC_WIDTH -1:0] mac_x_p;
wire [ 1 * MAC_WIDTH -1:0] mac_x_p_aux;
reg mac_y_ce = 1'b0;
reg mac_y_ce_aux = 1'b0;
reg [NUM_MULTS -1:0] mac_y_clr;
reg mac_y_clr_aux;
reg [NUM_MULTS -2:0] mac_y_casc_a;
reg mac_y_casc_a_aux;
wire [NUM_MULTS * WORD_WIDTH -1:0] mac_y_a;
reg [ 1 * WORD_WIDTH -1:0] mac_y_a_aux;
//wire [ 1 * WORD_WIDTH -1:0] mac_y_a_split[0:NUM_MULTS-1];
reg [ 1 * WORD_WIDTH -1:0] mac_y_b;
wire [NUM_MULTS * MAC_WIDTH -1:0] mac_y_p;
wire [ 1 * MAC_WIDTH -1:0] mac_y_p_aux;
// X channel MAC array
modexpng_mac_array mac_array_x
(
.clk (clk),
.ce (mac_x_ce),
.ce_aux (mac_x_ce_aux),
.clr (mac_x_clr),
.clr_aux (mac_x_clr_aux),
.casc_a (mac_x_casc_a),
.casc_a_aux (mac_x_casc_a_aux),
.a_in (mac_x_a),
.a_in_aux (mac_x_a_aux),
.b_in (mac_x_b),
.p_out (mac_x_p),
.p_out_aux (mac_x_p_aux)
);
// Y channel MAC array
modexpng_mac_array mac_array_y
(
.clk (clk),
.ce (mac_y_ce),
.ce_aux (mac_y_ce_aux),
.clr (mac_y_clr),
.clr_aux (mac_y_clr_aux),
.casc_a (mac_y_casc_a),
.casc_a_aux (mac_y_casc_a_aux),
.a_in (mac_y_a),
.a_in_aux (mac_y_a_aux),
.b_in (mac_y_b),
.p_out (mac_y_p),
.p_out_aux (mac_y_p_aux)
);
//
// MAC "A" operand mapping: multiplier number gen_z of each channel receives
// word number gen_z of that channel's wide input bus.
//
// gen_z is declared at module scope because it is reused by later generate
// loops.
//
genvar gen_z;
generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
    begin : gen_xy_din
        assign mac_x_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
        // The Y channel needs the same mapping, otherwise the a_in port of
        // mac_array_y is left floating.
        // NOTE(review): assumes Y mirrors X (y_din -> mac_y_a) -- confirm.
        assign mac_y_a[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_din[gen_z*WORD_WIDTH+:WORD_WIDTH];
    end
endgenerate
//
// MAC Clock Enable Logic
//
// Two-stage pipeline: mac_xy_ce_adv is set one cycle after the FSM is in
// any "square" multiplication state, and the per-channel clock enables
// follow it one cycle later.  Both stages are cleared by reset.
//
reg mac_xy_ce_adv = 1'b0;

always @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        mac_xy_ce_adv <= 1'b0;
    else begin
        case (fsm_state)
            FSM_STATE_MULT_SQUARE_COL_N_BUSY,
            FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            FSM_STATE_MULT_SQUARE_COL_0_TRIG: mac_xy_ce_adv <= 1'b1;
            default:                          mac_xy_ce_adv <= 1'b0;
        endcase
    end
end

always @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        mac_x_ce <= 1'b0;
        mac_y_ce <= 1'b0;
    end else begin
        mac_x_ce <= mac_xy_ce_adv;
        mac_y_ce <= mac_xy_ce_adv;
    end
end
//
// MAC Clear Logic
//
// Two-stage pipeline mirroring the clock enable logic.  In a trigger state
// every MAC is cleared; in a busy state the clear mask comes from
// calc_mac_clear_square; in any other state the mask is a don't-care.
//
wire [NUM_MULTS-1:0] calc_mac_x_clear_square_value =
    calc_mac_clear_square(col_index_prev,
                          x_din_addr_cnt_lower_prev,
                          x_din_addr_cnt_upper_prev);

reg  [NUM_MULTS-1:0] mac_xy_clr_adv;

always @(posedge clk) begin
    case (fsm_state)
        FSM_STATE_MULT_SQUARE_COL_N_BUSY,
        FSM_STATE_MULT_SQUARE_COL_0_BUSY: mac_xy_clr_adv <= calc_mac_x_clear_square_value;
        FSM_STATE_MULT_SQUARE_COL_N_TRIG,
        FSM_STATE_MULT_SQUARE_COL_0_TRIG: mac_xy_clr_adv <= {NUM_MULTS{1'b1}};
        default:                          mac_xy_clr_adv <= {NUM_MULTS{1'bX}};
    endcase
end

// both channels share the same clear mask
always @(posedge clk) begin
    mac_x_clr <= mac_xy_clr_adv;
    mac_y_clr <= mac_xy_clr_adv;
end
//
// MAC Cascade Logic
//
// Two-stage pipeline mirroring the clock enable logic.  The cascade select
// bits are all zero in a trigger state, all one in a busy state, and a
// don't-care in any other state.
//
reg [NUM_MULTS-2:0] mac_xy_casc_a_adv;

always @(posedge clk) begin
    case (fsm_state)
        FSM_STATE_MULT_SQUARE_COL_N_BUSY,
        FSM_STATE_MULT_SQUARE_COL_0_BUSY: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b1}};
        FSM_STATE_MULT_SQUARE_COL_N_TRIG,
        FSM_STATE_MULT_SQUARE_COL_0_TRIG: mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'b0}};
        default:                          mac_xy_casc_a_adv <= {(NUM_MULTS-1){1'bX}};
    endcase
end

// both channels share the same cascade select
always @(posedge clk) begin
    mac_x_casc_a <= mac_xy_casc_a_adv;
    mac_y_casc_a <= mac_xy_casc_a_adv;
end
//
// DOUT
//
// One output register per word and channel.  The registers are declared
// ahead of the mapping below, because Verilog requires an identifier to be
// declared before it is referenced in a continuous assignment.
//
reg [WORD_WIDTH-1:0] x_dout_reg[0:NUM_MULTS-1];
reg [WORD_WIDTH-1:0] y_dout_reg[0:NUM_MULTS-1];

//
// DOUT Mapping
//
// Word number gen_z of each wide output bus is driven by output register
// number gen_z of the same channel.
//
generate for (gen_z=0; gen_z<NUM_MULTS; gen_z=gen_z+1)
    begin : gen_xy_dout
        assign x_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = x_dout_reg[gen_z];
        assign y_dout[gen_z*WORD_WIDTH+:WORD_WIDTH] = y_dout_reg[gen_z];
    end
endgenerate

//
// In the third step of either load loop, the incoming load word is copied
// into every output register of its channel.  In all other states the
// registers are don't-care.
//
integer int_z;

always @(posedge clk)
    //
    case (fsm_state)
        //
        FSM_STATE_LOAD_T1T2_3,
        FSM_STATE_LOAD_NN_COEFF_3:
            for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
                x_dout_reg[int_z] <= load_x_din;
                y_dout_reg[int_z] <= load_y_din;
            end
        //
        default:
            for (int_z=0; int_z<NUM_MULTS; int_z=int_z+1) begin
                x_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
                y_dout_reg[int_z] <= {WORD_WIDTH{1'bX}};
            end
        //
    endcase
//
// FSM Process
//
// State register: returns to IDLE on asynchronous reset, otherwise takes
// the combinational next-state value on every rising clock edge.
//
always @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        fsm_state <= FSM_STATE_IDLE;
    else
        fsm_state <= fsm_state_next;
end
//
// FSM Transition Logic
//
// Combinational next-state function.  Any state without an explicit entry
// falls back to IDLE.
//
always @* begin
    case (fsm_state)
        // idle: wait for ena, then branch on {transfer, mode}
        FSM_STATE_IDLE:
            fsm_state_next = ena ? fsm_state_after_idle : FSM_STATE_IDLE;

        // load loop, phase T1/T2: three steps per word
        FSM_STATE_LOAD_T1T2_1:
            fsm_state_next = FSM_STATE_LOAD_T1T2_2;
        FSM_STATE_LOAD_T1T2_2:
            fsm_state_next = FSM_STATE_LOAD_T1T2_3;
        FSM_STATE_LOAD_T1T2_3:
            fsm_state_next = load_t1t2_addr_done ? FSM_STATE_LOAD_NN_COEFF_1 : FSM_STATE_LOAD_T1T2_1;

        // load loop, phase N/N_COEFF: three steps per word
        FSM_STATE_LOAD_NN_COEFF_1:
            fsm_state_next = FSM_STATE_LOAD_NN_COEFF_2;
        FSM_STATE_LOAD_NN_COEFF_2:
            fsm_state_next = FSM_STATE_LOAD_NN_COEFF_3;
        FSM_STATE_LOAD_NN_COEFF_3:
            fsm_state_next = load_nn_coeff_addr_done ? FSM_STATE_STOP : FSM_STATE_LOAD_NN_COEFF_1;

        // "square" multiplication, first column
        FSM_STATE_MULT_SQUARE_COL_0_TRIG:
            fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY;
        FSM_STATE_MULT_SQUARE_COL_0_BUSY:
            fsm_state_next = mult_square_addr_done ? FSM_STATE_MULT_SQUARE_COL_N_TRIG : FSM_STATE_MULT_SQUARE_COL_0_BUSY;

        // "square" multiplication, remaining columns
        FSM_STATE_MULT_SQUARE_COL_N_TRIG:
            fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY;
        FSM_STATE_MULT_SQUARE_COL_N_BUSY:
            fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY;

        // Not wired in yet (kept for reference):
        //   FSM_STATE_TRIANGLE_COL_0_TRIG:  fsm_state_next = FSM_STATE_TRIANGLE_COL_0_BUSY;
        //   FSM_STATE_TRIANGLE_COL_0_BUSY:  fsm_state_next = din_addr_narrow_done ? FSM_STATE_TRIANGLE_COL_N_TRIG  : FSM_STATE_TRIANGLE_COL_0_BUSY;
        //   FSM_STATE_TRIANGLE_COL_N_TRIG:  fsm_state_next = FSM_STATE_TRIANGLE_COL_N_BUSY;
        //   FSM_STATE_TRIANGLE_COL_N_BUSY:  fsm_state_next = din_addr_narrow_done ? fsm_state_after_triangle       : FSM_STATE_TRIANGLE_COL_N_BUSY;
        //   FSM_STATE_RECTANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_0_BUSY;
        //   FSM_STATE_RECTANGLE_COL_0_BUSY: fsm_state_next = din_addr_narrow_done ? FSM_STATE_RECTANGLE_COL_N_TRIG : FSM_STATE_RECTANGLE_COL_0_BUSY;
        //   FSM_STATE_RECTANGLE_COL_N_TRIG: fsm_state_next = FSM_STATE_RECTANGLE_COL_N_BUSY;
        //   FSM_STATE_RECTANGLE_COL_N_BUSY: fsm_state_next = din_addr_narrow_done ? fsm_state_after_rectangle      : FSM_STATE_RECTANGLE_COL_N_BUSY;

        // stop: single-cycle state before returning to idle
        FSM_STATE_STOP:
            fsm_state_next = FSM_STATE_IDLE;

        default:
            fsm_state_next = FSM_STATE_IDLE;
    endcase
end
//
// Ready Output
//
// rdy is high after reset, drops when ena is seen in the idle state and is
// raised again in the stop state; it holds its value in every other state.
//
reg rdy_reg = 1'b1;

assign rdy = rdy_reg;

always @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        rdy_reg <= 1'b1;
    else if (fsm_state == FSM_STATE_IDLE) begin
        if (ena)
            rdy_reg <= 1'b0;
    end
    else if (fsm_state == FSM_STATE_STOP)
        rdy_reg <= 1'b1;
end
//
// calc_mac_clear_square
//
// Returns the per-multiplier clear mask for a busy cycle of the "square"
// phase.  When the upper part of the delayed din address counter equals the
// delayed column index, exactly one bit is set: bit number
// x_din_addr_cnt_lower_delayed.  Otherwise the mask is all zeros.
//
// The one-hot value is built with a shift of a NUM_MULTS-wide constant, so
// the result width always matches NUM_MULTS and the result is assigned on
// every path (an unknown index yields an unknown mask instead of a stale
// value from the previous call).
//
function [NUM_MULTS-1:0] calc_mac_clear_square;
    input [INDEX_WIDTH-4:0] col_index_delayed;
    input [          3-1:0] x_din_addr_cnt_lower_delayed;
    input [INDEX_WIDTH-4:0] x_din_addr_cnt_upper_delayed;
    begin
        if (x_din_addr_cnt_upper_delayed == col_index_delayed)
            calc_mac_clear_square = {{(NUM_MULTS-1){1'b0}}, 1'b1} << x_din_addr_cnt_lower_delayed;
        else
            calc_mac_clear_square = {NUM_MULTS{1'b0}};
    end
endfunction
endmodule