//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================
module modexpa7_systolic_multiplier #
(
//
// This sets the address widths of memory buffers. Internal data
// width is 32 bits, so for e.g. 2048-bit operands buffers must store
// 2048 / 32 = 64 words, and these need 6-bit address bus, because
// 2 ** 6 = 64.
//
parameter OPERAND_ADDR_WIDTH = 4,
//
// This sets the width of the inner word counter (mult_cnt) that is used
// while operand words are shifted into the loader buffer. The same number
// of low-order bits is dropped from the latched word count to obtain the
// index of the last systolic cycle (syst_cnt_last). Presumably the array
// has 2 ** SYSTOLIC_ARRAY_POWER processing elements (SYSTOLIC_ARRAY_LENGTH
// comes from the included settings file) -- TODO confirm.
//
parameter SYSTOLIC_ARRAY_POWER = 2
)
(
input clk, // clock, all registers are updated on its rising edge
input rst_n, // active-low asynchronous reset of the FSM state and the ready flag
input ena, // a low-to-high transition starts a new operation
output rdy, // ready flag: cleared when an operation starts, set again in the STOP state
output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, // word address driven to the external A memory
output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, // word address driven to the external B memory
output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, // word address driven to the external N memory
output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, // word address driven to the external N_COEFF memory
output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, // word address driven to the external result memory
input [ 32-1:0] a_bram_out, // data word read from the A memory
input [ 32-1:0] b_bram_out, // data word read from the B memory
input [ 32-1:0] n_bram_out, // data word read from the N memory
input [ 32-1:0] n_coeff_bram_out, // data word read from the N_COEFF memory
output [ 32-1:0] r_bram_in, // data word written to the result memory
output r_bram_wr, // write enable of the result memory
// latched when an operation starts and then used directly as the address of
// the last operand word, so presumably this is the word count minus one
// -- TODO confirm against the caller
input [OPERAND_ADDR_WIDTH-1:0] ab_num_words
);
//
// Include Settings
//
`include "pe/modexpa7_primitive_switch.v"
`include "modexpa7_settings.v"
//
// FSM Declaration
//
// The upper nibble of a state code selects the phase, the lower nibble is
// the step inside that phase:
//   8'h1x - load B into the loader memories
//   8'h2x - load N_COEFF into the loader memories
//   8'h3x - load N into the loader memories
//   8'h4x - multiply A by B (result goes to the AB memory)
//   8'h5x - multiply AB by N_COEFF (result goes to the Q memory)
//   8'h6x - multiply Q by N (result goes to the QN memory), add and subtract
//   8'h7x - write the result to the output memory
//   8'hFF - stop, the ready flag is set and the FSM returns to idle
//
localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;
localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11;
localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12;
localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13;
localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14;
localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21;
localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22;
localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23;
localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24;
localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31;
localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32;
localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33;
localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34;
localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41;
localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42;
localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43;
localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44;
localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51;
localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54;
localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61;
localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62;
localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63;
localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64;
localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h71;
localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h72;
localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h73;
localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;
//
// FSM State / Next State
//
/* current state (registered, initialised to idle) */
reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
/* next state (combinational, see the transition logic) */
reg [ 7: 0] fsm_next_state;
//
// Rising-Edge Detector for the Enable Input
//
/* value of the enable input one clock cycle ago */
reg ena_dly = 1'b0;
always @(posedge clk) begin
    ena_dly <= ena;
end
/* high for the cycle in which enable is high, but was low a cycle earlier */
wire ena_trig;
assign ena_trig = ena & ~ena_dly;
//
// Ready Flag
//
/* the flag starts out set and is exported through the rdy port */
reg rdy_reg = 1'b1;
assign rdy = rdy_reg;
always @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        /* asynchronous reset leaves the core ready */
        rdy_reg <= 1'b1;
    else
        case (fsm_state)
            /* idle: drop the flag in the cycle a new operation is triggered */
            FSM_STATE_IDLE: rdy_reg <= ~ena_trig;
            /* stop: operation is finished, raise the flag again */
            FSM_STATE_STOP: rdy_reg <= 1'b1;
        endcase
end
//
// Operand Length Latch
//
/* copy of ab_num_words that stays stable for the whole operation */
reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;
/* the copy is taken at the clock edge on which the FSM enters LOAD_B_START */
wire ab_num_words_capture = (fsm_next_state == FSM_STATE_LOAD_B_START);
always @(posedge clk) begin
    if (ab_num_words_capture)
        ab_num_words_latch <= ab_num_words;
end
//
// Systolic Cycle Counters
//
/* first and last cycle index; the last index is the latched word count
   with its SYSTOLIC_ARRAY_POWER low-order bits dropped */
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];
/* initialization counter: register, incremented value, "at last index" flag */
reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1;
wire                           syst_cnt_init_done = (syst_cnt_init == syst_cnt_last);
/* load counter: register, incremented value, "at last index" flag */
reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1;
wire                           syst_cnt_load_done = (syst_cnt_load == syst_cnt_last);
/* unload counter: register, incremented value, "at last index" flag */
reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;
wire                           syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last);
/* load counter delayed by one clock cycle */
reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
always @(posedge clk) begin
    syst_cnt_load_dly <= syst_cnt_load;
end
//
// Multiplier Iteration Counter
//
/* all-zeroes and all-ones values of the counter */
wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = ~{SYSTOLIC_ARRAY_POWER{1'b0}};
/* counter register, incremented value, "at last value" flag */
reg  [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;
wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
wire                            mult_cnt_done = (mult_cnt == mult_cnt_last);
//
// Counters Used While Operands Are Loaded
//
/* inner counter: cleared at the start of every load phase, incremented in
   every SHIFT cycle, keeps its value in all the other states */
always @(posedge clk)
    case (fsm_state)
        FSM_STATE_LOAD_B_START,
        FSM_STATE_LOAD_N_COEFF_START,
        FSM_STATE_LOAD_N_START:
            mult_cnt <= mult_cnt_zero;
        FSM_STATE_LOAD_B_SHIFT,
        FSM_STATE_LOAD_N_COEFF_SHIFT,
        FSM_STATE_LOAD_N_SHIFT:
            mult_cnt <= mult_cnt_next;
    endcase
/* outer counter: cleared at the start of every load phase, advanced in
   every WRITE cycle until the last index is reached, then held there */
always @(posedge clk)
    case (fsm_state)
        FSM_STATE_LOAD_B_START,
        FSM_STATE_LOAD_N_COEFF_START,
        FSM_STATE_LOAD_N_START:
            syst_cnt_init <= syst_cnt_zero;
        FSM_STATE_LOAD_B_WRITE,
        FSM_STATE_LOAD_N_COEFF_WRITE,
        FSM_STATE_LOAD_N_WRITE:
            syst_cnt_init <= syst_cnt_init_done ? syst_cnt_init : syst_cnt_init_next;
    endcase
//
// Operand Loader
//
/*
* The loader is a set of SYSTOLIC_ARRAY_LENGTH identical single-port
* memories, one per index i of the generate loop below. Every memory is
* addressed by {loader_addr_msb[i], loader_addr_lsb[i]}: the 2-bit MSB part
* selects the bank (B, N_COEFF or N), the LSB part selects the word inside
* that bank. Operand words are shifted through the loader_din[] registers
* and the write enables of all the memories are driven together, so that
* one write cycle stores one word into each of the memories.
*
*/
/* loader banks */
localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0;
localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2;
/* loader input */
reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1];
reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];
/* loader output */
wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1];
/* generate parallelized loader */
//
// Loader currently stores B, N_COEFF and N, it can be coded another way
// to initially store B, then AB, then Q. Some memory can be saved that way.
// Maybe later...
//
genvar i;
generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
//
begin : gen_bram_1rw_readfirst_loader
//
/* two extra address bits are needed for the bank number */
bram_1rw_readfirst #
(
.MEM_WIDTH (32),
.MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2)
)
bram_loader
(
.clk (clk),
.a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}),
.a_wr (loader_wren[i]),
.a_in (loader_din[i]),
.a_out (loader_dout[i])
);
//
end
//
endgenerate
//
// Block Memory Addresses
//
/*
 * Two address widths are used: operand-sized memories take
 * OPERAND_ADDR_WIDTH bits, the AB and QN memories take one bit more
 * (the "_ext" addresses below).
 *
 */
/* the very first and the very last address of an operand-sized memory */
wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero     = {OPERAND_ADDR_WIDTH{1'b0}};
wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last     = ab_num_words_latch;
/* the very first and the very last address of an extended memory */
wire [OPERAND_ADDR_WIDTH  :0] bram_addr_ext_zero = {(OPERAND_ADDR_WIDTH+1){1'b0}};
wire [OPERAND_ADDR_WIDTH  :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1};
/* every address below comes as a register, its incremented value and a
   flag that is high when the register points at the last address */
/* A */
reg  [OPERAND_ADDR_WIDTH-1:0] a_addr;
wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1;
wire                          a_addr_done = (a_addr == bram_addr_last);
/* B */
reg  [OPERAND_ADDR_WIDTH-1:0] b_addr;
wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1;
wire                          b_addr_done = (b_addr == bram_addr_last);
/* N_COEFF */
reg  [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr;
wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1;
wire                          n_coeff_addr_done = (n_coeff_addr == bram_addr_last);
/* N */
reg  [OPERAND_ADDR_WIDTH-1:0] n_addr;
wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1;
wire                          n_addr_done = (n_addr == bram_addr_last);
/* AB (extended) */
reg  [OPERAND_ADDR_WIDTH  :0] ab_addr_ext;
wire [OPERAND_ADDR_WIDTH  :0] ab_addr_ext_next = ab_addr_ext + 1'b1;
wire                          ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last);
/* Q */
reg  [OPERAND_ADDR_WIDTH-1:0] q_addr;
wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1;
wire                          q_addr_done = (q_addr == bram_addr_last);
/* QN (extended) */
reg  [OPERAND_ADDR_WIDTH  :0] qn_addr_ext;
wire [OPERAND_ADDR_WIDTH  :0] qn_addr_ext_next = qn_addr_ext + 1'b1;
wire                          qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last);
/* S */
reg  [OPERAND_ADDR_WIDTH-1:0] s_addr;
wire [OPERAND_ADDR_WIDTH-1:0] s_addr_next = s_addr + 1'b1;
wire                          s_addr_done = (s_addr == bram_addr_last);
/* SN */
reg  [OPERAND_ADDR_WIDTH-1:0] sn_addr;
wire [OPERAND_ADDR_WIDTH-1:0] sn_addr_next = sn_addr + 1'b1;
wire                          sn_addr_done = (sn_addr == bram_addr_last);
/* R */
reg  [OPERAND_ADDR_WIDTH-1:0] r_addr;
wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next = r_addr + 1'b1;
wire                          r_addr_done = (r_addr == bram_addr_last);
/* B, N_COEFF and N addresses delayed by one clock cycle */
reg  [OPERAND_ADDR_WIDTH-1:0] b_addr_dly;
reg  [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly;
reg  [OPERAND_ADDR_WIDTH-1:0] n_addr_dly;
always @(posedge clk) begin
    b_addr_dly       <= b_addr;
    n_coeff_addr_dly <= n_coeff_addr;
    n_addr_dly       <= n_addr;
end
/* address registers drive the top-level address ports directly */
assign a_bram_addr       = a_addr;
assign b_bram_addr       = b_addr;
assign n_coeff_bram_addr = n_coeff_addr;
assign n_bram_addr       = n_addr;
assign r_bram_addr       = r_addr;
//
// Flag
//
/* selects which of the two candidate results (S or SN) is saved */
reg flag_select_s;
//
// Memory Address Control Logic
//
// Some of the case statements below are keyed on fsm_next_state instead of
// fsm_state: the address register is then updated at the same clock edge at
// which the FSM enters the corresponding state.
//
always @(posedge clk) begin
//
/* operand loading: restart the read address when a load phase starts,
   advance it for every SHIFT cycle */
case (fsm_next_state)
FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero;
FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero;
FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero;
FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next;
FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next;
FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next;
endcase
//
/* Q * N phase: N is read a second time (n_bram_out feeds the subtractor),
   its address restarts when the QN address reaches the last operand
   address and is advanced once per RELOAD after that */
case (fsm_state)
FSM_STATE_MULT_Q_N_RELOAD:
if (qn_addr_ext == {1'b0, bram_addr_last})
n_addr <= bram_addr_zero;
else if (qn_addr_ext > {1'b0, bram_addr_last})
n_addr <= n_addr_next;
endcase
//
/* result write address */
case (fsm_state)
FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
endcase
//
/* A read address: advanced once per RELOAD, held at the last address */
case (fsm_next_state)
FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero;
FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? a_addr_next : a_addr;
endcase
//
end
//
// Internal Memories
//
// AB, Q and QN hold the outputs of the three multiplication phases, S and SN
// hold the outputs of the adder and of the subtractor. AB and QN have one
// address bit more than the other memories.
//
/* memory inputs */
reg [31: 0] ab_data_in;
reg [31: 0] q_data_in;
reg [31: 0] qn_data_in;
wire [31: 0] s_data_in;
wire [31: 0] sn_data_in;
reg [31: 0] r_data_in;
/* memory outputs */
wire [31: 0] ab_data_out;
wire [31: 0] q_data_out;
wire [31: 0] qn_data_out;
wire [31: 0] s_data_out;
wire [31: 0] sn_data_out;
/* write enables */
reg ab_wren;
reg q_wren;
reg qn_wren;
reg s_wren;
reg sn_wren;
reg r_wren;
/* map */
/* the result memory is external, only its write port is driven from here */
assign r_bram_in = r_data_in;
assign r_bram_wr = r_wren;
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));
bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));
//
// Wide Operand Loader
//
// In every SHIFT cycle the loader_din[] registers are shifted by one
// position towards index 0 and a new word enters at the highest index.
// The three branches differ only in the operand they take the word from.
//
/* loop variable shared by the procedural for-loops of this module */
integer j;
/* shift logic */
always @(posedge clk)
//
case (fsm_state)
//
FSM_STATE_LOAD_B_SHIFT: begin
/* update the rightmost part of loader buffer */
/* zero is shifted in when the delayed address is past the last operand word;
   the delayed address is presumably used to account for the one-cycle read
   latency of the external memory -- TODO confirm */
loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};
/* shift the loader buffer to the left */
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
loader_din[j-1] <= loader_din[j];
end
//
FSM_STATE_LOAD_N_COEFF_SHIFT: begin
/* update the rightmost part of loader buffer */
loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}};
/* shift the loader buffer to the left */
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
loader_din[j-1] <= loader_din[j];
end
//
FSM_STATE_LOAD_N_SHIFT: begin
/* update the rightmost part of loader buffer */
loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? n_bram_out : {32{1'b0}};
/* shift the loader buffer to the left */
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
loader_din[j-1] <= loader_din[j];
end
//
endcase
/* loader write enables: all the loader memories are written together, the
   enables are high only while the FSM is in one of the WRITE states */
always @(posedge clk) begin
    /* default: no write */
    for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
        loader_wren[j] <= 1'b0;
    /* override the default when a WRITE state is entered at this clock edge */
    case (fsm_next_state)
        FSM_STATE_LOAD_B_WRITE,
        FSM_STATE_LOAD_N_COEFF_WRITE,
        FSM_STATE_LOAD_N_WRITE:
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_wren[j] <= 1'b1;
    endcase
end
/* loader address update logic */
//
// loader_addr_lsb[] (word inside the bank) mirrors syst_cnt_init while the
// operands are stored and syst_cnt_load while they are read back during the
// multiplication phases. loader_addr_msb[] (bank) is switched when a load
// phase or a multiplication phase starts.
//
always @(posedge clk) begin
    //
    // word address while operands are stored: same update rule as the one
    // used for syst_cnt_init
    //
    case (fsm_state)
        FSM_STATE_LOAD_B_START,
        FSM_STATE_LOAD_N_COEFF_START,
        FSM_STATE_LOAD_N_START:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_lsb[j] <= syst_cnt_zero;
        FSM_STATE_LOAD_B_WRITE,
        FSM_STATE_LOAD_N_COEFF_WRITE,
        FSM_STATE_LOAD_N_WRITE:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_lsb[j] <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
    endcase
    //
    // word address while operands are read back: same update rule as the one
    // used for syst_cnt_load. The hold value is syst_cnt_load (not the
    // initialization counter), so that the address tracks the load counter
    // no matter where the initialization counter was left.
    //
    case (fsm_next_state)
        FSM_STATE_MULT_A_B_START,
        FSM_STATE_MULT_AB_N_COEFF_START,
        FSM_STATE_MULT_Q_N_START,
        FSM_STATE_MULT_A_B_RELOAD,
        FSM_STATE_MULT_AB_N_COEFF_RELOAD,
        FSM_STATE_MULT_Q_N_RELOAD:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_lsb[j] <= syst_cnt_zero;
        FSM_STATE_MULT_A_B_CRUNCH,
        FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
        FSM_STATE_MULT_Q_N_CRUNCH:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
    endcase
    //
    // bank selection: B for loading B and for A * B, N_COEFF for loading
    // N_COEFF and for AB * N_COEFF, N for loading N and for Q * N
    //
    case (fsm_next_state)
        FSM_STATE_LOAD_B_START,
        FSM_STATE_MULT_A_B_START:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_msb[j] <= LOADER_ADDR_MSB_B;
        FSM_STATE_LOAD_N_COEFF_START,
        FSM_STATE_MULT_AB_N_COEFF_START:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF;
        FSM_STATE_LOAD_N_START,
        FSM_STATE_MULT_Q_N_START:
            //
            for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
                loader_addr_msb[j] <= LOADER_ADDR_MSB_N;
    endcase
    //
end
//
// Systolic Array of Processing Elements
//
/* registered inputs of the processing elements, one set per element */
reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1];
reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1];
reg [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1];
reg [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1];
/* outputs of the processing elements */
wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1];
wire [31: 0] pe_c_out[0:SYSTOLIC_ARRAY_LENGTH-1];
//
// These can be turned into a FIFO (maybe later?)...
//
// Storage for the C_OUT and P outputs that are fed back into C_IN and T.
// Both memories are accessed as mem[cycle][element] everywhere in this
// module (the first index is one of the systolic cycle counters, the second
// one runs over the processing elements), so the cycle dimension has to be
// declared first. With the dimensions the other way round the accesses only
// stay in range when SYSTOLIC_NUM_CYCLES equals SYSTOLIC_ARRAY_LENGTH.
//
reg [31: 0] pe_c_out_mem[0:SYSTOLIC_NUM_CYCLES-1][0:SYSTOLIC_ARRAY_LENGTH-1];
reg [31: 0] pe_t_mem    [0:SYSTOLIC_NUM_CYCLES-1][0:SYSTOLIC_ARRAY_LENGTH-1];
/* one processing element per array position */
generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1)
    begin : modexpa7_systolic_pe_multiplier
        modexpa7_systolic_pe systolic_pe_inst
        (
            .clk    (clk),
            .a      (pe_a[i]),
            .b      (pe_b[i]),
            .t      (pe_t[i]),
            .c_in   (pe_c_in[i]),
            .p      (pe_p[i]),
            .c_out  (pe_c_out[i])
        );
    end
endgenerate
//
// Shift Registers
//
// Three one-hot shift registers time the load / latency / unload sequence of
// one pass through the array. They are initialised in the START and RELOAD
// states and shifted by one position in every CRUNCH cycle:
//   shreg_load    - a single one travels from bit 0, loading is done when it
//                   reaches bit syst_cnt_last
//   shreg_latency - a single one travels from bit 0, latency is over when it
//                   reaches bit SYSTOLIC_PE_LATENCY
//   shreg_unload  - is fed from the top bit of shreg_latency, unloading is
//                   done when the one reaches bit syst_cnt_last
//
reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load;
reg [SYSTOLIC_PE_LATENCY :0] shreg_latency;
reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload;
wire shreg_done_load = shreg_load[syst_cnt_last];
wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY];
wire shreg_done_unload = shreg_unload[syst_cnt_last];
/* flags that tell which of the three stages are active right now */
reg shreg_now_loading;
reg shreg_now_latency;
reg shreg_now_unloading;
/* "latency is over" pulse delayed by one clock cycle */
reg shreg_done_latency_dly;
always @(posedge clk)
shreg_done_latency_dly <= shreg_done_latency;
always @(posedge clk)
//
case (fsm_state)
//
/* start of a pass: arm the load and latency registers, clear the unload one */
FSM_STATE_MULT_A_B_START,
FSM_STATE_MULT_AB_N_COEFF_START,
FSM_STATE_MULT_Q_N_START,
FSM_STATE_MULT_A_B_RELOAD,
FSM_STATE_MULT_AB_N_COEFF_RELOAD,
FSM_STATE_MULT_Q_N_RELOAD: begin
shreg_now_loading <= 1'b1;
shreg_now_latency <= 1'b1;
shreg_now_unloading <= 1'b0;
shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1};
shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0};
end
//
/* during a pass: shift, and switch the stage flags as the ones arrive */
FSM_STATE_MULT_A_B_CRUNCH,
FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
FSM_STATE_MULT_Q_N_CRUNCH: begin
shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0};
shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0};
shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]};
if (shreg_done_load) shreg_now_loading <= 1'b0;
if (shreg_done_latency) shreg_now_latency <= 1'b0;
/* unloading starts when the latency is over and stops after the last cycle */
if (shreg_done_latency) shreg_now_unloading <= 1'b1;
else if (shreg_done_unload) shreg_now_unloading <= 1'b0;
end
//
/* any other state: only the flags are cleared, the shift registers keep
   their contents */
default: begin
shreg_now_loading <= 1'b0;
shreg_now_latency <= 1'b0;
shreg_now_unloading <= 1'b0;
end
//
endcase
//
// Address logic of the internal memories (AB, Q, QN, S, SN).
//
// NOTE: ab_addr_ext and q_addr are assigned both in the first case statement
// (keyed on fsm_state) and in one of the last two (keyed on fsm_next_state).
// All assignments are non-blocking, so when two of them are active at the
// same clock edge the one that comes later in the block wins. The order of
// the case statements must be preserved.
//
always @(posedge clk) begin
//
/* write addresses of the current product (and the AB read address during
   the Q * N phase): cleared in START, advanced in RELOAD */
case (fsm_state)
FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero;
FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero;
FSM_STATE_MULT_Q_N_START: begin qn_addr_ext <= bram_addr_ext_zero;
ab_addr_ext <= bram_addr_ext_zero;
end
FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next;
FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next;
FSM_STATE_MULT_Q_N_RELOAD: begin qn_addr_ext <= qn_addr_ext_next;
ab_addr_ext <= ab_addr_ext_next;
end
endcase
//
/* S and SN addresses */
case (fsm_state)
FSM_STATE_MULT_Q_N_RELOAD: begin
/* restart when the QN address reaches the last operand address */
if (qn_addr_ext == {1'b0, bram_addr_last}) begin
s_addr <= bram_addr_zero;
sn_addr <= bram_addr_zero;
end
/* advance while the QN address is strictly between the last operand
   address and the last extended address */
if ((qn_addr_ext > {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
s_addr <= s_addr_next;
sn_addr <= sn_addr_next;
end
/* restart again at the last extended address, so that the save phase
   reads from the very first word */
if (qn_addr_ext == bram_addr_ext_last) begin
s_addr <= bram_addr_zero;
sn_addr <= bram_addr_zero;
end
end
/* read addresses for the save phase: advanced, then held at the last address */
FSM_STATE_MULT_Q_N_FINAL,
FSM_STATE_SAVE_START,
FSM_STATE_SAVE_WRITE: begin
s_addr <= !s_addr_done ? s_addr_next : s_addr;
sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
end
endcase
//
/* AB read address during the AB * N_COEFF phase */
case (fsm_next_state)
FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero;
FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next;
endcase
//
/* Q read address during the Q * N phase: advanced, then held at the last address */
case (fsm_next_state)
FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero;
FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr;
endcase
//
end
//
// Write enables and write data of the AB, Q, QN and result memories.
//
// In each of the three CRUNCH states the output of the first processing
// element (pe_p[0]) is written to the memory of the current phase, in the
// cycle that follows the delayed "latency is over" pulse. Write data is
// driven to X whenever it is not valid.
//
always @(posedge clk) begin
//
/* A * B -> AB */
if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
ab_wren <= shreg_done_latency_dly;
ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
end else begin
ab_wren <= 1'b0;
ab_data_in <= 32'hXXXXXXXX;
end
//
/* AB * N_COEFF -> Q */
if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
q_wren <= shreg_done_latency_dly;
q_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
end else begin
q_wren <= 1'b0;
q_data_in <= 32'hXXXXXXXX;
end
//
/* Q * N -> QN */
if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
qn_wren <= shreg_done_latency_dly;
qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
end else begin
qn_wren <= 1'b0;
qn_data_in <= 32'hXXXXXXXX;
end
//
/* result memory: written during the save phase, the enable is dropped
   once the last address has been reached */
case (fsm_state)
FSM_STATE_SAVE_START: r_wren <= 1'b1;
FSM_STATE_SAVE_WRITE: r_wren <= ~r_addr_done;
default: r_wren <= 1'b0;
endcase
//
end
/* load counter: restarted whenever a pass through the array begins (START
   or RELOAD is entered), then advanced for every CRUNCH cycle until the
   last index is reached and held there */
always @(posedge clk) begin
    case (fsm_next_state)
        FSM_STATE_MULT_A_B_START,
        FSM_STATE_MULT_AB_N_COEFF_START,
        FSM_STATE_MULT_Q_N_START,
        FSM_STATE_MULT_A_B_RELOAD,
        FSM_STATE_MULT_AB_N_COEFF_RELOAD,
        FSM_STATE_MULT_Q_N_RELOAD:
            syst_cnt_load <= syst_cnt_zero;
        FSM_STATE_MULT_A_B_CRUNCH,
        FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
        FSM_STATE_MULT_Q_N_CRUNCH:
            syst_cnt_load <= syst_cnt_load_done ? syst_cnt_load : syst_cnt_load_next;
    endcase
end
/* unload counter: cleared by the "latency is over" pulse, then advanced
   while unloading is in progress and held at the last index */
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_MULT_A_B_CRUNCH,
FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
FSM_STATE_MULT_Q_N_CRUNCH: begin
if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero;
else if (shreg_now_unloading)
syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
end
endcase
/* capture of the array outputs while unloading is in progress */
/* NOTE(review): both memories are indexed as [cycle][element] here, verify
   that this agrees with the order of dimensions in their declaration */
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_MULT_A_B_CRUNCH,
FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
FSM_STATE_MULT_Q_N_CRUNCH: begin
/* carries are stored as is */
if (shreg_now_unloading)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
pe_c_out_mem[syst_cnt_unload][j] <= pe_c_out[j];
/* products are stored shifted down by one element: P of element j goes
   to position j-1 of the same cycle */
if (shreg_now_unloading) begin
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
pe_t_mem[syst_cnt_unload][j-1] <= pe_p[j];
/* P of element 0 goes to the last position of the previous cycle; in
   the very first cycle the last position of the last cycle is zeroed */
if (syst_cnt_unload > syst_cnt_zero)
pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0];
else
pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0;
end
end
endcase
//
// T and C_IN can be moved to a separate code block
//
// Input registers of the processing elements. All elements receive the same
// A word, every element j receives its own B word from loader memory j and
// its own T / C_IN words from the feedback memories. While nothing is
// being loaded the inputs are driven to X. The three branches differ in the
// source of the A word and in the condition that forces T / C_IN to zero
// for the very first pass.
//
always @(posedge clk) begin
//
/* A * B: A comes from the external A memory */
if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
//
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
//
if (shreg_now_loading) begin
/* zero is fed in once the AB address has moved past the A address */
pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out;
pe_b[j] <= loader_dout[j];
pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
pe_c_in[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
end else begin
pe_a[j] <= 32'hXXXXXXXX;
pe_b[j] <= 32'hXXXXXXXX;
pe_t[j] <= 32'hXXXXXXXX;
pe_c_in[j] <= 32'hXXXXXXXX;
end
//
/* AB * N_COEFF: A comes from the AB memory */
if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
//
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
//
if (shreg_now_loading) begin
pe_a[j] <= ab_data_out;
pe_b[j] <= loader_dout[j];
pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
end else begin
pe_a[j] <= 32'hXXXXXXXX;
pe_b[j] <= 32'hXXXXXXXX;
pe_t[j] <= 32'hXXXXXXXX;
pe_c_in[j] <= 32'hXXXXXXXX;
end
//
/* Q * N: A comes from the Q memory */
if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
//
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
//
if (shreg_now_loading) begin
/* zero is fed in once the QN address has moved past the Q address */
pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out;
pe_b[j] <= loader_dout[j];
pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
pe_c_in[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
end else begin
pe_a[j] <= 32'hXXXXXXXX;
pe_b[j] <= 32'hXXXXXXXX;
pe_t[j] <= 32'hXXXXXXXX;
pe_c_in[j] <= 32'hXXXXXXXX;
end
//
//
end
//
// Adder
//
/*
* This adder is used to calculate S = AB + QN.
*
* It works on one 32-bit word per enabled clock cycle, the carry out of a
* word is registered and fed back as the carry in of the next word.
*
*/
reg add1_ce; // clock enable
reg [31: 0] add1_s; // sum output
wire add1_c_in; // carry input
wire [31: 0] add1_a; // A-input
reg [31: 0] add1_b; // B-input
reg add1_c_in_mask; // flag to not carry anything into the very first word
reg add1_c_out; // carry output
/* add masking into carry feedback chain */
assign add1_c_in = add1_c_out & ~add1_c_in_mask;
/* mask carry for the very first word of N */
/* NOTE: the line below is stale (there is no FSM_STATE_INIT_2 state in this
   module), the mask is generated by the always block further down */
//always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;
/* 33-bit addition: the extra top bit is the carry out */
always @(posedge clk)
//
if (add1_ce)
//
{add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};
/* A-input is the word that is being written to the QN memory */
assign add1_a = qn_data_in;
/* B-input is the matching word read from the AB memory */
always @(posedge clk)
//
if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
else
add1_b <= 32'hXXXXXXXX;
/* carry is masked for the word at the very first AB address */
always @(posedge clk)
//
if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
else
add1_c_in_mask <= 1'b0;
/* adder is enabled together with the write to the QN memory */
always @(posedge clk)
//
if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
add1_ce <= shreg_done_latency_dly;
else
add1_ce <= 1'b0;
//
// Subtractor
//
/*
 * This subtractor is used to calculate SN = S - N.
 *
 * It works on one 32-bit word per enabled clock cycle, the borrow out of a
 * word is registered and fed back as the borrow in of the next word.
 *
 */
reg         sub1_ce;        // clock enable
reg [31: 0] sub1_d;         // difference output
wire        sub1_b_in;      // borrow input
wire [31: 0] sub1_a;        // A-input
reg [31: 0] sub1_b;         // B-input
reg         sub1_b_in_mask; // flag to not borrow anything from the very first word
reg         sub1_b_out;     // borrow output
/* add masking into borrow feedback chain */
assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;
/* 33-bit subtraction: the extra top bit is the borrow out */
always @(posedge clk)
    //
    if (sub1_ce)
        //
        {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};
/* A-input is the output of the adder */
assign sub1_a = add1_s;
/* B-input is the matching word read from the external N memory */
always @(posedge clk)
    //
    if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
        sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
    else
        sub1_b <= 32'hXXXXXXXX;
/* borrow is masked for the first word that is subtracted */
always @(posedge clk)
    //
    if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
        sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
    else
        sub1_b_in_mask <= 1'b0;
/* subtractor follows the adder by one cycle and is only enabled after the
   QN address has moved past the Q address */
always @(posedge clk)
    //
    if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
        sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
    else
        sub1_ce <= 1'b0;
//
// S and SN Write Path
//
// The outputs of the adder and of the subtractor are written to the S and
// SN memories one cycle after the corresponding unit was enabled. Every
// signal is driven from exactly one place and this logic follows the
// declarations of sub1_d and sub1_ce it refers to.
//
assign s_data_in  = add1_s;
assign sn_data_in = sub1_d;
always @(posedge clk) begin
    //
    s_wren  <= add1_ce;
    sn_wren <= sub1_ce;
end
/* result selection flag, sampled in the MULT_Q_N_FINAL state: S is chosen
   when the subtractor ended with a borrow and the adder ended without a
   carry, SN is chosen otherwise */
always @(posedge clk) begin
    if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
        flag_select_s <= sub1_b_out && !add1_c_out;
end
/* word that goes to the result memory, updated only during the save phase */
always @(posedge clk) begin
    if ((fsm_state == FSM_STATE_SAVE_START) || (fsm_state == FSM_STATE_SAVE_WRITE)) begin
        if (flag_select_s)
            r_data_in <= s_data_out;
        else
            r_data_in <= sn_data_out;
    end
end
//
// FSM State Register
//
/* asynchronous active-low reset puts the FSM into the idle state, otherwise
   the next state is taken over at every rising clock edge */
always @(posedge clk or negedge rst_n) begin
    if (!rst_n)
        fsm_state <= FSM_STATE_IDLE;
    else
        fsm_state <= fsm_next_state;
end
//
// FSM Transition Logic
//
// Purely combinational. The phases run one after another: load B, load
// N_COEFF, load N, multiply A * B, multiply AB * N_COEFF, multiply Q * N,
// save the result, stop. Each load phase loops SHIFT -> WRITE, each multiply
// phase loops CRUNCH -> RELOAD, until the corresponding counter or address
// reports that the last item has been processed.
//
always @* begin
//
/* default for state codes that are not listed below: go to STOP */
fsm_next_state = FSM_STATE_STOP;
//
case (fsm_state)
/* wait for a rising edge on the enable input */
FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START;
else fsm_next_state = FSM_STATE_IDLE;
//
/* SHIFT repeats until the inner counter wraps, WRITE then either returns
   to SHIFT or ends the phase when the outer counter is at its last index */
FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE;
else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
else fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
//
FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START;
//
FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE;
else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
else fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START;
//
/* CRUNCH lasts until the array has been unloaded, RELOAD then either
   starts the next pass or ends the phase at the last output address */
FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
//
FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START;
//
FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
//
/* WRITE repeats until the result address reaches the last operand address */
FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
FSM_STATE_SAVE_WRITE: if (r_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
else fsm_next_state = FSM_STATE_SAVE_WRITE;
FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
//
/* STOP lasts one cycle (the ready flag is set there), then back to idle */
FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
endcase
//
end
endmodule
//======================================================================
// End of file
//======================================================================