//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================
module modexpa7_systolic_multiplier #
(
//
// This sets the address widths of memory buffers. Internal data
// width is 32 bits, so for e.g. 1024-bit operands buffers must store
// 1024 / 32 = 32 words, and these need 5-bit address bus, because
// 2 ** 5 = 32.
//
parameter OPERAND_ADDR_WIDTH = 5,
//
// Base-2 logarithm of the number of processing elements instantiated
// per multiplier: SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER.
// The width of the systolic cycle counter is derived from it as
// OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER, i.e. an operand is
// processed in up to 2 ** (OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER)
// slots of SYSTOLIC_ARRAY_LENGTH words each.
//
parameter SYSTOLIC_ARRAY_POWER = 3
)
(
input clk, // clock, all registers are rising-edge triggered
input rst_n, // asynchronous active-low reset (FSM state and rdy flag only)
input ena, // a 0 -> 1 transition starts an operation while the FSM is idle
output rdy, // high when idle, cleared on start, set again when the FSM reaches STOP
output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr, // read address, operand A
output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr, // read address, operand B
output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr, // read address, operand N
output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr, // read address, operand N_COEFF
output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr, // write address, result R
input [ 32-1:0] a_bram_out, // read data, operand A
input [ 32-1:0] b_bram_out, // read data, operand B
input [ 32-1:0] n_bram_out, // read data, operand N
input [ 32-1:0] n_coeff_bram_out, // read data, operand N_COEFF
output [ 32-1:0] r_bram_in, // write data, result R
output r_bram_wr, // write strobe, result R
input [OPERAND_ADDR_WIDTH-1:0] n_num_words // used as the address of the last operand word
// (so presumably word count minus one -- confirm with caller)
);
//
// Constants
//
// Number of address bits left over for selecting a systolic slot once
// the low SYSTOLIC_ARRAY_POWER bits have selected a processing element.
localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER;
// Number of processing elements instantiated per multiplier.
localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER;
// Maximum number of slots (first dimension of all internal buffers).
localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH;
// Depth of the syst_cnt delay line and length of the one-hot latency
// shifters. Presumably equals the pipeline latency of pe_mul in clock
// cycles -- confirm against the pe_mul implementation.
localparam SYSTOLIC_PE_LATENCY = 4;
//
// FSM Declaration
//
// IDLE waits for a rising edge on ena; INIT_* copy operands from the
// external memories into the internal buffers; PIPE_CRUNCH/PIPE_RELOAD
// alternate while the multiplication is computed; SAVE_* copy the
// result into the output memory; STOP raises rdy and returns to IDLE.
//
localparam [ 3: 0] FSM_STATE_IDLE = 4'd0;
localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1;
localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2;
localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3;
localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4;
localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5;
localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6;
localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7;
localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8;
localparam [ 3: 0] FSM_STATE_STOP = 4'd9;
// Current state (registered) and combinationally computed next state.
// Most datapath registers are keyed on fsm_next_state so that they
// update on the same clock edge as the state transition itself.
reg [ 3: 0] fsm_state = FSM_STATE_IDLE;
reg [ 3: 0] fsm_next_state;
//
// Enable Delay (Trigger)
//
// Rising-edge detector for the enable input: ena_trig is high for the
// single cycle in which ena is high and its one-cycle-old copy is low.
reg  ena_dly = 1'b0;
wire ena_trig;

assign ena_trig = ena & ~ena_dly;

always @(posedge clk) begin
    ena_dly <= ena;
end
//
// Parameters Latch
//
// Snapshot of n_num_words, captured on the clock edge that moves the
// FSM into INIT_ZERO_ADDR (i.e. when an operation is started).
reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;

always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) begin
        n_num_words_latch <= n_num_words;
    end
end
//
// Addresses
//
// First and last word addresses of an operand. The last address comes
// from the value latched when the operation was started.
localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;
//
// BRAM Addresses
//
// Address registers driving the external memories (a, b, n, n_coeff, r)
// and the read side of the internal s/sn scratch memories (s).
reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg;
reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg;
reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg;
reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg;
reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg;
reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg;
wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr = s_bram_addr_reg;
// One-cycle-delayed copies of the addresses; they are used as the
// destination index for data that arrives one clock after its address
// was presented.
reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly;
reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly;
reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly;
reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly;
// Incremented addresses (wrap around at the address width).
wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next = b_bram_addr + 1'b1;
wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next = a_bram_addr + 1'b1;
wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1;
wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next = n_bram_addr + 1'b1;
wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next = s_bram_addr + 1'b1;
// Termination flags for the INIT (load) and SAVE (store) loops of the FSM.
wire b_bram_addr_done =
(b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
wire s_bram_addr_done =
(s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0;
// Map address registers to output ports.
assign b_bram_addr = b_bram_addr_reg;
assign a_bram_addr = a_bram_addr_reg;
assign n_coeff_bram_addr = n_coeff_bram_addr_reg;
assign n_bram_addr = n_bram_addr_reg;
assign r_bram_addr = r_bram_addr_reg;
// Address delay registers (free-running, no enable).
always @(posedge clk) b_bram_addr_dly <= b_bram_addr;
always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr;
always @(posedge clk) n_bram_addr_dly <= n_bram_addr;
always @(posedge clk) s_bram_addr_dly <= s_bram_addr;
// B and N_COEFF read addresses: cleared when the load loop is entered,
// incremented on every clock that keeps the FSM in INIT_NEXT_ADDR.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) begin
        b_bram_addr_reg       <= bram_addr_zero;
        n_coeff_bram_addr_reg <= bram_addr_zero;
    end else if (fsm_next_state == FSM_STATE_INIT_NEXT_ADDR) begin
        b_bram_addr_reg       <= b_bram_addr_next;
        n_coeff_bram_addr_reg <= n_coeff_bram_addr_next;
    end
end

// Scratch memory read address: same pattern, driven by the SAVE loop.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_SAVE_ZERO_ADDR)
        s_bram_addr_reg <= bram_addr_zero;
    else if (fsm_next_state == FSM_STATE_SAVE_NEXT_ADDR)
        s_bram_addr_reg <= s_bram_addr_next;
end

// A read address: cleared at the end of the load phase, then advanced
// by one on every transition into PIPE_RELOAD until it saturates at the
// last word address.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_INIT_LAST_ADDR)
        a_bram_addr_reg <= bram_addr_zero;
    else if (fsm_next_state == FSM_STATE_PIPE_RELOAD)
        a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr;
end
//
// Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles...
//
// One-hot shift registers, SYSTOLIC_PE_LATENCY+1 bits wide. They are
// loaded with a single 1 in bit 0 and rotated left by one position per
// update; the "done" flag is the most significant bit, so it becomes
// set after SYSTOLIC_PE_LATENCY shifts.
wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1};
reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb;
reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb;
// Rotate-left-by-one of each shifter.
wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next =
{pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]};
wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next =
{pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]};
wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY];
wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY];
// LSB shifter: restarted whenever PIPE_CRUNCH is entered from
// INIT_LAST_ADDR or PIPE_RELOAD, then shifted on every clock that stays
// in PIPE_CRUNCH until the done bit is reached, where it holds.
// Once done, the multiplier outputs are accepted by the capture logic.
always @(posedge clk)
//
if (fsm_next_state == FSM_STATE_PIPE_CRUNCH)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR,
FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start;
FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ?
pe_latency_ab_lsb : pe_latency_ab_lsb_next;
endcase
//
// Buffers
//
// Loop indices shared by all procedural for-loops in this module.
integer i, j;
// Local copies of operands B, N_COEFF and N, organised as
// [slot][processing element]. The upper address bits select the slot,
// the lower SYSTOLIC_ARRAY_POWER bits select the element.
reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
// B buffer: cleared entirely in INIT_ZERO_ADDR (so words beyond the
// operand length stay zero), then filled one word per clock using the
// delayed address, which lines up with the memory read data.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_ZERO_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
b_buf[i][j] <= 32'd0;
FSM_STATE_INIT_NEXT_ADDR,
FSM_STATE_INIT_LAST_ADDR:
b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out;
endcase
// N_COEFF buffer: same clear-then-fill scheme as the B buffer.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_ZERO_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
n_coeff_buf[i][j] <= 32'd0;
FSM_STATE_INIT_NEXT_ADDR,
FSM_STATE_INIT_LAST_ADDR:
n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out;
endcase
// N buffer: same clear-then-fill scheme as the B buffer.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_ZERO_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
n_buf[i][j] <= 32'd0;
FSM_STATE_INIT_NEXT_ADDR,
FSM_STATE_INIT_LAST_ADDR:
n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out;
endcase
//
// Cycle Counters
//
// Pass counters, one per pipeline stage (ab, q, qn multipliers and the
// s adder/subtractor stage). They are one bit wider than an operand
// address because they count up to mult_cnt_last = 2 * n + 1, where n
// is the latched last word address.
reg  [ OPERAND_ADDR_WIDTH   :0] mult_cnt_ab;
reg  [ OPERAND_ADDR_WIDTH   :0] mult_cnt_q;
reg  [ OPERAND_ADDR_WIDTH   :0] mult_cnt_qn;
reg  [ OPERAND_ADDR_WIDTH   :0] mult_cnt_s;

// Slot counter within one pass, plus a delay line of it that is
// SYSTOLIC_PE_LATENCY stages deep; syst_cnt_latency is the slot index
// that was issued SYSTOLIC_PE_LATENCY clocks ago.
reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt;
reg  [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0];
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1];

// Counter start values.
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}};
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};

// Counter limits. All of them are derived from n_num_words_latch (the
// value captured at the start of the operation) rather than from the
// live n_num_words port, so that every limit used during one operation
// is consistent with bram_addr_last and cannot change mid-operation.
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_half = {1'b0, n_num_words_latch};   // = n
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_last = {n_num_words_latch, 1'b1};   // = 2 * n + 1
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

// Terminal-count flags.
wire mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0;
wire mult_cnt_q_done  = (mult_cnt_q  == mult_cnt_last) ? 1'b1 : 1'b0;
wire mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0;
wire mult_cnt_s_done  = (mult_cnt_s  == mult_cnt_last) ? 1'b1 : 1'b0;
wire syst_cnt_done    = (syst_cnt    == syst_cnt_last) ? 1'b1 : 1'b0;

// Incremented values; syst_cnt_next wraps to zero after the last slot.
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_ab_next = mult_cnt_ab + 1'b1;
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_q_next  = mult_cnt_q  + 1'b1;
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_qn_next = mult_cnt_qn + 1'b1;
wire [ OPERAND_ADDR_WIDTH   :0] mult_cnt_s_next  = mult_cnt_s  + 1'b1;
wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_next = syst_cnt_done ? syst_cnt_zero : syst_cnt + 1'b1;
// Slot counter: restarted from zero each time PIPE_CRUNCH is entered
// (from INIT_LAST_ADDR or PIPE_RELOAD), incremented while the FSM stays
// in PIPE_CRUNCH and frozen once the last slot has been reached.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) begin
        if ((fsm_state == FSM_STATE_INIT_LAST_ADDR) || (fsm_state == FSM_STATE_PIPE_RELOAD))
            syst_cnt <= syst_cnt_zero;
        else if (fsm_state == FSM_STATE_PIPE_CRUNCH)
            syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next;
    end
end
// Pass counter of the AB stage: cleared when the first pass starts,
// then advanced on every PIPE_RELOAD -> PIPE_CRUNCH transition until it
// saturates at its terminal count.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) begin
        if (fsm_state == FSM_STATE_INIT_LAST_ADDR) begin
            mult_cnt_ab <= mult_cnt_zero;
        end else if (fsm_state == FSM_STATE_PIPE_RELOAD) begin
            mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next;
        end
    end
end

// Pass counter of the Q stage: as above, but it only starts counting
// once the AB counter is already non-zero.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) begin
        if (fsm_state == FSM_STATE_INIT_LAST_ADDR) begin
            mult_cnt_q <= mult_cnt_zero;
        end else if (fsm_state == FSM_STATE_PIPE_RELOAD) begin
            if (mult_cnt_ab > mult_cnt_zero)
                mult_cnt_q <= mult_cnt_q_done ? mult_cnt_q : mult_cnt_q_next;
        end
    end
end

// Pass counter of the QN stage: starts counting once the Q counter is
// already non-zero.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) begin
        if (fsm_state == FSM_STATE_INIT_LAST_ADDR) begin
            mult_cnt_qn <= mult_cnt_zero;
        end else if (fsm_state == FSM_STATE_PIPE_RELOAD) begin
            if (mult_cnt_q > mult_cnt_zero)
                mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next;
        end
    end
end
// Pass counter of the S (adder/subtractor) stage: cleared when the
// first pass starts, and begins counting once the QN counter is already
// non-zero. On reaching its terminal count it holds its OWN value,
// exactly like the ab/q/qn counters (the previous code loaded
// mult_cnt_qn here, which was a copy-paste slip).
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) begin
        case (fsm_state)
            FSM_STATE_INIT_LAST_ADDR:
                mult_cnt_s <= mult_cnt_zero;
            FSM_STATE_PIPE_RELOAD:
                if (mult_cnt_qn > mult_cnt_zero)
                    mult_cnt_s <= mult_cnt_s_done ? mult_cnt_s : mult_cnt_s_next;
        endcase
    end
end
// Free-running delay line for the slot counter: stage 0 samples
// syst_cnt, every further stage samples its predecessor. All
// assignments are non-blocking, so statement order is irrelevant.
always @(posedge clk) begin
    for (i = 1; i < SYSTOLIC_PE_LATENCY; i = i + 1) begin
        syst_cnt_dly[i] <= syst_cnt_dly[i-1];
    end
    syst_cnt_dly[0] <= syst_cnt;
end
//
// Systolic Array
//
// Per-element outputs of the three multiplier rows (ab, q, qn):
// *_p is connected to the PE port "p", *_c_out to the PE port "c_out".
// NOTE(review): presumably p/c_out are the low/high words of
// a * b + t + c_in -- confirm against the pe_mul implementation.
wire [31: 0] mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0];
wire [31: 0] mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
wire [31: 0] mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0];
wire [31: 0] mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
wire [31: 0] mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0];
wire [31: 0] mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0];
// Common "a" operand of the ab row: words of A while the pass counter
// is within the operand length, zero for the remaining passes.
wire [31: 0] mul_ab_a = (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0;
// Common "a" operands of the q and qn rows: *_int captures a product
// word during PIPE_CRUNCH, the final register is loaded in PIPE_RELOAD.
reg [31: 0] mul_q_a_int;
reg [31: 0] mul_q_a;
reg [31: 0] mul_qn_a_int;
reg [31: 0] mul_qn_a;
// Per-slot, per-element state fed back into the "t" and "c_in" ports
// of the processing elements.
reg [31: 0] t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
reg [31: 0] c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0];
// Instantiate SYSTOLIC_ARRAY_LENGTH processing elements for each of the
// three rows. Element "syst" of every row reads its b/t/c_in operands
// from the slot currently selected by syst_cnt.
genvar syst;
generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1)
begin : gen_mul
// ab row: a = word of A, b = word of B
/*modexpa7_*/pe_mul mul_ab_inst
(
.clk (clk),
.a (mul_ab_a),
.b (b_buf[syst_cnt][syst]),
.t (t_ab[syst_cnt][syst]),
.c_in (c_ab_in[syst_cnt][syst]),
.p (mul_ab_p[syst]),
.c_out (mul_ab_c_out[syst])
);
// q row: a = captured word from the ab row, b = word of N_COEFF
/*modexpa7_*/pe_mul mul_q_inst
(
.clk (clk),
.a (mul_q_a),
.b (n_coeff_buf[syst_cnt][syst]),
.t (t_q[syst_cnt][syst]),
.c_in (c_q_in[syst_cnt][syst]),
.p (mul_q_p[syst]),
.c_out (mul_q_c_out[syst])
);
// qn row: a = captured word from the q row, b = word of N
/*modexpa7_*/pe_mul mul_qn_inst
(
.clk (clk),
.a (mul_qn_a),
.b (n_buf[syst_cnt][syst]),
.t (t_qn[syst_cnt][syst]),
.c_in (c_qn_in[syst_cnt][syst]),
.p (mul_qn_p[syst]),
.c_out (mul_qn_c_out[syst])
);
end
endgenerate
//
// c_ab
//
// Carry store of the ab row: cleared in INIT_LAST_ADDR; during
// PIPE_CRUNCH, once the latency shifter is done, the c_out of every
// element is written back to the slot that was issued
// SYSTOLIC_PE_LATENCY clocks earlier (syst_cnt_latency).
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
c_ab_in[i][j] <= 32'd0;
FSM_STATE_PIPE_CRUNCH:
if (pe_latency_ab_lsb_done)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j];
endcase
//
// c_q
//
// Same as c_ab, but write-back is suppressed during the very first
// pass (mult_cnt_ab == 0), so the store stays zero until then.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
c_q_in[i][j] <= 32'd0;
FSM_STATE_PIPE_CRUNCH:
if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero))
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j];
endcase
//
// c_qn
//
// Same as c_ab, but write-back is suppressed while mult_cnt_q == 0.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
c_qn_in[i][j] <= 32'd0;
FSM_STATE_PIPE_CRUNCH:
if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero))
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j];
endcase
//
// t_ab
//
// "t" store of the ab row: cleared in INIT_LAST_ADDR. During
// PIPE_CRUNCH, once the latency shifter is done, each product word is
// stored one element position lower than the element that produced it:
// p[j] (j >= 1) goes to element j-1 of the same slot, while p[0] goes
// to the last element of the previous slot. For slot 0 there is no
// previous slot, so p[0] is not stored here (it is captured by other
// registers in this module instead).
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
t_ab[i][j] <= 32'd0;
FSM_STATE_PIPE_CRUNCH:
if (pe_latency_ab_lsb_done) begin
if (syst_cnt_latency > syst_cnt_zero)
t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0];
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j];
end
endcase
//
// t_q
//
// Same shifting scheme as t_ab; write-back is suppressed while
// mult_cnt_ab == 0.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
t_q[i][j] <= 32'd0;
FSM_STATE_PIPE_CRUNCH:
if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin
if (syst_cnt_latency > syst_cnt_zero)
t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0];
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
t_q[syst_cnt_latency][j-1] <= mul_q_p[j];
end
endcase
//
// t_qn
//
// Same shifting scheme as t_ab; write-back is suppressed while
// mult_cnt_q == 0.
always @(posedge clk)
//
case (fsm_state)
FSM_STATE_INIT_LAST_ADDR:
for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1)
for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
t_qn[i][j] <= 32'd0;
FSM_STATE_PIPE_CRUNCH:
if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin
if (syst_cnt_latency > syst_cnt_zero)
t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0];
for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1)
t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j];
end
endcase
//
// Latency Compensation (MSB shifter: advances only after the last slot has been issued)
//
// MSB latency shifter: restarted together with the slot counter each
// time PIPE_CRUNCH is entered, but only shifted once the slot counter
// sits at its last value. Its done bit lets the FSM leave PIPE_CRUNCH.
always @(posedge clk) begin
    if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) begin
        if ((fsm_state == FSM_STATE_INIT_LAST_ADDR) || (fsm_state == FSM_STATE_PIPE_RELOAD)) begin
            pe_latency_ab_msb <= pe_latency_start;
        end else if (fsm_state == FSM_STATE_PIPE_CRUNCH) begin
            if (syst_cnt_done)
                pe_latency_ab_msb <= pe_latency_ab_msb_done ?
                                     pe_latency_ab_msb : pe_latency_ab_msb_next;
        end
    end
end
//
// Adder
//
// Adder signals: clock enable, a three-stage operand delay chain
// (a0 -> a1 -> a2, only a2 reaches the adder), second operand, carry
// in, and the adder's sum / carry out.
reg pe_add_ce;
reg [31: 0] pe_add_a0;
reg [31: 0] pe_add_a1;
reg [31: 0] pe_add_a2;
reg [31: 0] pe_add_b0;
reg pe_add_c_in;
wire [31: 0] pe_add_s;
wire pe_add_c_out;
// Subtractor signals: clock enable, operands, borrow in, and the
// subtractor's difference / borrow out.
reg pe_sub_ce;
reg [31: 0] pe_sub_a0;
reg [31: 0] pe_sub_b0;
reg pe_sub_b_in;
wire [31: 0] pe_sub_d;
wire pe_sub_b_out;
// The adder is enabled one clock after slot 0 results become valid,
// from the pass where mult_cnt_q is non-zero until mult_cnt_s is done.
always @(posedge clk)
pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done;
// The subtractor is enabled under the same timing once mult_cnt_qn is
// non-zero.
always @(posedge clk)
pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero);
// Carry chain: zero while mult_cnt_qn is still zero, afterwards the
// carry out of the previous addition.
always @(posedge clk)
//
if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done)
pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out;
// Borrow chain: zero while mult_cnt_s is still zero, afterwards the
// borrow out of the previous subtraction.
always @(posedge clk)
//
if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero))
pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 1'b0 : pe_sub_b_out;
// 32-bit adder with carry in/out (implemented in another file).
modexpa7_pe_add pe_add_inst
(
.clk (clk),
.ce (pe_add_ce),
.a (pe_add_a2),
.b (pe_add_b0),
.c_in (pe_add_c_in),
.s (pe_add_s),
.c_out (pe_add_c_out)
);
// 32-bit subtractor with borrow in/out (implemented in another file).
modexpa7_pe_sub pe_sub_inst
(
.clk (clk),
.ce (pe_sub_ce),
.a (pe_sub_a0),
.b (pe_sub_b0),
.b_in (pe_sub_b_in),
.d (pe_sub_d),
.b_out (pe_sub_b_out)
);
// Operand registers of the adder and the subtractor. All of them are
// updated on the same event: PIPE_CRUNCH, latency shifter done, and the
// delayed slot counter pointing at slot 0 (i.e. the products of slot 0
// are valid at the multiplier outputs).
always @(posedge clk) begin
    if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
        // adder, first operand: ab product word, delayed by three updates
        pe_add_a0 <= mul_ab_p[0];
        pe_add_a1 <= pe_add_a0;
        pe_add_a2 <= pe_add_a1;
        // adder, second operand: qn product word
        pe_add_b0 <= mul_qn_p[0];
        // subtractor: sum minus zero while mult_cnt_s is within the
        // first half of the passes, sum minus a word of N afterwards
        pe_sub_a0 <= pe_add_s;
        pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 32'd0 : n_bram_out;
    end
end
// N read address. During the load phase it sweeps the operand like the
// B address. During the compute phase it is reused to feed the
// subtractor: on each transition into PIPE_RELOAD it is reset to zero
// when mult_cnt_s equals mult_cnt_half and incremented while
// mult_cnt_s is greater than mult_cnt_half (the two conditions are
// mutually exclusive); otherwise it keeps its value.
always @(posedge clk)
//
case (fsm_next_state)
FSM_STATE_INIT_ZERO_ADDR: n_bram_addr_reg <= bram_addr_zero;
FSM_STATE_INIT_NEXT_ADDR: n_bram_addr_reg <= n_bram_addr_next;
FSM_STATE_PIPE_RELOAD: begin
if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero;
if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next;
end
endcase
//
// Ready Flag Logic
//
// Ready flag: high after reset and while idle, dropped on the clock
// edge that sees the enable trigger in IDLE, raised again in STOP.
reg rdy_reg = 1'b1;

assign rdy = rdy_reg;

always @(posedge clk or negedge rst_n) begin
    if (rst_n == 1'b0) begin
        rdy_reg <= 1'b1;
    end else if (fsm_state == FSM_STATE_IDLE) begin
        rdy_reg <= ~ena_trig;
    end else if (fsm_state == FSM_STATE_STOP) begin
        rdy_reg <= 1'b1;
    end
end
//
// Operand Registers of the Q and QN Multipliers
//
// Capture stage: when the slot 0 products are valid during PIPE_CRUNCH,
// remember the lowest ab product word (future operand of the q row)
// and the lowest q product word (future operand of the qn row).
always @(posedge clk) begin
    if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin
        mul_q_a_int  <= mul_ab_p[0];
        mul_qn_a_int <= mul_q_p[0];
    end
end

// Hand-over stage: in PIPE_RELOAD the captured words become the common
// "a" operands for the next pass. The qn operand is forced to zero once
// mult_cnt_qn has reached mult_cnt_half.
always @(posedge clk) begin
    if (fsm_state == FSM_STATE_PIPE_RELOAD) begin
        mul_q_a  <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?..
        mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? mul_qn_a_int : 32'd0;
    end
end
//
// Debug
//
//always @(posedge clk) begin
//
//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
//$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]);
//
//if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero))
//$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]);
//
//if (fsm_state == FSM_STATE_PIPE_RELOAD)
//$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s);
//
//if (fsm_state == FSM_STATE_PIPE_RELOAD)
//$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d);
//
//end
// Internal scratch memories: "s" stores adder sums, "sn" stores
// subtractor differences. Each has its own write address and write
// enable; both are read through the shared s_bram_addr.
wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd;
reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr;
wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1;
reg s_bram_en;
wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd;
reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr;
wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1;
reg sn_bram_en;
assign s_bram_addr_rd = s_bram_addr;
assign sn_bram_addr_rd = s_bram_addr;
wire [31: 0] s_bram_din;
wire [31: 0] s_bram_dout;
wire [31: 0] sn_bram_din;
wire [31: 0] sn_bram_dout;
assign s_bram_din = pe_add_s;
assign sn_bram_din = pe_sub_d;
// Sums are written only after mult_cnt_qn has passed mult_cnt_half;
// the enable is the adder clock enable delayed by one clock.
always @(posedge clk)
//
s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half);
// Differences are written only after mult_cnt_s has passed
// mult_cnt_half; the enable is the subtractor clock enable delayed by
// one clock.
always @(posedge clk)
//
sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half);
// Write address of "s": reset to zero in the pass where mult_cnt_qn
// equals mult_cnt_half, then incremented after every write until it
// saturates at the last word address. If both conditions were true in
// the same cycle, the second assignment would take precedence.
always @(posedge clk) begin
//
if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero;
if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next;
end
// Write address of "sn": same scheme, keyed on mult_cnt_s.
always @(posedge clk) begin
//
if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero;
if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next;
end
// Port A is used for writing only (its read output is left open),
// port B for reading. The memory primitive is defined in another file.
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_s (.clk(clk),
.a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(),
.b_addr(s_bram_addr_rd), .b_out(s_bram_dout));
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
bram_sn (.clk(clk),
.a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(),
.b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout));
// Result write-back path.
//
// r_bram_en is the registered "a scratch word is being read" flag: it
// is set on clocks where the FSM is in SAVE_ZERO_ADDR or SAVE_NEXT_ADDR
// and cleared on all other clocks.
reg r_bram_en;

always @(posedge clk) begin
    if ((fsm_state == FSM_STATE_SAVE_ZERO_ADDR) || (fsm_state == FSM_STATE_SAVE_NEXT_ADDR))
        r_bram_en <= 1'b1;
    else
        r_bram_en <= 1'b0;
end

// The external write strobe trails r_bram_en by one clock so that it
// coincides with the registered data and address below.
reg r_bram_wr_reg;

assign r_bram_wr = r_bram_wr_reg;

always @(posedge clk) begin
    r_bram_wr_reg <= r_bram_en;
end

// Source selection: take the stored sum when the subtraction ended with
// a borrow and the addition ended without a carry; otherwise take the
// stored difference.
wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out;

reg [31: 0] r_bram_in_reg;

assign r_bram_in = r_bram_in_reg;

// Data and address are registered together; the address is the delayed
// scratch read address, which matches the word currently on the
// scratch memory outputs.
always @(posedge clk) begin
    if (r_bram_en) begin
        r_bram_in_reg   <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout;
        r_bram_addr_reg <= s_bram_addr_dly;
    end
end
//
// FSM Transition Logic
//
// State register with asynchronous active-low reset to IDLE.
always @(posedge clk or negedge rst_n)
//
if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
else fsm_state <= fsm_next_state;
// Next-state logic (combinational).
always @* begin
//
// Default for any state encoding not listed below: go to STOP, from
// where the FSM returns to IDLE.
fsm_next_state = FSM_STATE_STOP;
//
case (fsm_state)
// wait for a rising edge on ena
FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR;
else fsm_next_state = FSM_STATE_IDLE;
// load operands into the internal buffers until the last word address
FSM_STATE_INIT_ZERO_ADDR: fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
FSM_STATE_INIT_NEXT_ADDR: if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR;
else fsm_next_state = FSM_STATE_INIT_NEXT_ADDR;
FSM_STATE_INIT_LAST_ADDR: fsm_next_state = FSM_STATE_PIPE_CRUNCH;
// one pass: stay until the last slot was issued and the MSB latency
// shifter has expired
FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) fsm_next_state = pe_latency_ab_msb_done ?
FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH;
else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
// between passes: start another pass unless the s counter is done
FSM_STATE_PIPE_RELOAD: if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR;
else fsm_next_state = FSM_STATE_PIPE_CRUNCH;
// copy the result to the output memory until the last word address
FSM_STATE_SAVE_ZERO_ADDR: fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
FSM_STATE_SAVE_NEXT_ADDR: if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR;
else fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR;
FSM_STATE_SAVE_LAST_ADDR: fsm_next_state = FSM_STATE_STOP;
// single-cycle state in which rdy is raised again
FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
endcase
end
endmodule
//======================================================================
// End of file
//======================================================================