aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-23 04:10:47 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-07-23 04:10:47 +0300
commitb33f595c014250072e9d787057293ef685eab5f3 (patch)
tree348d27c613c965e156a94e9be4fb1c984af0de23
parent3ca2b94aae8af47788ec236f624857c15c4e73b1 (diff)
Wrote top-level module. 4096-bit core with 16-tap systolic array synthesizes just fine:
10% slices 8% block memory 33% DSPs
-rw-r--r--src/rtl/modexpa7_top.v481
-rw-r--r--src/rtl/pe/modexpa7_primitive_switch.v2
2 files changed, 482 insertions, 1 deletions
diff --git a/src/rtl/modexpa7_top.v b/src/rtl/modexpa7_top.v
new file mode 100644
index 0000000..0c4eabe
--- /dev/null
+++ b/src/rtl/modexpa7_top.v
@@ -0,0 +1,481 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+// Top-level module of the core. It holds the operand/result block memories,
+// the two pre-calculation modules (modexpa7_factor, modexpa7_n_coeff), the
+// modexpa7_exponentiator and the FSM that sequences them.
+module modexpa7_top #
+ (
+ // address width of every block memory instantiated below (MEM_ADDR_BITS)
+ parameter OPERAND_ADDR_WIDTH = 7,
+ // forwarded unchanged to modexpa7_exponentiator
+ parameter SYSTOLIC_ARRAY_POWER = 4
+ )
+ (
+ // all registers in this module are clocked on the rising edge of clk
+ input clk,
+ // active-low asynchronous reset of the FSM state and the ready/valid flags
+ input rst_n,
+
+ // a low-to-high transition seen while the FSM is idle starts pre-calculation
+ input init,
+ // goes low when pre-calculation is started
+ output ready,
+
+ // a low-to-high transition seen while the FSM is idle starts exponentiation
+ // (init wins if both rise in the same cycle)
+ input next,
+ // goes low when exponentiation is started
+ output valid,
+
+ // sampled into an internal latch when pre-calculation is triggered
+ input [OPERAND_ADDR_WIDTH-1:0] modulus_num_words,
+ // sampled into an internal latch when exponentiation is triggered
+ input [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits,
+
+ // bus_cs && bus_we writes bus_data_wr into the addressed bank;
+ // bus_cs also captures the bank that the read mux will present
+ input bus_cs,
+ input bus_we,
+ // two uppermost bits select the bank, the remaining bits select the word
+ input [OPERAND_ADDR_WIDTH+1:0] bus_addr,
+ input [ 32-1:0] bus_data_wr,
+ output [ 32-1:0] bus_data_rd
+ );
+
+
+ /*
+ * FSM Declaration
+ *
+ * There are eight states, so every encoding (and the state registers
+ * below) is exactly three bits wide.
+ */
+
+ localparam [ 2: 0] FSM_STATE_IDLE = 3'd0; // waiting for a rising edge on init or next
+
+ localparam [ 2: 0] FSM_STATE_PRECALC_START = 3'd1; // pre-calculation has just been triggered
+ localparam [ 2: 0] FSM_STATE_PRECALC_CRUNCH = 3'd2; // waiting for both pre-calculation modules
+ localparam [ 2: 0] FSM_STATE_PRECALC_FINAL = 3'd3; // pre-calculation done
+
+ localparam [ 2: 0] FSM_STATE_EXPONENT_START = 3'd4; // exponentiation has just been triggered
+ localparam [ 2: 0] FSM_STATE_EXPONENT_CRUNCH = 3'd5; // waiting for the exponentiator
+ localparam [ 2: 0] FSM_STATE_EXPONENT_FINAL = 3'd6; // exponentiation done
+
+ localparam [ 2: 0] FSM_STATE_STOP = 3'd7; // common last state before returning to idle
+
+
+ /*
+ * FSM State / Next State
+ */
+
+ reg [ 2: 0] fsm_state = FSM_STATE_IDLE; // current state (registered)
+ reg [ 2: 0] fsm_next_state; // next state (combinational)
+
+
+ /*
+ * Rising-Edge Detection for the Control Inputs
+ */
+
+ // samples of init and next taken one clock cycle earlier
+ reg init_dly = 1'b0;
+ reg next_dly = 1'b0;
+
+ always @(posedge clk) begin
+     init_dly <= init;
+     next_dly <= next;
+ end
+
+ // single-cycle strobes: input is high now, but was low one cycle ago
+ wire init_trig;
+ wire next_trig;
+
+ assign init_trig = init & ~init_dly;
+ assign next_trig = next & ~next_dly;
+
+
+ /*
+ * Ready and Valid Flags Logic
+ *
+ * ready reports that pre-calculation has finished, valid reports that
+ * exponentiation has finished. Both operations leave the FSM through the
+ * shared FSM_STATE_STOP, so the flags must know which operation has just
+ * completed: otherwise valid would be raised after a pre-calculation that
+ * produced no result, and ready would be raised after an exponentiation
+ * even if pre-calculation had never been run.
+ */
+
+ reg ready_reg = 1'b0;
+ reg valid_reg = 1'b0;
+
+ assign ready = ready_reg;
+ assign valid = valid_reg;
+
+ // Remembers which *_FINAL state was visited last. FSM_STATE_STOP is only
+ // entered from one of the two *_FINAL states, so this bit is always
+ // up to date by the time it is examined in FSM_STATE_STOP.
+ reg stop_after_precalc = 1'b0;
+
+ always @(posedge clk)
+     //
+     case (fsm_state)
+         FSM_STATE_PRECALC_FINAL:  stop_after_precalc <= 1'b1;
+         FSM_STATE_EXPONENT_FINAL: stop_after_precalc <= 1'b0;
+         default: ; // keep previous value
+     endcase
+
+ // ready flag logic
+ always @(posedge clk or negedge rst_n)
+     //
+     if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state
+     else case (fsm_state)
+         FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when pre-calculation is started
+         FSM_STATE_STOP: if (stop_after_precalc) ready_reg <= 1'b1; // set flag only after pre-calculation
+         default: ; // flag is unchanged in all the other states
+     endcase
+
+ // valid flag logic
+ always @(posedge clk or negedge rst_n)
+     //
+     if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state
+     else case (fsm_state)
+         FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when exponentiation is started
+         FSM_STATE_STOP: if (!stop_after_precalc) valid_reg <= 1'b1; // set flag only after exponentiation
+         default: ; // flag is unchanged in all the other states
+     endcase
+
+
+ /*
+ * Parameters Latch
+ *
+ * The operand sizes supplied by the user are captured at the moment the
+ * corresponding operation is triggered, so the sub-modules see stable
+ * values for the whole duration of that operation.
+ */
+ reg [OPERAND_ADDR_WIDTH-1:0] modulus_num_words_latch;
+ reg [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits_latch;
+
+ // FSM is about to enter the corresponding *_START state
+ wire latch_modulus_len = (fsm_next_state == FSM_STATE_PRECALC_START);
+ wire latch_exponent_len = (fsm_next_state == FSM_STATE_EXPONENT_START);
+
+ always @(posedge clk) begin
+     // pre-calculation triggered: the user has apparently loaded a new modulus
+     if (latch_modulus_len)
+         modulus_num_words_latch <= modulus_num_words;
+     // exponentiation triggered: the user wants to exponentiate a new message
+     if (latch_exponent_len)
+         exponent_num_bits_latch <= exponent_num_bits;
+ end
+
+
+ /*
+ * Bus address layout: the two most significant bits select the bank,
+ * the remaining OPERAND_ADDR_WIDTH bits select the word inside the bank.
+ */
+ wire [1:0] bus_addr_bank;
+ wire [OPERAND_ADDR_WIDTH-1:0] bus_addr_word;
+
+ assign bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH +: 2];
+ assign bus_addr_word = bus_addr[0 +: OPERAND_ADDR_WIDTH];
+
+
+ /*
+ * Bank numbers as seen on the bus.
+ */
+ localparam [1:0] BANK_MODULUS = 2'd0;
+ localparam [1:0] BANK_MESSAGE = 2'd1;
+ localparam [1:0] BANK_EXPONENT = 2'd2;
+ localparam [1:0] BANK_RESULT = 2'd3;
+
+
+ /*
+ * Instantiate user-accessible memories.
+ *
+ * We have four block memories: N for modulus, M for message, D for exponent
+ * and R for result. Memories N, M and D are writeable from the user's side,
+ * memory R is writeable from the core's side and is read-only by user.
+ *
+ * Note, that the core does squaring and multiplication simultaneously, so
+ * there are two identical systolic multipliers inside. It's better to have two
+ * copies of modulus to give router some freedom in placing the multipliers,
+ * that's why there are actually two identical block memories N1 and N2 instead of N.
+ * User reads from the first one, but writes to both of them. Note that the synthesis
+ * tool might get too clever and find out that N1 and N2 are identical and decide
+ * to throw one of them away, use (* KEEP="TRUE" *) or something like that then.
+ *
+ * We also need N3 and N4, because during pre-computation F and N_COEFF are calculated
+ * at the same time, so we need two more copies of modulus to allow different words
+ * of it to be read at the same time.
+ */
+
+ // core-side addresses (driven by the sub-modules instantiated further below)
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n1_addr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n2_addr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n3_addr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n4_addr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_m_addr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_d_addr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_r_addr;
+
+ // core-side data: N*, M and D are read by the core, R is written by the core
+ wire [ 32-1:0] core_n1_data;
+ wire [ 32-1:0] core_n2_data;
+ wire [ 32-1:0] core_n3_data;
+ wire [ 32-1:0] core_n4_data;
+ wire [ 32-1:0] core_m_data;
+ wire [ 32-1:0] core_d_data;
+ wire [ 32-1:0] core_r_data;
+
+ // user-side read data, one word per bank, selected by the bus read mux
+ wire [ 32-1:0] user_n_data;
+ wire [ 32-1:0] user_m_data;
+ wire [ 32-1:0] user_d_data;
+ wire [ 32-1:0] user_r_data;
+
+ // write enables: R is written by the core, the other banks by the user;
+ // there is no user write enable for BANK_RESULT
+ wire core_r_wren;
+ wire user_n_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MODULUS);
+ wire user_m_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MESSAGE);
+ wire user_d_wren = bus_cs && bus_we && (bus_addr_bank == BANK_EXPONENT);
+
+ // N1: port A is the user's read/write port, port B is read by the core
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_n1 (.clk(clk),
+ .a_addr(bus_addr_word), .a_out(user_n_data), .a_wr(user_n_wren), .a_in(bus_data_wr),
+ .b_addr(core_n1_addr), .b_out(core_n1_data));
+
+ // N2..N4: written together with N1, user-side read output is left unconnected
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_n2 (.clk(clk),
+ .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr),
+ .b_addr(core_n2_addr), .b_out(core_n2_data));
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_n3 (.clk(clk),
+ .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr),
+ .b_addr(core_n3_addr), .b_out(core_n3_data));
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_n4 (.clk(clk),
+ .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr),
+ .b_addr(core_n4_addr), .b_out(core_n4_data));
+
+ // M: message, user read/write on port A, core read on port B
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_m (.clk(clk),
+ .a_addr(bus_addr_word), .a_out(user_m_data), .a_wr(user_m_wren), .a_in(bus_data_wr),
+ .b_addr(core_m_addr), .b_out(core_m_data));
+
+ // D: exponent, user read/write on port A, core read on port B
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_d (.clk(clk),
+ .a_addr(bus_addr_word), .a_out(user_d_data), .a_wr(user_d_wren), .a_in(bus_data_wr),
+ .b_addr(core_d_addr), .b_out(core_d_data));
+
+ // R: result, ports are swapped here - the core writes through port A,
+ // the user reads through the read-only port B
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_r (.clk(clk),
+ .a_addr(core_r_addr), .a_out(), .a_wr(core_r_wren), .a_in(core_r_data),
+ .b_addr(bus_addr_word), .b_out(user_r_data));
+
+
+ /*
+ * Instantiate internal memories.
+ *
+ * We have two block memories: F for Montgomery factor and N_COEFF for modulus-dependent
+ * coefficient, they are written to during pre-calculation and read from during exponentiation.
+ *
+ * Note, that there are actually two identical block memories N_COEFF1 and N_COEFF2 instead of
+ * just one N_COEFF, read the explanation above. F is only used by one of the multipliers, so
+ * we don't need F1 and F2.
+ *
+ * None of these memories is reachable from the user bus.
+ */
+
+ // write-side addresses come from the pre-calculation modules,
+ // read-side addresses come from the exponentiator
+ wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_wr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_rd;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff_addr_wr;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff1_addr_rd;
+ wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff2_addr_rd;
+
+ wire [ 32-1:0] core_f_data_wr;
+ wire [ 32-1:0] core_f_data_rd;
+ wire [ 32-1:0] core_n_coeff_data_wr;
+ wire [ 32-1:0] core_n_coeff1_data_rd;
+ wire [ 32-1:0] core_n_coeff2_data_rd;
+
+ wire core_f_wren;
+ wire core_n_coeff_wren;
+
+ // F: written through port A, read through port B
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_f (.clk(clk),
+ .a_addr(core_f_addr_wr), .a_out(), .a_wr(core_f_wren), .a_in(core_f_data_wr),
+ .b_addr(core_f_addr_rd), .b_out(core_f_data_rd));
+
+ // N_COEFF1 and N_COEFF2 share one write port, but have independent read ports
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_n_coeff1 (.clk(clk),
+ .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr),
+ .b_addr(core_n_coeff1_addr_rd), .b_out(core_n_coeff1_data_rd));
+
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+ bram_n_coeff2 (.clk(clk),
+ .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr),
+ .b_addr(core_n_coeff2_addr_rd), .b_out(core_n_coeff2_data_rd));
+
+
+ /*
+ * Montgomery factor calculation module.
+ *
+ * Reads the modulus from its own copy N3 and writes the result into F.
+ */
+ // NOTE(review): precalc_f_ena and precalc_n_coeff_ena are driven by identical
+ // logic; the attribute presumably keeps the tool from merging them - confirm.
+ (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
+ reg precalc_f_ena = 1'b0;
+ // NOTE: despite the "_r_" in its name, this is the rdy output of the
+ // factor (F) module instantiated right below
+ wire precalc_r_rdy;
+
+ modexpa7_factor #
+ (
+ .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH)
+ )
+ precalc_f
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+
+ .ena (precalc_f_ena),
+ .rdy (precalc_r_rdy),
+
+ .n_bram_addr (core_n3_addr),
+ .f_bram_addr (core_f_addr_wr),
+
+ .n_bram_out (core_n3_data),
+
+ .f_bram_in (core_f_data_wr),
+ .f_bram_wr (core_f_wren),
+
+ .n_num_words (modulus_num_words_latch)
+ );
+
+
+ /*
+ * Modulus-dependent coefficient calculation module.
+ *
+ * Reads the modulus from its own copy N4 and writes the result into
+ * both N_COEFF1 and N_COEFF2 (they share one write port).
+ */
+ // NOTE(review): precalc_f_ena and precalc_n_coeff_ena are driven by identical
+ // logic; the attribute presumably keeps the tool from merging them - confirm.
+ (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
+ reg precalc_n_coeff_ena = 1'b0;
+ wire precalc_n_coeff_rdy;
+
+ modexpa7_n_coeff #
+ (
+ .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH)
+ )
+ precalc_n_coeff
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+
+ .ena (precalc_n_coeff_ena),
+ .rdy (precalc_n_coeff_rdy),
+
+ .n_bram_addr (core_n4_addr),
+ .n_coeff_bram_addr (core_n_coeff_addr_wr),
+
+ .n_bram_out (core_n4_data),
+
+ .n_coeff_bram_in (core_n_coeff_data_wr),
+ .n_coeff_bram_wr (core_n_coeff_wren),
+
+ .n_num_words (modulus_num_words_latch)
+ );
+
+ /*
+ * Exponentiation module.
+ *
+ * Reads M, D, F, N1, N2, N_COEFF1 and N_COEFF2, writes the result into R.
+ */
+
+ reg exponent_ena = 1'b0;
+ wire exponent_rdy;
+
+ modexpa7_exponentiator #
+ (
+ .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH),
+ .SYSTOLIC_ARRAY_POWER (SYSTOLIC_ARRAY_POWER)
+ )
+ exponent_r
+ (
+ .clk (clk),
+ .rst_n (rst_n),
+
+ .ena (exponent_ena),
+ .rdy (exponent_rdy),
+
+ // addresses driven by the exponentiator
+ .m_bram_addr (core_m_addr),
+ .d_bram_addr (core_d_addr),
+ .f_bram_addr (core_f_addr_rd),
+ .n1_bram_addr (core_n1_addr),
+ .n2_bram_addr (core_n2_addr),
+ .n_coeff1_bram_addr (core_n_coeff1_addr_rd),
+ .n_coeff2_bram_addr (core_n_coeff2_addr_rd),
+ .r_bram_addr (core_r_addr),
+
+ // read data coming back from the memories
+ .m_bram_out (core_m_data),
+ .d_bram_out (core_d_data),
+ .f_bram_out (core_f_data_rd),
+ .n1_bram_out (core_n1_data),
+ .n2_bram_out (core_n2_data),
+ .n_coeff1_bram_out (core_n_coeff1_data_rd),
+ .n_coeff2_bram_out (core_n_coeff2_data_rd),
+
+ // write port of the result memory
+ .r_bram_in (core_r_data),
+ .r_bram_wr (core_r_wren),
+
+ // operand sizes captured when the operations were triggered;
+ // note that m_num_words is fed with the latched modulus word count
+ .m_num_words (modulus_num_words_latch),
+ .d_num_bits (exponent_num_bits_latch)
+ );
+
+
+ /*
+ * Sub-Module Enable Logic
+ *
+ * Each enable is registered and is high for exactly the cycle in which
+ * the FSM sits in the corresponding *_START state.
+ */
+
+ wire kick_precalc = (fsm_next_state == FSM_STATE_PRECALC_START);
+ wire kick_exponent = (fsm_next_state == FSM_STATE_EXPONENT_START);
+
+ always @(posedge clk) begin
+     // both pre-calculation modules are started together
+     precalc_f_ena <= kick_precalc;
+     precalc_n_coeff_ena <= kick_precalc;
+     exponent_ena <= kick_exponent;
+ end
+
+
+
+
+ /*
+ * FSM Process
+ *
+ * State register with active-low asynchronous reset.
+ */
+
+ always @(posedge clk or negedge rst_n) begin
+     if (!rst_n) begin
+         fsm_state <= FSM_STATE_IDLE;
+     end else begin
+         fsm_state <= fsm_next_state;
+     end
+ end
+
+
+ /*
+ * FSM Transition Logic
+ */
+
+ // pre-calculation is over only when both of its modules report ready
+ wire precalc_rdy = precalc_r_rdy & precalc_n_coeff_rdy;
+
+ always @* begin
+     case (fsm_state)
+
+         // idle: wait for a trigger, init takes precedence over next
+         FSM_STATE_IDLE: begin
+             if (init_trig)
+                 fsm_next_state = FSM_STATE_PRECALC_START;
+             else if (next_trig)
+                 fsm_next_state = FSM_STATE_EXPONENT_START;
+             else
+                 fsm_next_state = FSM_STATE_IDLE;
+         end
+
+         // pre-calculation branch
+         FSM_STATE_PRECALC_START: begin
+             fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
+         end
+         FSM_STATE_PRECALC_CRUNCH: begin
+             if (precalc_rdy)
+                 fsm_next_state = FSM_STATE_PRECALC_FINAL;
+             else
+                 fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
+         end
+         FSM_STATE_PRECALC_FINAL: begin
+             fsm_next_state = FSM_STATE_STOP;
+         end
+
+         // exponentiation branch
+         FSM_STATE_EXPONENT_START: begin
+             fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
+         end
+         FSM_STATE_EXPONENT_CRUNCH: begin
+             if (exponent_rdy)
+                 fsm_next_state = FSM_STATE_EXPONENT_FINAL;
+             else
+                 fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
+         end
+         FSM_STATE_EXPONENT_FINAL: begin
+             fsm_next_state = FSM_STATE_STOP;
+         end
+
+         // common exit: one cycle in STOP, then back to idle
+         FSM_STATE_STOP: begin
+             fsm_next_state = FSM_STATE_IDLE;
+         end
+
+         // any encoding not listed above is steered to STOP
+         default: begin
+             fsm_next_state = FSM_STATE_STOP;
+         end
+
+     endcase
+ end
+
+
+ /*
+ * Bus read mux.
+ *
+ * The bank part of the address is captured whenever the bus is selected,
+ * and the captured value then picks which memory's read data is presented
+ * on bus_data_rd.
+ */
+
+ // bank that was addressed during the most recent bus access
+ reg [1:0] bus_addr_bank_dly;
+
+ always @(posedge clk) begin
+     if (bus_cs)
+         bus_addr_bank_dly <= bus_addr_bank;
+ end
+
+ // combinational mux output, wired straight to the output port
+ reg [31:0] bus_data_rd_mux;
+
+ assign bus_data_rd = bus_data_rd_mux;
+
+ always @* begin
+     case (bus_addr_bank_dly)
+         BANK_RESULT:   bus_data_rd_mux = user_r_data;
+         BANK_EXPONENT: bus_data_rd_mux = user_d_data;
+         BANK_MESSAGE:  bus_data_rd_mux = user_m_data;
+         BANK_MODULUS:  bus_data_rd_mux = user_n_data;
+     endcase
+ end
+
+
+endmodule
diff --git a/src/rtl/pe/modexpa7_primitive_switch.v b/src/rtl/pe/modexpa7_primitive_switch.v
index d38069b..3551d7a 100644
--- a/src/rtl/pe/modexpa7_primitive_switch.v
+++ b/src/rtl/pe/modexpa7_primitive_switch.v
@@ -1,4 +1,4 @@
-//`define USE_VENDOR_PRIMITIVES
+`define USE_VENDOR_PRIMITIVES
`ifdef USE_VENDOR_PRIMITIVES