From b33f595c014250072e9d787057293ef685eab5f3 Mon Sep 17 00:00:00 2001
From: "Pavel V. Shatov (Meister)"
Date: Sun, 23 Jul 2017 04:10:47 +0300
Subject: Wrote top-level module.

4096-bit core with 16-tap systolic array synthesizes just fine:

10% slices
8% block memory
33% DSPs
---
 src/rtl/modexpa7_top.v                 | 529 +++++++++++++++++++++++++++++++++
 src/rtl/pe/modexpa7_primitive_switch.v |   2 +-
 2 files changed, 530 insertions(+), 1 deletion(-)
 create mode 100644 src/rtl/modexpa7_top.v

diff --git a/src/rtl/modexpa7_top.v b/src/rtl/modexpa7_top.v
new file mode 100644
index 0000000..0c4eabe
--- /dev/null
+++ b/src/rtl/modexpa7_top.v
@@ -0,0 +1,529 @@
+//======================================================================
+//
+// Copyright (c) 2016, NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+//   notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+`timescale 1ns / 1ps
+
+/*
+ * modexpa7_top
+ *
+ * Top-level module of the modular exponentiation core. It instantiates the
+ * operand memories, the two pre-calculation modules (Montgomery factor and
+ * modulus-dependent coefficient), the exponentiator and the FSM that
+ * sequences them.
+ *
+ * Parameters:
+ *   OPERAND_ADDR_WIDTH   - address width of each operand memory (32-bit words)
+ *   SYSTOLIC_ARRAY_POWER - passed through to the exponentiator
+ *
+ * Control:
+ *   init / ready - rising edge of init starts pre-calculation, ready is
+ *                  cleared at that moment and set when it has finished
+ *   next / valid - rising edge of next starts exponentiation, valid is
+ *                  cleared at that moment and set when it has finished
+ *   Triggers are only accepted while the FSM is idle, init has priority.
+ *
+ *   modulus_num_words is latched when pre-calculation is started,
+ *   exponent_num_bits is latched when exponentiation is started.
+ *
+ * Bus:
+ *   The two upper bits of bus_addr select the bank (modulus, message,
+ *   exponent, result), the remaining bits select the word within the bank.
+ *   Modulus, message and exponent are writeable, result is read-only.
+ */
+
+module modexpa7_top #
+    (
+        parameter OPERAND_ADDR_WIDTH   = 7,
+        parameter SYSTOLIC_ARRAY_POWER = 4
+    )
+    (
+        input                           clk,
+        input                           rst_n,
+
+        input                           init,
+        output                          ready,
+
+        input                           next,
+        output                          valid,
+
+        input  [OPERAND_ADDR_WIDTH-1:0] modulus_num_words,
+        input  [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits,
+
+        input                           bus_cs,
+        input                           bus_we,
+        input  [OPERAND_ADDR_WIDTH+1:0] bus_addr,
+        input  [                32-1:0] bus_data_wr,
+        output [                32-1:0] bus_data_rd
+    );
+
+
+    /*
+     * FSM Declaration
+     */
+
+    localparam [ 2: 0] FSM_STATE_IDLE            = 3'd0;
+
+    localparam [ 2: 0] FSM_STATE_PRECALC_START   = 3'd1;
+    localparam [ 2: 0] FSM_STATE_PRECALC_CRUNCH  = 3'd2;
+    localparam [ 2: 0] FSM_STATE_PRECALC_FINAL   = 3'd3;
+
+    localparam [ 2: 0] FSM_STATE_EXPONENT_START  = 3'd4;
+    localparam [ 2: 0] FSM_STATE_EXPONENT_CRUNCH = 3'd5;
+    localparam [ 2: 0] FSM_STATE_EXPONENT_FINAL  = 3'd6;
+
+    localparam [ 2: 0] FSM_STATE_STOP            = 3'd7;
+
+
+    /*
+     * FSM State / Next State
+     *
+     * Three bits are enough for the eight states declared above.
+     */
+
+    reg [ 2: 0] fsm_state = FSM_STATE_IDLE;
+    reg [ 2: 0] fsm_next_state;
+
+
+    /*
+     * Enable Delay (Trigger)
+     */
+
+    reg init_dly = 1'b0;
+    reg next_dly = 1'b0;
+
+    // delay init and next by one clock cycle
+    always @(posedge clk) init_dly <= init;
+    always @(posedge clk) next_dly <= next;
+
+    // trigger new operation when one of the control inputs goes from low to high
+    wire init_trig = init && !init_dly;
+    wire next_trig = next && !next_dly;
+
+
+    /*
+     * Ready and Valid Flags Logic
+     *
+     * Both operations leave the FSM through FSM_STATE_STOP, so we remember which
+     * of the two *_FINAL states preceded it and only raise the flag that belongs
+     * to the operation that has actually finished: ready after pre-calculation,
+     * valid after exponentiation.
+     */
+
+    reg ready_reg = 1'b0;
+    reg valid_reg = 1'b0;
+
+    assign ready = ready_reg;
+    assign valid = valid_reg;
+
+    // tells during FSM_STATE_STOP which operation has just been completed
+    reg stop_after_precalc = 1'b0;
+
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) stop_after_precalc <= 1'b0;
+        else case (fsm_state)
+            FSM_STATE_PRECALC_FINAL:  stop_after_precalc <= 1'b1;
+            FSM_STATE_EXPONENT_FINAL: stop_after_precalc <= 1'b0;
+        endcase
+
+    // ready flag logic
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) ready_reg <= 1'b0;   // reset flag to default state
+        else case (fsm_state)
+            FSM_STATE_IDLE: if (init_trig)          ready_reg <= 1'b0;   // clear flag when operation is started
+            FSM_STATE_STOP: if (stop_after_precalc) ready_reg <= 1'b1;   // set flag after pre-calculation is finished
+        endcase
+
+    // valid flag logic
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) valid_reg <= 1'b0;   // reset flag to default state
+        else case (fsm_state)
+            FSM_STATE_IDLE: if (next_trig)           valid_reg <= 1'b0;   // clear flag when operation is started
+            FSM_STATE_STOP: if (!stop_after_precalc) valid_reg <= 1'b1;   // set flag after exponentiation is finished
+        endcase
+
+
+    /*
+     * Parameters Latch
+     */
+    reg [OPERAND_ADDR_WIDTH-1:0] modulus_num_words_latch;
+    reg [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits_latch;
+
+    // save number of words in modulus when pre-calculation has been triggered,
+    // i.e. user has apparently loaded a new modulus into the core
+    always @(posedge clk)
+        //
+        if (fsm_next_state == FSM_STATE_PRECALC_START)
+            modulus_num_words_latch <= modulus_num_words;
+
+    // save number of bits in exponent when exponentiation has been triggered,
+    // i.e. user has loaded a new message into the core and wants to exponentiate it
+    always @(posedge clk)
+        //
+        if (fsm_next_state == FSM_STATE_EXPONENT_START)
+            exponent_num_bits_latch <= exponent_num_bits;
+
+
+    /*
+     * Split bus address into bank/word parts.
+     */
+    wire [                 2 - 1 : 0] bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+1:OPERAND_ADDR_WIDTH];
+    wire [OPERAND_ADDR_WIDTH - 1 : 0] bus_addr_word = bus_addr[OPERAND_ADDR_WIDTH-1:0];
+
+
+    /*
+     * Define bank offsets.
+     */
+    localparam [ 1: 0] BANK_MODULUS  = 2'b00;   // 0
+    localparam [ 1: 0] BANK_MESSAGE  = 2'b01;   // 1
+    localparam [ 1: 0] BANK_EXPONENT = 2'b10;   // 2
+    localparam [ 1: 0] BANK_RESULT   = 2'b11;   // 3
+
+
+    /*
+     * Instantiate user-accessible memories.
+     *
+     * We have four block memories: N for modulus, M for message, D for exponent
+     * and R for result. Memories N, M and D are writeable from the user's side,
+     * memory R is writeable from the core's side and is read-only by user.
+     *
+     * Note, that the core does squaring and multiplication simultaneously, so
+     * there are two identical systolic multipliers inside. It's better to have two
+     * copies of modulus to give router some freedom in placing the multipliers,
+     * that's why there are actually two identical block memories N1 and N2 instead of N.
+     * User reads from the first one, but writes to both of them. Note that the synthesis
+     * tool might get too clever and find out that N1 and N2 are identical and decide
+     * to throw one of them away, use (* KEEP="TRUE" *) or something like that then.
+     *
+     * We also need N3 and N4, because during pre-computation F and N_COEFF are calculated
+     * at the same time, so we need two more copies of modulus to allow different words
+     * of it to be read at the same time.
+     */
+
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n1_addr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n2_addr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n3_addr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n4_addr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_m_addr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_d_addr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_r_addr;
+
+    wire [                32-1:0] core_n1_data;
+    wire [                32-1:0] core_n2_data;
+    wire [                32-1:0] core_n3_data;
+    wire [                32-1:0] core_n4_data;
+    wire [                32-1:0] core_m_data;
+    wire [                32-1:0] core_d_data;
+    wire [                32-1:0] core_r_data;
+
+    wire [                32-1:0] user_n_data;
+    wire [                32-1:0] user_m_data;
+    wire [                32-1:0] user_d_data;
+    wire [                32-1:0] user_r_data;
+
+    wire core_r_wren;
+    wire user_n_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MODULUS);
+    wire user_m_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MESSAGE);
+    wire user_d_wren = bus_cs && bus_we && (bus_addr_bank == BANK_EXPONENT);
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_n1 (.clk(clk),
+        .a_addr(bus_addr_word), .a_out(user_n_data), .a_wr(user_n_wren), .a_in(bus_data_wr),
+        .b_addr(core_n1_addr), .b_out(core_n1_data));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_n2 (.clk(clk),
+        .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr),
+        .b_addr(core_n2_addr), .b_out(core_n2_data));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_n3 (.clk(clk),
+        .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr),
+        .b_addr(core_n3_addr), .b_out(core_n3_data));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_n4 (.clk(clk),
+        .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_wren), .a_in(bus_data_wr),
+        .b_addr(core_n4_addr), .b_out(core_n4_data));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_m (.clk(clk),
+        .a_addr(bus_addr_word), .a_out(user_m_data), .a_wr(user_m_wren), .a_in(bus_data_wr),
+        .b_addr(core_m_addr), .b_out(core_m_data));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_d (.clk(clk),
+        .a_addr(bus_addr_word), .a_out(user_d_data), .a_wr(user_d_wren), .a_in(bus_data_wr),
+        .b_addr(core_d_addr), .b_out(core_d_data));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_r (.clk(clk),
+        .a_addr(core_r_addr), .a_out(), .a_wr(core_r_wren), .a_in(core_r_data),
+        .b_addr(bus_addr_word), .b_out(user_r_data));
+
+
+    /*
+     * Instantiate internal memories.
+     *
+     * We have two block memories: F for Montgomery factor and N_COEFF for modulus-dependent
+     * coefficient, they are written to during pre-calculation and read from during exponentiation.
+     *
+     * Note, that there are actually two identical block memories N_COEFF1 and N_COEFF2 instead of
+     * just one N_COEFF, read the explanation above. F is only used by one of the multipliers, so
+     * we don't need F1 and F2.
+     */
+
+    wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_wr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_rd;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff_addr_wr;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff1_addr_rd;
+    wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff2_addr_rd;
+
+    wire [                32-1:0] core_f_data_wr;
+    wire [                32-1:0] core_f_data_rd;
+    wire [                32-1:0] core_n_coeff_data_wr;
+    wire [                32-1:0] core_n_coeff1_data_rd;
+    wire [                32-1:0] core_n_coeff2_data_rd;
+
+    wire core_f_wren;
+    wire core_n_coeff_wren;
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_f (.clk(clk),
+        .a_addr(core_f_addr_wr), .a_out(), .a_wr(core_f_wren), .a_in(core_f_data_wr),
+        .b_addr(core_f_addr_rd), .b_out(core_f_data_rd));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_n_coeff1 (.clk(clk),
+        .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr),
+        .b_addr(core_n_coeff1_addr_rd), .b_out(core_n_coeff1_data_rd));
+
+    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
+    bram_n_coeff2 (.clk(clk),
+        .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr),
+        .b_addr(core_n_coeff2_addr_rd), .b_out(core_n_coeff2_data_rd));
+
+
+    /*
+     * Montgomery factor calculation module.
+     */
+    (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
+    reg  precalc_f_ena = 1'b0;
+    wire precalc_f_rdy;
+
+    modexpa7_factor #
+    (
+        .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH)
+    )
+    precalc_f
+    (
+        .clk            (clk),
+        .rst_n          (rst_n),
+
+        .ena            (precalc_f_ena),
+        .rdy            (precalc_f_rdy),
+
+        .n_bram_addr    (core_n3_addr),
+        .f_bram_addr    (core_f_addr_wr),
+
+        .n_bram_out     (core_n3_data),
+
+        .f_bram_in      (core_f_data_wr),
+        .f_bram_wr      (core_f_wren),
+
+        .n_num_words    (modulus_num_words_latch)
+    );
+
+
+    /*
+     * Modulus-dependent coefficient calculation module.
+     */
+    (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
+    reg  precalc_n_coeff_ena = 1'b0;
+    wire precalc_n_coeff_rdy;
+
+    modexpa7_n_coeff #
+    (
+        .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH)
+    )
+    precalc_n_coeff
+    (
+        .clk                (clk),
+        .rst_n              (rst_n),
+
+        .ena                (precalc_n_coeff_ena),
+        .rdy                (precalc_n_coeff_rdy),
+
+        .n_bram_addr        (core_n4_addr),
+        .n_coeff_bram_addr  (core_n_coeff_addr_wr),
+
+        .n_bram_out         (core_n4_data),
+
+        .n_coeff_bram_in    (core_n_coeff_data_wr),
+        .n_coeff_bram_wr    (core_n_coeff_wren),
+
+        .n_num_words        (modulus_num_words_latch)
+    );
+
+
+    /*
+     * Exponentiation module.
+     */
+
+    reg  exponent_ena = 1'b0;
+    wire exponent_rdy;
+
+    modexpa7_exponentiator #
+    (
+        .OPERAND_ADDR_WIDTH   (OPERAND_ADDR_WIDTH),
+        .SYSTOLIC_ARRAY_POWER (SYSTOLIC_ARRAY_POWER)
+    )
+    exponent_r
+    (
+        .clk                (clk),
+        .rst_n              (rst_n),
+
+        .ena                (exponent_ena),
+        .rdy                (exponent_rdy),
+
+        .m_bram_addr        (core_m_addr),
+        .d_bram_addr        (core_d_addr),
+        .f_bram_addr        (core_f_addr_rd),
+        .n1_bram_addr       (core_n1_addr),
+        .n2_bram_addr       (core_n2_addr),
+        .n_coeff1_bram_addr (core_n_coeff1_addr_rd),
+        .n_coeff2_bram_addr (core_n_coeff2_addr_rd),
+        .r_bram_addr        (core_r_addr),
+
+        .m_bram_out         (core_m_data),
+        .d_bram_out         (core_d_data),
+        .f_bram_out         (core_f_data_rd),
+        .n1_bram_out        (core_n1_data),
+        .n2_bram_out        (core_n2_data),
+        .n_coeff1_bram_out  (core_n_coeff1_data_rd),
+        .n_coeff2_bram_out  (core_n_coeff2_data_rd),
+
+        .r_bram_in          (core_r_data),
+        .r_bram_wr          (core_r_wren),
+
+        .m_num_words        (modulus_num_words_latch),
+        .d_num_bits         (exponent_num_bits_latch)
+    );
+
+
+    /*
+     * Sub-Module Enable Logic
+     *
+     * Enables are registered, so they go high in the very cycle the FSM enters
+     * the corresponding *_START state and go low again one cycle later.
+     */
+
+    always @(posedge clk) begin
+        precalc_f_ena       <= (fsm_next_state == FSM_STATE_PRECALC_START)  ? 1'b1 : 1'b0;
+        precalc_n_coeff_ena <= (fsm_next_state == FSM_STATE_PRECALC_START)  ? 1'b1 : 1'b0;
+        exponent_ena        <= (fsm_next_state == FSM_STATE_EXPONENT_START) ? 1'b1 : 1'b0;
+    end
+
+
+    /*
+     * FSM Process
+     */
+
+    always @(posedge clk or negedge rst_n)
+        //
+        if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
+        else               fsm_state <= fsm_next_state;
+
+
+    /*
+     * FSM Transition Logic
+     */
+
+    // handy flag that tells whether both pre-calculation modules are idle
+    wire precalc_rdy = precalc_n_coeff_rdy && precalc_f_rdy;
+
+    always @* begin
+        //
+        fsm_next_state = FSM_STATE_STOP;
+        //
+        case (fsm_state)
+            //
+            FSM_STATE_IDLE:             if (init_trig)      fsm_next_state = FSM_STATE_PRECALC_START;   // init has priority over next
+                                        else if (next_trig) fsm_next_state = FSM_STATE_EXPONENT_START;
+                                        else                fsm_next_state = FSM_STATE_IDLE;
+            //
+            FSM_STATE_PRECALC_START:                        fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
+            FSM_STATE_PRECALC_CRUNCH:   if (precalc_rdy)    fsm_next_state = FSM_STATE_PRECALC_FINAL;
+                                        else                fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
+            FSM_STATE_PRECALC_FINAL:                        fsm_next_state = FSM_STATE_STOP;
+            //
+            FSM_STATE_EXPONENT_START:                       fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
+            FSM_STATE_EXPONENT_CRUNCH:  if (exponent_rdy)   fsm_next_state = FSM_STATE_EXPONENT_FINAL;
+                                        else                fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
+            FSM_STATE_EXPONENT_FINAL:                       fsm_next_state = FSM_STATE_STOP;
+            //
+            FSM_STATE_STOP:                                 fsm_next_state = FSM_STATE_IDLE;
+            //
+        endcase
+        //
+    end
+
+
+    /*
+     * Bus read mux.
+     */
+
+    // delay bus_addr_bank by 1 clock cycle to remember from where we've just been reading
+    reg [1: 0] bus_addr_bank_dly;
+    always @(posedge clk)
+        if (bus_cs) bus_addr_bank_dly <= bus_addr_bank;
+
+    // map mux to output port
+    reg [31: 0] bus_data_rd_mux;
+    assign bus_data_rd = bus_data_rd_mux;
+
+    // select the right data word
+    always @(*)
+        //
+        case (bus_addr_bank_dly)
+            //
+            BANK_MODULUS:  bus_data_rd_mux = user_n_data;
+            BANK_MESSAGE:  bus_data_rd_mux = user_m_data;
+            BANK_EXPONENT: bus_data_rd_mux = user_d_data;
+            BANK_RESULT:   bus_data_rd_mux = user_r_data;
+            //
+        endcase
+
+
+endmodule
diff --git a/src/rtl/pe/modexpa7_primitive_switch.v b/src/rtl/pe/modexpa7_primitive_switch.v
index d38069b..3551d7a 100644
--- a/src/rtl/pe/modexpa7_primitive_switch.v
+++ b/src/rtl/pe/modexpa7_primitive_switch.v
@@ -1,4 +1,4 @@
-//`define USE_VENDOR_PRIMITIVES
+`define USE_VENDOR_PRIMITIVES
 
 `ifdef USE_VENDOR_PRIMITIVES
 
-- 
cgit v1.2.3