//====================================================================== // // Copyright (c) 2016, NORDUnet A/S All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // - Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // - Neither the name of the NORDUnet nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//
//======================================================================

`timescale 1ns / 1ps

/*
 * modexpa7_top
 *
 * Top level of the modular exponentiation core. It contains:
 *   - the user-visible operand memories (modulus, message, exponent, result),
 *   - the internal memories written during pre-calculation (F and N_COEFF),
 *   - the two pre-calculation modules and the exponentiator,
 *   - a small FSM that sequences them.
 *
 * Control:
 *   clk                 clock, all logic is rising-edge triggered
 *   rst_n               active-low asynchronous reset for the FSM state and
 *                       the ready/valid flags
 *   init                a low-to-high transition while the FSM is idle starts
 *                       pre-calculation (has priority over next)
 *   ready               cleared when init is accepted, set in the STOP state
 *   next                a low-to-high transition while the FSM is idle starts
 *                       exponentiation
 *   valid               cleared when next is seen in the idle state, set in
 *                       the STOP state
 *   crt_mode            forwarded unchanged to the exponentiator's crt input
 *   modulus_num_words   latched when pre-calculation is started
 *   exponent_num_bits   latched when exponentiation is started
 *
 * Bus:
 *   bus_cs, bus_we      chip select / write enable
 *   bus_addr            upper two bits select the bank, the remaining
 *                       OPERAND_ADDR_WIDTH bits select the word
 *   bus_data_wr         write data (banks MODULUS, MESSAGE, EXPONENT)
 *   bus_data_rd         read data, selected by the bank that was addressed
 *                       during the most recent cycle with bus_cs high
 *
 * NOTE(review): both ready and valid are set whenever the FSM passes through
 * STOP, regardless of which operation has just finished -- confirm that this
 * is what the driver software expects.
 */
module modexpa7_top #
(
    parameter OPERAND_ADDR_WIDTH   = 7,
    parameter SYSTOLIC_ARRAY_POWER = 1
)
(
    input                               clk,
    input                               rst_n,

    input                               init,
    output                              ready,

    input                               next,
    output                              valid,

    input                               crt_mode,

    input  [OPERAND_ADDR_WIDTH-1:0]     modulus_num_words,
    input  [OPERAND_ADDR_WIDTH+4:0]     exponent_num_bits,

    input                               bus_cs,
    input                               bus_we,
    input  [OPERAND_ADDR_WIDTH+1:0]     bus_addr,
    input  [32-1:0]                     bus_data_wr,
    output [32-1:0]                     bus_data_rd
);


    /*
     * FSM Declaration
     *
     * Eight states, so three bits are sufficient. All state constants and
     * both state registers share the same width to keep comparisons and
     * case items free of implicit width extension.
     */
    localparam [2:0] FSM_STATE_IDLE            = 3'd0;

    localparam [2:0] FSM_STATE_PRECALC_START   = 3'd1;
    localparam [2:0] FSM_STATE_PRECALC_CRUNCH  = 3'd2;
    localparam [2:0] FSM_STATE_PRECALC_FINAL   = 3'd3;

    localparam [2:0] FSM_STATE_EXPONENT_START  = 3'd4;
    localparam [2:0] FSM_STATE_EXPONENT_CRUNCH = 3'd5;
    localparam [2:0] FSM_STATE_EXPONENT_FINAL  = 3'd6;

    localparam [2:0] FSM_STATE_STOP            = 3'd7;


    /*
     * FSM State / Next State
     */
    reg [2:0] fsm_state = FSM_STATE_IDLE;
    reg [2:0] fsm_next_state;


    /*
     * Enable Delay (Trigger)
     */
    reg init_dly = 1'b0;
    reg next_dly = 1'b0;

    // delay init and next by one clock cycle
    always @(posedge clk) init_dly <= init;
    always @(posedge clk) next_dly <= next;

    // trigger new operation when one of the control inputs goes from low to high
    wire init_trig = init && !init_dly;
    wire next_trig = next && !next_dly;


    /*
     * Ready and Valid Flags Logic
     */
    reg ready_reg = 1'b0;
    reg valid_reg = 1'b0;

    assign ready = ready_reg;
    assign valid = valid_reg;

    // ready flag logic
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  ready_reg <= 1'b0;                  // reset flag to default state
        else case (fsm_state)
            FSM_STATE_IDLE: if (init_trig)  ready_reg <= 1'b0;  // clear flag when operation is started
            FSM_STATE_STOP: if (!ready_reg) ready_reg <= 1'b1;  // set flag after operation is finished
        endcase

    // valid flag logic
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  valid_reg <= 1'b0;                  // reset flag to default state
        else case (fsm_state)
            FSM_STATE_IDLE: if (next_trig)  valid_reg <= 1'b0;  // clear flag when operation is started
            FSM_STATE_STOP: if (!valid_reg) valid_reg <= 1'b1;  // set flag after operation is finished
        endcase


    /*
     * Parameters Latch
     */
    reg [OPERAND_ADDR_WIDTH-1:0] modulus_num_words_latch;
    reg [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits_latch;

    // save number of words in modulus when pre-calculation has been triggered,
    // i.e. user has apparently loaded a new modulus into the core
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_PRECALC_START)
            modulus_num_words_latch <= modulus_num_words;

    // save number of bits in exponent when exponentiation has been triggered,
    // i.e. user has loaded a new message into the core and wants to exponentiate
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_EXPONENT_START)
            exponent_num_bits_latch <= exponent_num_bits;


    /*
     * Split bus address into bank/word parts.
     */
    wire [                 2 - 1 : 0] bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+1:OPERAND_ADDR_WIDTH];
    wire [OPERAND_ADDR_WIDTH - 1 : 0] bus_addr_word = bus_addr[OPERAND_ADDR_WIDTH-1:0];


    /*
     * Define bank offsets.
     */
    localparam [1:0] BANK_MODULUS  = 2'b00;  // 0
    localparam [1:0] BANK_MESSAGE  = 2'b01;  // 1
    localparam [1:0] BANK_EXPONENT = 2'b10;  // 2
    localparam [1:0] BANK_RESULT   = 2'b11;  // 3


    /*
     * Instantiate user-accessible memories.
     *
     * We have four block memories: N for modulus, M for message, D for exponent
     * and R for result. Memories N, M and D are writeable from the user's side,
     * memory R is writeable from the core's side and is read-only by user.
     *
     * Note, that the core does squaring and multiplication simultaneously, so
     * there are two identical systolic multipliers inside. It's better to have two
     * copies of modulus to give router some freedom in placing the multipliers,
     * that's why there are actually two identical block memories N1 and N2 instead of N.
     * User reads from the first one, but writes to both of them. Note that the synthesis
     * tool might get too clever and find out that N1 and N2 are identical and decide
     * to throw one of them away, use (* KEEP="TRUE" *) or something like that then.
     *
     * We also need N3 and N4, because during pre-computation F and N_COEFF are calculated
     * at the same time, so we need two more copies of modulus to allow different words
     * of it to be read at the same time.
     */
    wire [OPERAND_ADDR_WIDTH-1:0] core_n1_addr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_n2_addr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_n3_addr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_n4_addr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_m_addr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_d_addr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_r_addr;

    wire [32-1:0] core_n1_data;
    wire [32-1:0] core_n2_data;
    wire [32-1:0] core_n3_data;
    wire [32-1:0] core_n4_data;
    wire [32-1:0] core_m_data;
    wire [32-1:0] core_d_data;
    wire [32-1:0] core_r_data;

    wire [32-1:0] user_n_data;
    wire [32-1:0] user_m_data;
    wire [32-1:0] user_d_data;
    wire [32-1:0] user_r_data;

    wire core_r_wren;

    // user write strobes, one per writeable bank; there is deliberately no
    // strobe for BANK_RESULT, so bus writes to that bank have no effect
    wire user_n_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MODULUS);
    wire user_m_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MESSAGE);
    wire user_d_wren = bus_cs && bus_we && (bus_addr_bank == BANK_EXPONENT);

    // modulus, copy 1: user reads and writes through port A, core reads through port B
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_n1
    (
        .clk    (clk),
        .a_addr (bus_addr_word), .a_out (user_n_data), .a_wr (user_n_wren), .a_in (bus_data_wr),
        .b_addr (core_n1_addr),  .b_out (core_n1_data)
    );

    // modulus, copy 2: written together with copy 1, user-side read output unused
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_n2
    (
        .clk    (clk),
        .a_addr (bus_addr_word), .a_out (),            .a_wr (user_n_wren), .a_in (bus_data_wr),
        .b_addr (core_n2_addr),  .b_out (core_n2_data)
    );

    // modulus, copy 3: read by the Montgomery factor pre-calculation
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_n3
    (
        .clk    (clk),
        .a_addr (bus_addr_word), .a_out (),            .a_wr (user_n_wren), .a_in (bus_data_wr),
        .b_addr (core_n3_addr),  .b_out (core_n3_data)
    );

    // modulus, copy 4: read by the modulus-dependent coefficient pre-calculation
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_n4
    (
        .clk    (clk),
        .a_addr (bus_addr_word), .a_out (),            .a_wr (user_n_wren), .a_in (bus_data_wr),
        .b_addr (core_n4_addr),  .b_out (core_n4_data)
    );

    // message
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_m
    (
        .clk    (clk),
        .a_addr (bus_addr_word), .a_out (user_m_data), .a_wr (user_m_wren), .a_in (bus_data_wr),
        .b_addr (core_m_addr),   .b_out (core_m_data)
    );

    // exponent
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_d
    (
        .clk    (clk),
        .a_addr (bus_addr_word), .a_out (user_d_data), .a_wr (user_d_wren), .a_in (bus_data_wr),
        .b_addr (core_d_addr),   .b_out (core_d_data)
    );

    // result: ports are swapped with respect to the memories above, the core
    // writes through port A and the user reads through port B
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_r
    (
        .clk    (clk),
        .a_addr (core_r_addr),   .a_out (),            .a_wr (core_r_wren), .a_in (core_r_data),
        .b_addr (bus_addr_word), .b_out (user_r_data)
    );


    /*
     * Instantiate internal memories.
     *
     * We have two block memories: F for Montgomery factor and N_COEFF for modulus-dependent
     * coefficient, they are written to during pre-calculation and read from during exponentiation.
     *
     * Note, that there are actually two identical block memories N_COEFF1 and N_COEFF2 instead of
     * just one N_COEFF, read the explanation above. F is only used by one of the multipliers, so
     * we don't need F1 and F2.
     */
    wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_wr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_rd;

    wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff_addr_wr;
    wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff1_addr_rd;
    wire [OPERAND_ADDR_WIDTH-1:0] core_n_coeff2_addr_rd;

    wire [32-1:0] core_f_data_wr;
    wire [32-1:0] core_f_data_rd;

    wire [32-1:0] core_n_coeff_data_wr;
    wire [32-1:0] core_n_coeff1_data_rd;
    wire [32-1:0] core_n_coeff2_data_rd;

    wire core_f_wren;
    wire core_n_coeff_wren;

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_f
    (
        .clk    (clk),
        .a_addr (core_f_addr_wr), .a_out (), .a_wr (core_f_wren), .a_in (core_f_data_wr),
        .b_addr (core_f_addr_rd), .b_out (core_f_data_rd)
    );

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_n_coeff1
    (
        .clk    (clk),
        .a_addr (core_n_coeff_addr_wr),  .a_out (), .a_wr (core_n_coeff_wren), .a_in (core_n_coeff_data_wr),
        .b_addr (core_n_coeff1_addr_rd), .b_out (core_n_coeff1_data_rd)
    );

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_n_coeff2
    (
        .clk    (clk),
        .a_addr (core_n_coeff_addr_wr),  .a_out (), .a_wr (core_n_coeff_wren), .a_in (core_n_coeff_data_wr),
        .b_addr (core_n_coeff2_addr_rd), .b_out (core_n_coeff2_data_rd)
    );


    /*
     * Montgomery factor calculation module.
     */
    (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
    reg  precalc_f_ena = 1'b0;
    wire precalc_f_rdy;

    modexpa7_factor #
    (
        .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH)
    )
    precalc_f
    (
        .clk         (clk),
        .rst_n       (rst_n),

        .ena         (precalc_f_ena),
        .rdy         (precalc_f_rdy),

        .n_bram_addr (core_n3_addr),
        .f_bram_addr (core_f_addr_wr),

        .n_bram_out  (core_n3_data),

        .f_bram_in   (core_f_data_wr),
        .f_bram_wr   (core_f_wren),

        .n_num_words (modulus_num_words_latch)
    );


    /*
     * Modulus-dependent coefficient calculation module.
     */
    (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
    reg  precalc_n_coeff_ena = 1'b0;
    wire precalc_n_coeff_rdy;

    modexpa7_n_coeff #
    (
        .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH)
    )
    precalc_n_coeff
    (
        .clk               (clk),
        .rst_n             (rst_n),

        .ena               (precalc_n_coeff_ena),
        .rdy               (precalc_n_coeff_rdy),

        .n_bram_addr       (core_n4_addr),
        .n_coeff_bram_addr (core_n_coeff_addr_wr),

        .n_bram_out        (core_n4_data),

        .n_coeff_bram_in   (core_n_coeff_data_wr),
        .n_coeff_bram_wr   (core_n_coeff_wren),

        .n_num_words       (modulus_num_words_latch)
    );


    /*
     * Exponentiation module.
     */
    reg  exponent_ena = 1'b0;
    wire exponent_rdy;

    modexpa7_exponentiator #
    (
        .OPERAND_ADDR_WIDTH   (OPERAND_ADDR_WIDTH),
        .SYSTOLIC_ARRAY_POWER (SYSTOLIC_ARRAY_POWER)
    )
    exponent_r
    (
        .clk                (clk),
        .rst_n              (rst_n),

        .ena                (exponent_ena),
        .rdy                (exponent_rdy),

        .crt                (crt_mode),

        .m_bram_addr        (core_m_addr),
        .d_bram_addr        (core_d_addr),
        .f_bram_addr        (core_f_addr_rd),
        .n1_bram_addr       (core_n1_addr),
        .n2_bram_addr       (core_n2_addr),
        .n_coeff1_bram_addr (core_n_coeff1_addr_rd),
        .n_coeff2_bram_addr (core_n_coeff2_addr_rd),
        .r_bram_addr        (core_r_addr),

        .m_bram_out         (core_m_data),
        .d_bram_out         (core_d_data),
        .f_bram_out         (core_f_data_rd),
        .n1_bram_out        (core_n1_data),
        .n2_bram_out        (core_n2_data),
        .n_coeff1_bram_out  (core_n_coeff1_data_rd),
        .n_coeff2_bram_out  (core_n_coeff2_data_rd),

        .r_bram_in          (core_r_data),
        .r_bram_wr          (core_r_wren),

        .m_num_words        (modulus_num_words_latch),
        .d_num_bits         (exponent_num_bits_latch)
    );


    /*
     * Sub-Module Enable Logic
     *
     * The enables are registered from fsm_next_state, so each of them is high
     * for exactly the cycle during which fsm_state holds the matching *_START
     * state.
     */
    always @(posedge clk) begin
        precalc_f_ena       <= (fsm_next_state == FSM_STATE_PRECALC_START)  ? 1'b1 : 1'b0;
        precalc_n_coeff_ena <= (fsm_next_state == FSM_STATE_PRECALC_START)  ? 1'b1 : 1'b0;
        exponent_ena        <= (fsm_next_state == FSM_STATE_EXPONENT_START) ? 1'b1 : 1'b0;
    end


    /*
     * FSM Process
     */
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
        else               fsm_state <= fsm_next_state;


    /*
     * FSM Transition Logic
     */

    // handy flag that tells whether both pre-calculation modules are idle
    wire precalc_rdy = precalc_n_coeff_rdy && precalc_f_rdy;

    always @* begin
        // default assignment guarantees that no latch is inferred
        fsm_next_state = FSM_STATE_STOP;
        //
        case (fsm_state)
            //
            FSM_STATE_IDLE:             if (init_trig)      fsm_next_state = FSM_STATE_PRECALC_START;   // init has priority over next
                                        else if (next_trig) fsm_next_state = FSM_STATE_EXPONENT_START;
                                        else                fsm_next_state = FSM_STATE_IDLE;
            //
            FSM_STATE_PRECALC_START:                        fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
            FSM_STATE_PRECALC_CRUNCH:   if (precalc_rdy)    fsm_next_state = FSM_STATE_PRECALC_FINAL;
                                        else                fsm_next_state = FSM_STATE_PRECALC_CRUNCH;
            FSM_STATE_PRECALC_FINAL:                        fsm_next_state = FSM_STATE_STOP;
            //
            FSM_STATE_EXPONENT_START:                       fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
            FSM_STATE_EXPONENT_CRUNCH:  if (exponent_rdy)   fsm_next_state = FSM_STATE_EXPONENT_FINAL;
                                        else                fsm_next_state = FSM_STATE_EXPONENT_CRUNCH;
            FSM_STATE_EXPONENT_FINAL:                       fsm_next_state = FSM_STATE_STOP;
            //
            FSM_STATE_STOP:                                 fsm_next_state = FSM_STATE_IDLE;
            //
        endcase
        //
    end


    /*
     * Bus read mux.
     */

    // delay bus_addr_bank by 1 clock cycle to remember from where we've just been reading
    reg [1:0] bus_addr_bank_dly;
    always @(posedge clk)
        if (bus_cs) bus_addr_bank_dly <= bus_addr_bank;

    // map mux to output port
    reg [31:0] bus_data_rd_mux;
    assign bus_data_rd = bus_data_rd_mux;

    // select the right data word (all four bank codes are covered)
    always @(*)
        //
        case (bus_addr_bank_dly)
            //
            BANK_MODULUS:  bus_data_rd_mux = user_n_data;
            BANK_MESSAGE:  bus_data_rd_mux = user_m_data;
            BANK_EXPONENT: bus_data_rd_mux = user_d_data;
            BANK_RESULT:   bus_data_rd_mux = user_r_data;
            //
        endcase

endmodule