//======================================================================
//
// modexpa7_n_coeff.v
// -----------------------------------------------------------------------------
// Montgomery modulus-dependent coefficient calculation block.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Module overview (as implemented in this revision):
//
//   * A rising edge on `ena` starts the FSM; `rdy` drops while the FSM is
//     busy and is raised again when the FSM passes through STOP.
//   * INIT_1..INIT_5 stream the modulus words in from the external N
//     memory (n_bram_addr / n_bram_out), pass `~n_bram_out` through the
//     32-bit adder, and fill the internal NN, R and B memories.
//   * CALC_1..CALC_4 currently only step r_addr and then fall through to
//     STOP; the remaining calculation and store stages are commented out.
//
// Ports:
//   clk, rst_n         clock and active-low asynchronous reset (reset is
//                      applied to fsm_state and rdy_reg only)
//   ena, rdy           start request (edge-triggered) and ready flag
//   n_bram_addr        read address into the external modulus memory
//   n_bram_out         32-bit read data from the external modulus memory
//   n_coeff_bram_*     write port of the external coefficient memory;
//                      held inactive in this revision (see tie-offs below)
//   n_num_words        used directly as the address of the LAST operand
//                      word (compared against the running addresses)
//
module modexpa7_n_coeff #
    (
            //
            // This sets the address widths of memory buffers. Internal data
            // width is 32 bits, so for e.g. 1024-bit operands buffers must store
            // 1024 / 32 = 32 words, and these need 5-bit address bus, because
            // 2 ** 5 = 32.
            //
        parameter   OPERAND_ADDR_WIDTH = 5
    )
    (
        input                                   clk,
        input                                   rst_n,

        input                                   ena,
        output                                  rdy,

        output  [OPERAND_ADDR_WIDTH-1:0]        n_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        n_coeff_bram_addr,

        input   [                32-1:0]        n_bram_out,

        output  [                32-1:0]        n_coeff_bram_in,
        output                                  n_coeff_bram_wr,

        input   [OPERAND_ADDR_WIDTH-1:0]        n_num_words
    );


        //
        // FSM Declaration
        //
    localparam  [ 7: 0] FSM_STATE_IDLE      = 8'h00;

    localparam  [ 7: 0] FSM_STATE_INIT_1    = 8'hA1;
    localparam  [ 7: 0] FSM_STATE_INIT_2    = 8'hA2;
    localparam  [ 7: 0] FSM_STATE_INIT_3    = 8'hA3;
    localparam  [ 7: 0] FSM_STATE_INIT_4    = 8'hA4;
    localparam  [ 7: 0] FSM_STATE_INIT_5    = 8'hA5;

    localparam  [ 7: 0] FSM_STATE_CALC_1    = 8'hB1;
    localparam  [ 7: 0] FSM_STATE_CALC_2    = 8'hB2;
    localparam  [ 7: 0] FSM_STATE_CALC_3    = 8'hB3;
    localparam  [ 7: 0] FSM_STATE_CALC_4    = 8'hB4;
    /*
    localparam  [ 7: 0] FSM_STATE_CALC_5    = 8'hB5;
    localparam  [ 7: 0] FSM_STATE_CALC_6    = 8'hB6;
    localparam  [ 7: 0] FSM_STATE_CALC_7    = 8'hB7;
    localparam  [ 7: 0] FSM_STATE_CALC_8    = 8'hB8;

    localparam  [ 7: 0] FSM_STATE_SAVE_1    = 8'hC1;
    localparam  [ 7: 0] FSM_STATE_SAVE_2    = 8'hC2;
    localparam  [ 7: 0] FSM_STATE_SAVE_3    = 8'hC3;
    localparam  [ 7: 0] FSM_STATE_SAVE_4    = 8'hC4;
    localparam  [ 7: 0] FSM_STATE_SAVE_5    = 8'hC5;
    */
    localparam  [ 7: 0] FSM_STATE_STOP      = 8'hFF;

    reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
    reg [ 7: 0] fsm_next_state;


        //
        // Enable Delay (Trigger)
        //
        // ena_trig is high for the single cycle in which `ena` is high and
        // its one-cycle-delayed copy is still low, i.e. on a rising edge.
        //
    reg     ena_dly = 1'b0;
    wire    ena_trig = ena && !ena_dly;
    always @(posedge clk) ena_dly <= ena;


        //
        // Parameters Latch
        //
        // n_num_words is sampled only on the IDLE -> INIT_1 transition, so
        // changes on the input while the FSM is running are ignored.
        //
    reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;

    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_INIT_1)
            n_num_words_latch <= n_num_words;


        //
        // Addresses
        //
    localparam  [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}};
    wire        [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch;


    /*
        //
        // Cycle Counters
        //
    reg     [OPERAND_ADDR_WIDTH+5:0]    cyc_cnt;    // cycle counter

    wire    [OPERAND_ADDR_WIDTH+5:0]    cyc_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};
    wire    [OPERAND_ADDR_WIDTH+5:0]    cyc_cnt_last = {n_num_words, 1'b1, {5{1'b1}}};
    wire    [OPERAND_ADDR_WIDTH+5:0]    cyc_cnt_next = cyc_cnt + 1'b1;
    wire                                cyc_cnt_done = (cyc_cnt == cyc_cnt_last) ? 1'b1 : 1'b0;

    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_CALC_1)
            //
            case (fsm_state)
                FSM_STATE_INIT_2:   cyc_cnt <= cyc_cnt_zero;
                FSM_STATE_SAVE_5:   cyc_cnt <= cyc_cnt_done ? cyc_cnt : cyc_cnt_next;
            endcase
    */


        //
        // Ready Flag Logic
        //
        // rdy is cleared in IDLE on the start trigger and set again when
        // the FSM is in STOP; the STOP assignment is listed last so it wins
        // if both conditions were ever true in the same cycle.
        //
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  rdy_reg <= 1'b1;
        else begin
            if (fsm_state == FSM_STATE_IDLE)    rdy_reg <= ~ena_trig;
            if (fsm_state == FSM_STATE_STOP)    rdy_reg <= 1'b1;
        end


        //
        // Block Memories
        //
        // NOTE(review): t_addr_wr, t_data_in and t_wren are declared but
        // never assigned in this revision, so port A of bram_t is driven by
        // uninitialised registers. Left unchanged here; they need drivers
        // once the calculation stage is implemented.
        //
    reg [OPERAND_ADDR_WIDTH-1:0] n_addr;
    reg [OPERAND_ADDR_WIDTH-1:0] r_addr;
    reg [OPERAND_ADDR_WIDTH-1:0] b_addr;
    reg [OPERAND_ADDR_WIDTH-1:0] nn_addr;
    reg [OPERAND_ADDR_WIDTH-1:0] t_addr_wr;
    reg [OPERAND_ADDR_WIDTH-1:0] t_addr_rd;

    reg [31: 0] r_data_in;
    reg [31: 0] b_data_in;
    reg [31: 0] nn_data_in;
    reg [31: 0] t_data_in;

    wire    [31: 0] r_data_out;
    wire    [31: 0] b_data_out;
    wire    [31: 0] nn_data_out;
    wire    [31: 0] t_data_out;

    reg r_wren;
    reg b_wren;
    reg nn_wren;
    reg t_wren;

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_r (.clk(clk), .a_addr(r_addr), .a_wr(r_wren), .a_in(r_data_in), .a_out(r_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_b (.clk(clk), .a_addr(b_addr), .a_wr(b_wren), .a_in(b_data_in), .a_out(b_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_nn (.clk(clk), .a_addr(nn_addr), .a_wr(nn_wren), .a_in(nn_data_in), .a_out(nn_data_out));

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_t (.clk(clk), .a_addr(t_addr_wr), .a_wr(t_wren), .a_in(t_data_in), .a_out(),
                       .b_addr(t_addr_rd), .b_out(t_data_out));

    assign n_bram_addr = n_addr;

        //
        // Coefficient Memory Tie-Offs
        //
        // The stages that would write the result are not implemented in
        // this revision, so nothing drove these outputs and they floated.
        // Hold the write port explicitly inactive instead: write enable
        // low, address and data at zero.
        //
        // TODO: replace with the real drivers when the store stage is added.
        //
    assign n_coeff_bram_addr = bram_addr_zero;
    assign n_coeff_bram_in   = {32{1'b0}};
    assign n_coeff_bram_wr   = 1'b0;

    wire    [OPERAND_ADDR_WIDTH-1:0]    n_addr_next     = n_addr    + 1'b1;
    wire    [OPERAND_ADDR_WIDTH-1:0]    r_addr_next     = r_addr    + 1'b1;
    wire    [OPERAND_ADDR_WIDTH-1:0]    b_addr_next     = b_addr    + 1'b1;
    wire    [OPERAND_ADDR_WIDTH-1:0]    nn_addr_next    = nn_addr   + 1'b1;
    wire    [OPERAND_ADDR_WIDTH-1:0]    t_addr_wr_next  = t_addr_wr + 1'b1;
    wire    [OPERAND_ADDR_WIDTH-1:0]    t_addr_rd_next  = t_addr_rd + 1'b1;

    wire    n_addr_done     = (n_addr    == bram_addr_last) ? 1'b1 : 1'b0;
    wire    r_addr_done     = (r_addr    == bram_addr_last) ? 1'b1 : 1'b0;
    wire    b_addr_done     = (b_addr    == bram_addr_last) ? 1'b1 : 1'b0;
    wire    nn_addr_done    = (nn_addr   == bram_addr_last) ? 1'b1 : 1'b0;
    wire    t_addr_wr_done  = (t_addr_wr == bram_addr_last) ? 1'b1 : 1'b0;
    wire    t_addr_rd_done  = (t_addr_rd == bram_addr_last) ? 1'b1 : 1'b0;


        //
        // Subtractor
        //
        // The adder is fed the bitwise complement of the modulus word plus
        // a one-bit operand that is 1 only in the cycle following
        // fsm_next_state == INIT_2; the carry-in is the previous carry-out,
        // masked to 0 in that same cycle. Presumably this forms the two's
        // complement (~N + 1) word by word -- confirm against ip_add32,
        // whose internals are not visible here.
        //
    wire    [31: 0] add_s;
    wire            add_c_in;
    reg             add_b_lsb;
    reg             add_c_in_mask;
    reg             add_c_in_mask_dly;
    wire            add_c_out;

    assign add_c_in = add_c_out & ~add_c_in_mask;

    always @(posedge clk)
        //
        add_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;

    always @(posedge clk)
        //
        add_b_lsb <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;

    always @(posedge clk)
        //
        add_c_in_mask_dly <= add_c_in_mask;

    ip_add32 add_inst
    (
        .clk    (clk),
        .a      (~n_bram_out),
        .b      ({{31{1'b0}}, add_b_lsb}),
        .c_in   (add_c_in),
        .s      (add_s),
        .c_out  (add_c_out)
    );


        //
        // Multiplier
        //
        // NOTE(review): pe_a, pe_b, pe_t and pe_c_in are never assigned in
        // this revision and pe_p / pe_c_out are never read.
        //
    reg     [31: 0] pe_a;
    reg     [31: 0] pe_b;
    reg     [31: 0] pe_t;
    reg     [31: 0] pe_c_in;
    wire    [31: 0] pe_p;
    wire    [31: 0] pe_c_out;

    modexpa7_pe_mul pe2
    (
        .clk    (clk),
        .a      (pe_a),
        .b      (pe_b),
        .t      (pe_t),
        .c_in   (pe_c_in),
        .p      (pe_p),
        .c_out  (pe_c_out)
    );


    /*
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_CALC_2:       f0_data_out_carry <= 1'b0;
            FSM_STATE_CALC_3,
            FSM_STATE_CALC_4,
            FSM_STATE_CALC_5,
            FSM_STATE_CALC_6:       f0_data_out_carry <= f0_data_out[31];
            default:                f0_data_out_carry <= 1'bX;
        endcase
    */

    /*
    reg sub_b_out_dly1;
    reg f0_data_out_carry_dly1;
    reg f0_data_out_carry_dly2;

    always @(posedge clk) sub_b_out_dly1 <= sub_b_out;
    always @(posedge clk) f0_data_out_carry_dly1 <= f0_data_out_carry;
    always @(posedge clk) f0_data_out_carry_dly2 <= f0_data_out_carry_dly1;

    reg flag_keep_f;

    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_SAVE_1)
            flag_keep_f <= sub_b_out_dly1 & ~f0_data_out_carry_dly2;
    */

        // Read address of T is the (truncated) sum of the R and NN addresses.
    always @* t_addr_rd = r_addr + nn_addr;


        //
        // Address Registers
        //
        // All updates are keyed on fsm_next_state, so each register holds
        // its new value during the state it is listed for.
        //
    always @(posedge clk) begin
        //
        // N read address: reset on entry, then advance and saturate at the
        // last word.
        //
        case (fsm_next_state)
            FSM_STATE_INIT_1:       n_addr <= bram_addr_zero;
            FSM_STATE_INIT_2,
            FSM_STATE_INIT_3,
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       n_addr <= !n_addr_done ? n_addr_next : n_addr;
        endcase
        //
        // NN write address: zero in INIT_4, incrementing through INIT_5,
        // rewound to zero when leaving INIT_5 for CALC_1.
        //
        // NOTE(review): nn_addr is already 1 when nn_addr_done is first
        // evaluated in INIT_5, so with n_num_words == 0 the fill does not
        // stop after one word -- confirm the minimum supported word count.
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4:       nn_addr <= bram_addr_zero;
            FSM_STATE_INIT_5:       nn_addr <= nn_addr_next;
            FSM_STATE_CALC_1:
                case (fsm_state)
                    FSM_STATE_INIT_5:   nn_addr <= bram_addr_zero;
                endcase
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4:       r_addr <= bram_addr_zero;
            FSM_STATE_INIT_5:       r_addr <= r_addr_next;
            FSM_STATE_CALC_1:       r_addr <= bram_addr_zero;
            FSM_STATE_CALC_2,
            FSM_STATE_CALC_3,
            FSM_STATE_CALC_4:       r_addr <= r_addr_next;
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4:       b_addr <= bram_addr_zero;
            FSM_STATE_INIT_5:       b_addr <= b_addr_next;
        endcase
        //
    end


        //
        // Write Enables
        //
        // NN, R and B are written only during INIT_4 / INIT_5.
        //
    always @(posedge clk) begin
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       nn_wren <= 1'b1;
            default:                nn_wren <= 1'b0;
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       r_wren <= 1'b1;
            default:                r_wren <= 1'b0;
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       b_wren <= 1'b1;
            default:                b_wren <= 1'b0;
        endcase
        /*
        case (fsm_next_state)
            FSM_STATE_SAVE_3,
            FSM_STATE_SAVE_4,
            FSM_STATE_SAVE_5:       f_wren <= cyc_cnt_done;
            default:                f_wren <= 1'b0;
        endcase
        */
    end


        //
        // Write Data
        //
        // NN receives the adder sum. R and B receive a word whose LSB is the
        // delayed carry mask: that bit is 1 for the first write (address 0)
        // and 0 for the following ones. Outside INIT_4 / INIT_5 the data
        // registers are deliberately driven to X (don't care).
        //
    always @(posedge clk) begin
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       nn_data_in <= add_s;
            default:                nn_data_in <= {32{1'bX}};
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       r_data_in <= {{31{1'b0}}, add_c_in_mask_dly};
            default:                r_data_in <= {32{1'bX}};
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_INIT_4,
            FSM_STATE_INIT_5:       b_data_in <= {{31{1'b0}}, add_c_in_mask_dly};
            default:                b_data_in <= {32{1'bX}};
        endcase
        /*
        case (fsm_next_state)
            FSM_STATE_CALC_3,
            FSM_STATE_CALC_4,
            FSM_STATE_CALC_5,
            FSM_STATE_CALC_6:       f1_data_in <= f0_data_out_shifted;
            default:                f1_data_in <= {32{1'bX}};
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_CALC_5,
            FSM_STATE_CALC_6,
            FSM_STATE_CALC_7,
            FSM_STATE_CALC_8:       f2_data_in <= sub_d;
            default:                f2_data_in <= {32{1'bX}};
        endcase
        //
        case (fsm_next_state)
            FSM_STATE_SAVE_3,
            FSM_STATE_SAVE_4,
            FSM_STATE_SAVE_5:       f_data_in <= flag_keep_f ? f1_data_out : f2_data_out;
            default:                f_data_in <= {32{1'bX}};
        endcase
        */
    end


        //
        // FSM Transition Logic
        //
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  fsm_state <= FSM_STATE_IDLE;
        else                fsm_state <= fsm_next_state;

    always @* begin
        //
        // Default covers any state encoding not listed below: go to STOP,
        // which raises rdy and returns to IDLE.
        //
        fsm_next_state = FSM_STATE_STOP;
        //
        case (fsm_state)

            FSM_STATE_IDLE:     if (ena_trig)       fsm_next_state = FSM_STATE_INIT_1;
                                else                fsm_next_state = FSM_STATE_IDLE;

            FSM_STATE_INIT_1:                       fsm_next_state = FSM_STATE_INIT_2;
            FSM_STATE_INIT_2:                       fsm_next_state = FSM_STATE_INIT_3;
            FSM_STATE_INIT_3:                       fsm_next_state = FSM_STATE_INIT_4;
            FSM_STATE_INIT_4:                       fsm_next_state = FSM_STATE_INIT_5;
            FSM_STATE_INIT_5:   if (nn_addr_done)   fsm_next_state = FSM_STATE_CALC_1;
                                else                fsm_next_state = FSM_STATE_INIT_5;

            FSM_STATE_CALC_1:                       fsm_next_state = FSM_STATE_CALC_2;
            FSM_STATE_CALC_2:                       fsm_next_state = FSM_STATE_CALC_3;
            FSM_STATE_CALC_3:                       fsm_next_state = FSM_STATE_CALC_4;
            FSM_STATE_CALC_4:                       fsm_next_state = FSM_STATE_STOP;//FSM_STATE_CALC_5;
            /*
            FSM_STATE_CALC_5:                       fsm_next_state = FSM_STATE_CALC_6;
            FSM_STATE_CALC_6:   if (f1_addr_done)   fsm_next_state = FSM_STATE_CALC_7;
                                else                fsm_next_state = FSM_STATE_CALC_6;
            FSM_STATE_CALC_7:                       fsm_next_state = FSM_STATE_CALC_8;
            FSM_STATE_CALC_8:                       fsm_next_state = FSM_STATE_SAVE_1;

            FSM_STATE_SAVE_1:                       fsm_next_state = FSM_STATE_SAVE_2;
            FSM_STATE_SAVE_2:                       fsm_next_state = FSM_STATE_SAVE_3;
            FSM_STATE_SAVE_3:                       fsm_next_state = FSM_STATE_SAVE_4;
            FSM_STATE_SAVE_4:   if (f12_addr_done_dly)  fsm_next_state = FSM_STATE_SAVE_5;
                                else                    fsm_next_state = FSM_STATE_SAVE_4;
            FSM_STATE_SAVE_5:   if (cyc_cnt_done)   fsm_next_state = FSM_STATE_STOP;
                                else                fsm_next_state = FSM_STATE_CALC_1;
            */
            FSM_STATE_STOP:                         fsm_next_state = FSM_STATE_IDLE;

        endcase
    end


endmodule

//======================================================================
// End of file
//======================================================================