//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Top-level controller of the systolic Montgomery multiplier.
//
// One operation is started by a rising edge on `ena` while the FSM is idle.
// The FSM then runs the LOAD -> MULT sequence once per multiplication phase
// (A*B, AB*N_COEFF, Q*N), followed by an ADD stage (adder plus chained
// subtractor) and a SAVE stage, and finally raises `rdy` again.
//
// NOTE(review): the text of this file is damaged in the "Wide Operand
// Loader" section (see the marker there). Declarations and drivers of many
// signals referenced below (e.g. bram_addr_zero, bram_addr_last, a_addr,
// n_addr, s_addr, sn_addr, r_addr, s_wren, sn_wren, r_wren, flag_select_s,
// the qn_addr_ext_rd_dly* delays, the ab/q/qn/s/sn buffers and the
// b_bram_addr / n_bram_addr / r_bram_* output assignments) are not visible
// here. Comments about those signals are limited to how they are used in
// the visible code.
//
module modexpa7_systolic_multiplier #
    (
        //
        // This sets the address widths of memory buffers. Internal data
        // width is 32 bits, so for e.g. 2048-bit operands buffers must store
        // 2048 / 32 = 64 words, and these need 6-bit address bus, because
        // 2 ** 6 = 64.
        //
        parameter OPERAND_ADDR_WIDTH = 4,

        //
        // Width, in bits, of the inner loader counter (load_mult_cnt) and
        // the number of low bits of n_num_words that are dropped to form
        // the outer loader counter limit (load_syst_cnt_last).
        // NOTE(review): presumably log2 of the number of processing
        // elements (SYSTOLIC_ARRAY_LENGTH, which is not defined in this
        // file -- likely in modexpa7_settings.vh); confirm.
        //
        parameter SYSTOLIC_ARRAY_POWER = 2
    )
    (
        input clk,                  // all registers update on the rising edge
        input rst_n,                // asynchronous reset, active low

        input ena,                  // rising edge starts an operation when idle
        output rdy,                 // high when idle/finished, low while busy

        input reduce_only,          // latched at start; gates the array's `crt` input during the A*B phase

        output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,        // driven from a_addr
        output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,        // driver not visible in this text
        output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,        // driver not visible in this text
        output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,  // driven from a_addr (same address as A)
        output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,        // driver not visible in this text

        input [ 32-1:0] a_bram_out,         // A word, feeds the array in phase A_B
        input [ 32-1:0] b_bram_out,         // consumer not visible in this text
        input [ 32-1:0] n_bram_out,         // N word, feeds the subtractor's B input
        input [ 32-1:0] n_coeff_bram_out,   // N_COEFF word, feeds the array in phase AB_N_COEFF

        output [ 32-1:0] r_bram_in,         // driver not visible in this text
        output r_bram_wr,                   // driver not visible in this text

        // NOTE(review): used as a "last index" style limit (its upper bits
        // are compared for equality against the loader counter), so it is
        // presumably the operand word count minus one -- confirm with caller.
        input [OPERAND_ADDR_WIDTH-1:0] n_num_words
    );


    /*
     * Include Settings
     */
    // These headers are expected to provide the `CRYPTECH_PRIMITIVE_* macros
    // and the SYSTOLIC_ARRAY_LENGTH / SYSTOLIC_CNTR_WIDTH constants used
    // below (not defined anywhere in the visible text -- confirm).
    `include "cryptech_primitive_switch.vh"
    `include "modexpa7_settings.vh"


    /*
     * FSM Declaration
     */
    // High nibble groups the states by stage: 1x = load, 2x = multiply,
    // 3x = add/subtract, 4x = save.
    localparam [ 7: 0] FSM_STATE_IDLE = 8'h00;

    localparam [ 7: 0] FSM_STATE_LOAD_START = 8'h11;
    localparam [ 7: 0] FSM_STATE_LOAD_SHIFT = 8'h12;
    localparam [ 7: 0] FSM_STATE_LOAD_WRITE = 8'h13;
    localparam [ 7: 0] FSM_STATE_LOAD_FINAL = 8'h14;

    localparam [ 7: 0] FSM_STATE_MULT_START = 8'h21;
    localparam [ 7: 0] FSM_STATE_MULT_CRUNCH = 8'h22;
    localparam [ 7: 0] FSM_STATE_MULT_FINAL = 8'h23;

    localparam [ 7: 0] FSM_STATE_ADD_START = 8'h31;
    localparam [ 7: 0] FSM_STATE_ADD_CRUNCH = 8'h32;
    localparam [ 7: 0] FSM_STATE_ADD_UNLOAD = 8'h33;
    localparam [ 7: 0] FSM_STATE_SUB_UNLOAD = 8'h34;
    localparam [ 7: 0] FSM_STATE_ADD_FINAL = 8'h35;

    localparam [ 7: 0] FSM_STATE_SAVE_START = 8'h41;
    localparam [ 7: 0] FSM_STATE_SAVE_WRITE = 8'h42;
    localparam [ 7: 0] FSM_STATE_SAVE_FINAL = 8'h43;

    localparam [ 7: 0] FSM_STATE_STOP = 8'hFF;


    /*
     * FSM State / Next State / Previous State
     */
    reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
    reg [ 7: 0] fsm_next_state;     // combinational, see "FSM Transition Logic"
    //reg [ 7: 0] fsm_prev_state;


    /*
     * Enable Delay and Trigger
     */
    reg ena_dly = 1'b0;

    // delay enable by one clock cycle
    always @(posedge clk) ena_dly <= ena;

    // trigger new operation when enable goes high
    // (one-cycle pulse: ena is high now but was low on the previous clock)
    wire ena_trig = ena && !ena_dly;


    /*
     * Ready Flag Logic
     */
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)

        // reset flag
        if (rst_n == 1'b0) rdy_reg <= 1'b1;
        else begin

            // clear flag when operation is started
            // (while idle the flag simply tracks the inverse of the trigger)
            if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;

            // set flag after operation is finished
            if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;

        end


    /*
     * Parameters Latch
     */
    reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch;
    // one bit wider than n_num_words_latch; passed to the array as p_num_words
    reg [OPERAND_ADDR_WIDTH :0] p_num_words_latch;
    reg reduce_only_latch;

    // save number of words in n when new operation starts
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
            n_num_words_latch <= n_num_words;

    // save the reduce_only mode flag at the same moment
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_IDLE) && ena_trig)
            reduce_only_latch <= reduce_only;


    /*
     * Multiplication Phase
     */
    // The LOAD/MULT state sequence is executed once per phase, in the order
    // A_B -> AB_N_COEFF -> Q_N; STALL means all three phases are done.
    localparam [ 1: 0] MULT_PHASE_A_B = 2'd1;
    localparam [ 1: 0] MULT_PHASE_AB_N_COEFF = 2'd2;
    localparam [ 1: 0] MULT_PHASE_Q_N = 2'd3;
    localparam [ 1: 0] MULT_PHASE_STALL = 2'd0;

    reg [ 1: 0] mult_phase;

    wire mult_phase_ab = (mult_phase == MULT_PHASE_A_B) ? 1'b1 : 1'b0;
    wire mult_phase_done = (mult_phase == MULT_PHASE_STALL) ? 1'b1 : 1'b0;

    // The phase is keyed on fsm_next_state, so the new value is already in
    // place during the first cycle the FSM spends in the matching state:
    // mult_phase_done is therefore valid when FSM_STATE_MULT_FINAL tests it.
    always @(posedge clk)
        //
        case (fsm_next_state)
            // only (re)start at A_B when entering from IDLE via the trigger,
            // not when looping back from MULT_FINAL for the next phase
            FSM_STATE_LOAD_START: if (ena_trig) mult_phase <= MULT_PHASE_A_B;
            // advance to the following phase; STALL has no entry and so holds
            FSM_STATE_MULT_FINAL:
                case (mult_phase)
                    MULT_PHASE_A_B: mult_phase <= MULT_PHASE_AB_N_COEFF;
                    MULT_PHASE_AB_N_COEFF: mult_phase <= MULT_PHASE_Q_N;
                    MULT_PHASE_Q_N: mult_phase <= MULT_PHASE_STALL;
                endcase
        endcase


    /*
     * Counters
     */

    // handy values
    wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
    wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
    // inner counter runs over every value of its width (all-ones is last)
    wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};
    // outer counter limit is the latched word count with the low
    // SYSTOLIC_ARRAY_POWER bits dropped
    wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

    // counter
    reg [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt;
    reg [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt;

    // handy increment value and stop flag
    wire [SYSTOLIC_ARRAY_POWER-1:0] load_mult_cnt_next = load_mult_cnt + 1'b1;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] load_syst_cnt_next = load_syst_cnt + 1'b1;
    wire load_mult_cnt_done = (load_mult_cnt == load_mult_cnt_last) ? 1'b1 : 1'b0;
    wire load_syst_cnt_done = (load_syst_cnt == load_syst_cnt_last) ? 1'b1 : 1'b0;


    /*
     * Loader Count Logic
     */
    always @(posedge clk) begin
        //
        case (fsm_state)
            // clear both counters at the start of every load pass
            FSM_STATE_LOAD_START: {load_syst_cnt, load_mult_cnt} <= {load_syst_cnt_zero, load_mult_cnt_zero};
            // inner counter: one step per shift cycle, wraps naturally
            FSM_STATE_LOAD_SHIFT: load_mult_cnt <= load_mult_cnt_next;
            // outer counter: one step per write, saturates at the last value
            FSM_STATE_LOAD_WRITE: load_syst_cnt <= !load_syst_cnt_done ? load_syst_cnt_next : load_syst_cnt;
        endcase
        //
    end


    /*
     * Wide Operand Loader
     */

    /*
     * NOTE(review): the original placeholder here asked for an explanation
     * of how the parallelized loader works. The generate body that
     * implements it is missing from this text (see below), so no
     * description can be given from the visible code.
     */

    // loader input
    reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_wr;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_rd;  // connected to the PE array instance below
    reg loader_wren;
    reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1];

    // loader output
    wire [ 32-1:0] loader_dout[0:SYSTOLIC_ARRAY_LENGTH-1];

    // array_input
    // NOTE(review): pe_a_wide is declared but the array instance below is
    // fed a replicated a_data_out instead; no other use is visible here.
    wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] pe_a_wide;
    wire [32 * SYSTOLIC_ARRAY_LENGTH - 1 : 0] pe_b_wide;

    // generate parallelized loader
    genvar i;
    // NOTE(review): SOURCE TEXT IS DAMAGED ON THE NEXT STATEMENT. Everything
    // between the loop variable `i` of the for-condition and the expression
    // `{1'b0, bram_addr_last})` is missing -- it looks as if the text from a
    // `<` up to a later `>` was stripped (e.g. by HTML-style tag removal).
    // The lost span must be restored from a pristine copy of this file
    // before it can be compiled; the remaining tokens are kept verbatim.
    generate for (i=0; i {1'b0, bram_addr_last}) n_addr <= n_addr_next;
                end
        endcase
        //
    end


    /*
     * Multiplier Array
     */
    reg pe_array_ena;
    wire pe_array_rdy;

    // enable is registered from fsm_next_state, so it is high exactly while
    // the FSM is in FSM_STATE_MULT_START and low otherwise
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_MULT_START: pe_array_ena <= 1'b1;
            default: pe_array_ena <= 1'b0;
        endcase

    // Select the word-count value handed to the array for the current phase.
    // {n, 1'b1} is 2*n+1 and {1'b0, n} is n (zero-extended), where n is the
    // latched n_num_words.
    // NOTE(review): presumably full-width product for A*B and Q*N versus
    // half-width product for AB*N_COEFF -- confirm against the array module.
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_MULT_START)
            //
            case (mult_phase)
                MULT_PHASE_A_B: p_num_words_latch <= {n_num_words_latch, 1'b1};
                MULT_PHASE_AB_N_COEFF: p_num_words_latch <= {1'b0, n_num_words_latch};
                MULT_PHASE_Q_N: p_num_words_latch <= {n_num_words_latch, 1'b1};
            endcase

    // A, N_COEFF and Q are all read at the same address; the phase
    // multiplexer below picks which read data is actually used
    assign a_bram_addr = a_addr;
    assign n_coeff_bram_addr = a_addr;
    assign q_addr_rd = a_addr;

    reg [31: 0] a_data_out;

    // first-operand multiplexer, selected by the multiplication phase
    always @*
        //
        case (mult_phase)
            MULT_PHASE_A_B: a_data_out = a_bram_out;
            MULT_PHASE_AB_N_COEFF: a_data_out = n_coeff_bram_out;
            MULT_PHASE_Q_N: a_data_out = q_data_out;
            // STALL: value is a don't-care
            default: a_data_out = {32{1'bX}};
        endcase

    modexpa7_systolic_multiplier_array #
    (
        .OPERAND_ADDR_WIDTH (OPERAND_ADDR_WIDTH),
        .SYSTOLIC_ARRAY_POWER (SYSTOLIC_ARRAY_POWER)
    )
    systolic_pe_array
    (
        .clk (clk),
        .rst_n (rst_n),

        .ena (pe_array_ena),
        .rdy (pe_array_rdy),    // FSM leaves MULT_CRUNCH when this is high

        // asserted only during the A*B phase of a reduce_only operation
        .crt (reduce_only_latch && mult_phase_ab),

        .loader_addr_rd (loader_addr_rd),

        // the same 32-bit word is replicated to every 32-bit slice
        .pe_a_wide ({SYSTOLIC_ARRAY_LENGTH{a_data_out}}),
        .pe_b_wide (pe_b_wide),

        // NOTE(review): a_addr, p_addr_ext_wr, p_data_in and p_wren have no
        // other driver in the visible text, so these are presumably outputs
        // of the array -- confirm against the array module's port list.
        .a_bram_addr (a_addr),

        .p_bram_addr (p_addr_ext_wr),
        .p_bram_in (p_data_in),
        .p_bram_wr (p_wren),

        .n_num_words (n_num_words_latch),
        .p_num_words (p_num_words_latch)
    );


    /*
     * Adder
     */
    reg add1_ce;            // clock enable
    wire [31: 0] add1_s;    // sum output
    wire add1_c_in;         // carry input
    wire [31: 0] add1_a;    // A-input
    wire [31: 0] add1_b;    // B-input
    reg add1_c_in_mask;     // flag to not carry anything into the very first word
    wire add1_c_out;        // carry output

    `CRYPTECH_PRIMITIVE_ADD32_CE add1_inst
    (
        .clk (clk),
        .ce (add1_ce),
        .a (add1_a),
        .b (add1_b),
        .c_in (add1_c_in),
        .s (add1_s),
        .c_out (add1_c_out)
    );


    /*
     * Subtractor
     */
    reg sub1_ce;            // clock enable
    wire [31: 0] sub1_d;    // difference output
    wire sub1_b_in;         // borrow input
    wire [31: 0] sub1_a;    // A-input
    wire [31: 0] sub1_b;    // B-input
    reg sub1_b_in_mask;     // flag to not borrow anything from the very first word
    wire sub1_b_out;        // borrow output

    `CRYPTECH_PRIMITIVE_SUB32_CE sub1_inst
    (
        .clk (clk),
        .ce (sub1_ce),
        .a (sub1_a),
        .b (sub1_b),
        .b_in (sub1_b_in),
        .d (sub1_d),
        .b_out (sub1_b_out)
    );

    // add masking into carry feedback chain
    // (carry-out of the previous word is fed back as carry-in of the next)
    assign add1_c_in = add1_c_out & ~add1_c_in_mask;

    // add masking into borrow feedback chain
    assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;

    // mask carry for the very first words of AB and QN
    // (mask is high during the cycle that follows FSM_STATE_ADD_START)
    always @(posedge clk)
        //
        add1_c_in_mask <= (fsm_state == FSM_STATE_ADD_START) ? 1'b1 : 1'b0;

    // mask borrow for the very first words of S and N
    // (same mask, one clock later than the adder's)
    always @(posedge clk)
        //
        sub1_b_in_mask <= add1_c_in_mask;

    // map adder inputs
    assign add1_a = ab_data_out;
    assign add1_b = qn_data_out;

    // map subtractor inputs
    // The subtractor is chained behind the adder. While the twice-delayed
    // QN read address is at or below bram_addr_last, zero is subtracted;
    // above it, the N word is subtracted.
    assign sub1_a = add1_s;
    assign sub1_b = (qn_addr_ext_rd_dly2 <= {1'b0, bram_addr_last}) ? 32'd0 : n_bram_out;

    // clock enable
    always @(posedge clk) begin
        //
        case (fsm_state)
            FSM_STATE_ADD_START,
            FSM_STATE_ADD_CRUNCH: add1_ce <= 1'b1;
            default: add1_ce <= 1'b0;
        endcase
        // subtractor enable trails the adder enable by one clock
        sub1_ce <= add1_ce;
        //
    end

    // map outputs
    assign s_data_in = add1_s;
    assign sn_data_in = sub1_d;

    // write enabled
    always @(posedge clk) begin
        // S buffer: written with adder results
        case (fsm_state)
            FSM_STATE_ADD_CRUNCH,
            FSM_STATE_ADD_UNLOAD: s_wren <= 1'b1;
            default: s_wren <= 1'b0;
        endcase
        // SN buffer: follows s_wren with one clock of delay, matching the
        // extra stage of the subtractor
        case (fsm_state)
            FSM_STATE_ADD_CRUNCH,
            FSM_STATE_ADD_UNLOAD,
            FSM_STATE_SUB_UNLOAD,
            FSM_STATE_ADD_FINAL: sn_wren <= s_wren;
            default: sn_wren <= 1'b0;
        endcase
        // R (result) write strobe: active during the save stage
        case (fsm_state)
            FSM_STATE_SAVE_START,
            FSM_STATE_SAVE_WRITE: r_wren <= 1'b1;
            default: r_wren <= 1'b0;
        endcase
        //
    end

    // S / SN / R address counters
    always @(posedge clk) begin
        // s_addr: zeroed when the once-delayed QN read address is zero,
        // advanced once the twice-delayed address is above bram_addr_last;
        // rewound in ADD_FINAL and stepped through again while saving
        case (fsm_state)
            FSM_STATE_ADD_CRUNCH,
            FSM_STATE_ADD_UNLOAD: begin
                if (qn_addr_ext_rd_dly1 == {1'b0, bram_addr_zero}) s_addr <= bram_addr_zero;
                else if (qn_addr_ext_rd_dly2 > {1'b0, bram_addr_last}) s_addr <= s_addr_next;
            end
            FSM_STATE_ADD_FINAL: s_addr <= bram_addr_zero;
            FSM_STATE_SAVE_START,
            FSM_STATE_SAVE_WRITE: s_addr <= s_addr_next;
        endcase
        // sn_addr: same scheme as s_addr, using delays that are one clock
        // longer and remaining active through SUB_UNLOAD
        case (fsm_state)
            FSM_STATE_ADD_CRUNCH,
            FSM_STATE_ADD_UNLOAD,
            FSM_STATE_SUB_UNLOAD: begin
                if (qn_addr_ext_rd_dly2 == {1'b0, bram_addr_zero}) sn_addr <= bram_addr_zero;
                else if (qn_addr_ext_rd_dly3 > {1'b0, bram_addr_last}) sn_addr <= sn_addr_next;
            end
            FSM_STATE_ADD_FINAL: sn_addr <= bram_addr_zero;
            FSM_STATE_SAVE_START,
            FSM_STATE_SAVE_WRITE: sn_addr <= sn_addr_next;
        endcase
        // r_addr: starts at zero and increments once per SAVE_WRITE cycle
        case (fsm_state)
            FSM_STATE_SAVE_START: r_addr <= bram_addr_zero;
            FSM_STATE_SAVE_WRITE: r_addr <= r_addr_next;
        endcase
        //
    end


    /*
     * Flag Update Logic
     */
    // Captured in ADD_FINAL: set when the subtractor produced a borrow and
    // the adder produced no carry.
    // NOTE(review): presumably this means S < N, so the unreduced sum S is
    // selected as the result instead of S - N; the consumer of
    // flag_select_s is not visible in this text -- confirm.
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_ADD_FINAL)
            flag_select_s <= sub1_b_out & ~add1_c_out;


    /*
     * FSM Process
     */
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
        else fsm_state <= fsm_next_state;

    //always @(posedge clk)
        //
        //fsm_prev_state <= fsm_state;


    /*
     * FSM Transition Logic
     */
    always @* begin
        // default: any state value not listed below falls through to STOP,
        // which then returns to IDLE
        fsm_next_state = FSM_STATE_STOP;
        //
        case (fsm_state)
            // wait for the start trigger
            FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_START;
                            else fsm_next_state = FSM_STATE_IDLE;
            // load stage: nested loop, SHIFT is the inner loop and WRITE
            // closes one iteration of the outer loop
            FSM_STATE_LOAD_START: fsm_next_state = FSM_STATE_LOAD_SHIFT;
            FSM_STATE_LOAD_SHIFT: if (load_mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_WRITE;
                                  else fsm_next_state = FSM_STATE_LOAD_SHIFT;
            FSM_STATE_LOAD_WRITE: if (load_syst_cnt_done) fsm_next_state = FSM_STATE_LOAD_FINAL;
                                  else fsm_next_state = FSM_STATE_LOAD_SHIFT;
            FSM_STATE_LOAD_FINAL: fsm_next_state = FSM_STATE_MULT_START;
            // multiply stage: wait for the array, then either run the next
            // phase (back to LOAD_START) or continue to the add stage
            FSM_STATE_MULT_START: fsm_next_state = FSM_STATE_MULT_CRUNCH;
            FSM_STATE_MULT_CRUNCH: if (pe_array_rdy) fsm_next_state = FSM_STATE_MULT_FINAL;
                                   else fsm_next_state = FSM_STATE_MULT_CRUNCH;
            FSM_STATE_MULT_FINAL: if (mult_phase_done) fsm_next_state = FSM_STATE_ADD_START;
                                  else fsm_next_state = FSM_STATE_LOAD_START;
            // add stage: crunch until the AB read address is done, then
            // two single-cycle states follow before ADD_FINAL
            FSM_STATE_ADD_START: fsm_next_state = FSM_STATE_ADD_CRUNCH;
            FSM_STATE_ADD_CRUNCH: if (ab_addr_ext_rd_done) fsm_next_state = FSM_STATE_ADD_UNLOAD;
                                  else fsm_next_state = FSM_STATE_ADD_CRUNCH;
            FSM_STATE_ADD_UNLOAD: fsm_next_state = FSM_STATE_SUB_UNLOAD;
            FSM_STATE_SUB_UNLOAD: fsm_next_state = FSM_STATE_ADD_FINAL;
            FSM_STATE_ADD_FINAL: fsm_next_state = FSM_STATE_SAVE_START;
            // save stage: write until the S address counter is done
            FSM_STATE_SAVE_START: fsm_next_state = FSM_STATE_SAVE_WRITE;
            FSM_STATE_SAVE_WRITE: if (s_addr_done) fsm_next_state = FSM_STATE_SAVE_FINAL;
                                  else fsm_next_state = FSM_STATE_SAVE_WRITE;
            FSM_STATE_SAVE_FINAL: fsm_next_state = FSM_STATE_STOP;
            // STOP lasts one cycle (rdy is raised there), then back to IDLE
            FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE;
            //
        endcase
        //
    end


endmodule

//======================================================================
// End of file
//======================================================================