//======================================================================
//
// modexpa7_exponentiator.v
// -----------------------------------------------------------------------------
// Modular Montgomery Exponentiator.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Controller that sequences two systolic Montgomery multipliers (mul_pp and
// mul_tp) and a set of internal 32-bit word memories to raise the message M
// to the exponent D, writing the result to the external R memory.
//
// An operation starts on the rising edge of `ena` while the FSM is idle.
// `crt` is sampled at that moment and selects which pre-processing sequence
// runs before the exponentiation loop (MUL_* when low, CRT_*_A/B/C when
// high).  `rdy` is high while idle, goes low when the operation starts and
// returns high when the FSM passes through the STOP state.
//
// The exponent is scanned starting from bit 0 upwards, one bit per loop
// iteration (bit_cnt counts from zero up to d_num_bits).
//
module modexpa7_exponentiator #
    (
            //
            // This sets the address widths of memory buffers. Internal data
            // width is 32 bits, so for e.g. 2048-bit operands buffers must store
            // 2048 / 32 = 64 words, and these need 6-bit address bus, because
            // 2 ** 6 = 64.
            //
        parameter   OPERAND_ADDR_WIDTH      = 4,

            //
            // Passed through unchanged to both modexpa7_systolic_multiplier
            // instances; its meaning is defined by that module.
            // NOTE(review): presumably log2 of the systolic array size -
            // confirm against the multiplier source.
            //
        parameter   SYSTOLIC_ARRAY_POWER    = 2
    )
    (
            /* clock, active-low asynchronous reset (resets fsm_state and rdy) */
        input                                   clk,
        input                                   rst_n,

            /* rising edge of ena starts an operation, rdy reports completion */
        input                                   ena,
        output                                  rdy,

            /* sampled when the operation starts: 1 selects the CRT_* pre-processing path */
        input                                   crt,

            /* read addresses of external operand memories (n*, n_coeff* are driven by the multipliers) */
        output  [OPERAND_ADDR_WIDTH-1:0]        m_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        d_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        f_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        n1_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        n2_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        n_coeff1_bram_addr,
        output  [OPERAND_ADDR_WIDTH-1:0]        n_coeff2_bram_addr,

            /* write address of the external result memory */
        output  [OPERAND_ADDR_WIDTH-1:0]        r_bram_addr,

            /* read data of external operand memories */
        input   [                32-1:0]        m_bram_out,
        input   [                32-1:0]        d_bram_out,
        input   [                32-1:0]        f_bram_out,
        input   [                32-1:0]        n1_bram_out,
        input   [                32-1:0]        n2_bram_out,
        input   [                32-1:0]        n_coeff1_bram_out,
        input   [                32-1:0]        n_coeff2_bram_out,

            /* write data and write enable of the external result memory */
        output  [                32-1:0]        r_bram_in,
        output                                  r_bram_wr,

            /*
             * m_num_words is used directly as the address of the last operand
             * word, d_num_bits is used directly as the index of the last
             * exponent bit to process (both are compared with "==" against the
             * running counters, so they are "count minus one" style values).
             * d_num_bits is 5 bits wider than an address: the upper bits select
             * a word of D, the lower 5 bits select a bit within that word.
             */
        input   [OPERAND_ADDR_WIDTH-1:0]        m_num_words,
        input   [OPERAND_ADDR_WIDTH+4:0]        d_num_bits
    );


        //
        // FSM Declaration
        //
        // State groups: EXP_* is the exponentiation loop, MUL_* is the
        // pre-processing used when crt is low, CRT_*_A/B/C is the
        // pre-processing used when crt is high.
        //
    localparam  [ 7: 0] FSM_STATE_EXP_IDLE          = 8'h00;
        //
    localparam  [ 7: 0] FSM_STATE_EXP_INIT_1        = 8'hA1;
    localparam  [ 7: 0] FSM_STATE_EXP_INIT_2        = 8'hA2;
    localparam  [ 7: 0] FSM_STATE_EXP_INIT_3        = 8'hA3;
    localparam  [ 7: 0] FSM_STATE_EXP_INIT_4        = 8'hA4;

    localparam  [ 7: 0] FSM_STATE_EXP_LOAD_1        = 8'hB1;
    localparam  [ 7: 0] FSM_STATE_EXP_LOAD_2        = 8'hB2;
    localparam  [ 7: 0] FSM_STATE_EXP_LOAD_3        = 8'hB3;
    localparam  [ 7: 0] FSM_STATE_EXP_LOAD_4        = 8'hB4;

    localparam  [ 7: 0] FSM_STATE_EXP_CALC_1        = 8'hC1;
    localparam  [ 7: 0] FSM_STATE_EXP_CALC_2        = 8'hC2;
    localparam  [ 7: 0] FSM_STATE_EXP_CALC_3        = 8'hC3;

    localparam  [ 7: 0] FSM_STATE_EXP_FILL_1        = 8'hD1;
    localparam  [ 7: 0] FSM_STATE_EXP_FILL_2        = 8'hD2;
    localparam  [ 7: 0] FSM_STATE_EXP_FILL_3        = 8'hD3;
    localparam  [ 7: 0] FSM_STATE_EXP_FILL_4        = 8'hD4;

    localparam  [ 7: 0] FSM_STATE_EXP_NEXT          = 8'hE0;

    localparam  [ 7: 0] FSM_STATE_EXP_SAVE_1        = 8'hF1;
    localparam  [ 7: 0] FSM_STATE_EXP_SAVE_2        = 8'hF2;
    localparam  [ 7: 0] FSM_STATE_EXP_SAVE_3        = 8'hF3;
    localparam  [ 7: 0] FSM_STATE_EXP_SAVE_4        = 8'hF4;
        //
    localparam  [ 7: 0] FSM_STATE_MUL_INIT_1        = 8'h11;
    localparam  [ 7: 0] FSM_STATE_MUL_INIT_2        = 8'h12;
    localparam  [ 7: 0] FSM_STATE_MUL_INIT_3        = 8'h13;
    localparam  [ 7: 0] FSM_STATE_MUL_INIT_4        = 8'h14;

    localparam  [ 7: 0] FSM_STATE_MUL_CALC_1        = 8'h21;
    localparam  [ 7: 0] FSM_STATE_MUL_CALC_2        = 8'h22;
    localparam  [ 7: 0] FSM_STATE_MUL_CALC_3        = 8'h23;
        //
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_A_1      = 8'h31;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_A_2      = 8'h32;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_A_3      = 8'h33;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_A_4      = 8'h34;

    localparam  [ 7: 0] FSM_STATE_CRT_CALC_A_1      = 8'h41;
    localparam  [ 7: 0] FSM_STATE_CRT_CALC_A_2      = 8'h42;
    localparam  [ 7: 0] FSM_STATE_CRT_CALC_A_3      = 8'h43;
        //
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_B_1      = 8'h51;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_B_2      = 8'h52;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_B_3      = 8'h53;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_B_4      = 8'h54;

    localparam  [ 7: 0] FSM_STATE_CRT_CALC_B_1      = 8'h61;
    localparam  [ 7: 0] FSM_STATE_CRT_CALC_B_2      = 8'h62;
    localparam  [ 7: 0] FSM_STATE_CRT_CALC_B_3      = 8'h63;
        //
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_C_1      = 8'h71;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_C_2      = 8'h72;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_C_3      = 8'h73;
    localparam  [ 7: 0] FSM_STATE_CRT_INIT_C_4      = 8'h74;

    localparam  [ 7: 0] FSM_STATE_CRT_CALC_C_1      = 8'h81;
    localparam  [ 7: 0] FSM_STATE_CRT_CALC_C_2      = 8'h82;
    localparam  [ 7: 0] FSM_STATE_CRT_CALC_C_3      = 8'h83;
        //
    localparam  [ 7: 0] FSM_STATE_EXP_STOP          = 8'hFF;


    /*
     * Data movement performed by each group of states.
     *
     * "<=" is a word-by-word memory copy done by this module,
     * "="  is a product computed by one of the two multipliers.
     *
     * //
     *
     * MUL_INIT:    P1 <= F
     *              P2 <= F
     *              P3 <= F
     *              T2 <= M
     *
     * MUL_CALC:    TP = T2 * P3
     *
     * //
     *
     * CRT_INIT_A:  T2 <= M
     *
     * CRT_CALC_A:  TP = T2 * P3 ("reduce only")
     *
     * CRT_INIT_B:  P1 <= F
     *              P2 <= F
     *              P3 <= F
     *              T2 <= TP
     *
     * CRT_CALC_B:  TP = T2 * P3
     *
     * CRT_INIT_C:  T2 <= TP
     *
     * CRT_CALC_C:  TP = T2 * P3
     *
     * //
     *
     * EXP_INIT:    P1 <= TP
     *              P2 <= TP
     *              P3 <= TP
     *              T1 <= 1
     *              T2 <= 1
     *
     * EXP_LOAD:    T0 <= T1
     *
     * EXP_CALC:    PP = P1 * P2
     *              TP = T2 * P3
     *
     * EXP_FILL:    P1 <= PP
     *              P2 <= PP
     *              P3 <= PP
     *              T1 <= D[i] ? TP : T0
     *              T2 <= D[i] ? TP : T0
     *
     * EXP_SAVE:    R <= T1
     *
     * //
     *
     */


        //
        // FSM State / Next State
        //
    reg [ 7: 0] fsm_state = FSM_STATE_EXP_IDLE;
    reg [ 7: 0] fsm_next_state;


        //
        // Enable Delay and Trigger
        //
    reg ena_dly = 1'b0;

        /* delay enable by one clock cycle */
    always @(posedge clk) ena_dly <= ena;

        /* trigger new operation when enable goes high */
    wire ena_trig = ena && !ena_dly;


        //
        // Ready Flag Logic
        //
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)

            /* reset flag */
        if (rst_n == 1'b0) rdy_reg <= 1'b1;
        else begin

                /* clear flag when operation is started */
            if (fsm_state == FSM_STATE_EXP_IDLE) rdy_reg <= ~ena_trig;

                /* set flag after operation is finished */
            if (fsm_state == FSM_STATE_EXP_STOP) rdy_reg <= 1'b1;

        end


        //
        // Parameters Latch
        //
    reg [OPERAND_ADDR_WIDTH-1:0] m_num_words_latch;
    reg [OPERAND_ADDR_WIDTH+4:0] d_num_bits_latch;

        /* latch the operand size (m_num_words) and exponent size (d_num_bits) when new operation starts */
    always @(posedge clk)
        //
        if ((fsm_state == FSM_STATE_EXP_IDLE) && ena_trig)
            {m_num_words_latch, d_num_bits_latch} <= {m_num_words, d_num_bits};


        //
        // Block Memory Addresses
        //

    /*
     * Memories addressed from here:
     *
     *   M, D, F    - external read-only operand memories (message, exponent
     *                and the value copied into P1..P3 before the first
     *                multiplication, see the table above)
     *   R          - external result memory, written from T1 in EXP_SAVE
     *   T0, T1     - internal single-port memories; T1 holds the running
     *                result, T0 receives a copy of T1 in EXP_LOAD
     *   T2         - internal dual-port memory; written here, read by mul_tp
     *   P1, P2, P3 - internal dual-port memories sharing one write port
     *                (same address, data and write enable), so they always
     *                hold identical contents; P1/P2 are read by mul_pp,
     *                P3 is read by mul_tp
     *   PP, TP     - internal dual-port memories written by mul_pp / mul_tp
     *                and read back here
     *
     */

        /* the very first addresses */
    wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {{OPERAND_ADDR_WIDTH{1'b0}}};

        /* the very last addresses */
        /* (the _crt variant equals 2 * m_num_words_latch + 1 truncated to the address width) */
    wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last     = {m_num_words_latch};
    wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last_crt = {m_num_words_latch[OPERAND_ADDR_WIDTH-2:0], 1'b1};

        /* address registers */
        /* (the "wire" addresses are driven by the multiplier instances below) */
    reg  [OPERAND_ADDR_WIDTH-1:0] m_addr;
    reg  [OPERAND_ADDR_WIDTH-1:0] d_addr;
    reg  [OPERAND_ADDR_WIDTH-1:0] f_addr;
    reg  [OPERAND_ADDR_WIDTH-1:0] r_addr;
    reg  [OPERAND_ADDR_WIDTH-1:0] t0_addr;
    reg  [OPERAND_ADDR_WIDTH-1:0] t1_addr;
    reg  [OPERAND_ADDR_WIDTH-1:0] t2_addr_wr;
    wire [OPERAND_ADDR_WIDTH-1:0] t2_addr_rd;
    reg  [OPERAND_ADDR_WIDTH-1:0] p_addr_wr;
    wire [OPERAND_ADDR_WIDTH-1:0] p1_addr_rd;
    wire [OPERAND_ADDR_WIDTH-1:0] p2_addr_rd;
    wire [OPERAND_ADDR_WIDTH-1:0] p3_addr_rd;
    wire [OPERAND_ADDR_WIDTH-1:0] pp_addr_wr;
    reg  [OPERAND_ADDR_WIDTH-1:0] pp_addr_rd;
    wire [OPERAND_ADDR_WIDTH-1:0] tp_addr_wr;
    reg  [OPERAND_ADDR_WIDTH-1:0] tp_addr_rd;

        /* handy increment values */
        /* NOTE(review): d_addr_next is not referenced anywhere in this module */
    wire [OPERAND_ADDR_WIDTH-1:0] m_addr_next     = m_addr     + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] d_addr_next     = d_addr     + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] f_addr_next     = f_addr     + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] r_addr_next     = r_addr     + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] t0_addr_next    = t0_addr    + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] t1_addr_next    = t1_addr    + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] t2_addr_wr_next = t2_addr_wr + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] p_addr_wr_next  = p_addr_wr  + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] pp_addr_rd_next = pp_addr_rd + 1'b1;
    wire [OPERAND_ADDR_WIDTH-1:0] tp_addr_rd_next = tp_addr_rd + 1'b1;

        /* handy stop flags */
        /* NOTE(review): d_addr_done is not referenced anywhere in this module */
    wire m_addr_done         = (m_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
    wire m_addr_done_crt     = (m_addr     == bram_addr_last_crt) ? 1'b1 : 1'b0;
    wire d_addr_done         = (d_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
    wire f_addr_done         = (f_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
    wire r_addr_done         = (r_addr     == bram_addr_last)     ? 1'b1 : 1'b0;
    wire t0_addr_done        = (t0_addr    == bram_addr_last)     ? 1'b1 : 1'b0;
    wire t1_addr_done        = (t1_addr    == bram_addr_last)     ? 1'b1 : 1'b0;
    wire t2_addr_wr_done     = (t2_addr_wr == bram_addr_last)     ? 1'b1 : 1'b0;
    wire t2_addr_wr_done_crt = (t2_addr_wr == bram_addr_last_crt) ? 1'b1 : 1'b0;
    wire p_addr_wr_done      = (p_addr_wr  == bram_addr_last)     ? 1'b1 : 1'b0;
    wire pp_addr_rd_done     = (pp_addr_rd == bram_addr_last)     ? 1'b1 : 1'b0;
    wire tp_addr_rd_done     = (tp_addr_rd == bram_addr_last)     ? 1'b1 : 1'b0;

        /* map registers to top-level ports */
    assign m_bram_addr = m_addr;
    assign d_bram_addr = d_addr;
    assign f_bram_addr = f_addr;
    assign r_bram_addr = r_addr;


        //
        // Internal Memories
        //

        /* memory inputs */
        /* (pp_data_in and tp_data_in are driven by the multipliers) */
    reg  [31: 0] r_data_in;
    reg  [31: 0] t0_data_in;
    reg  [31: 0] t1_data_in;
    reg  [31: 0] t2_data_in;
    reg  [31: 0] p_data_in;
    wire [31: 0] pp_data_in;
    wire [31: 0] tp_data_in;

        /* memory outputs */
    wire [31: 0] t0_data_out;
    wire [31: 0] t1_data_out;
    wire [31: 0] t2_data_out;
    wire [31: 0] p1_data_out;
    wire [31: 0] p2_data_out;
    wire [31: 0] p3_data_out;
    wire [31: 0] pp_data_out;
    wire [31: 0] tp_data_out;

        /* write enables */
        /* (pp_wren and tp_wren are driven by the multipliers) */
    reg  r_wren;
    reg  t0_wren;
    reg  t1_wren;
    reg  t2_wren;
    reg  p_wren;
    wire pp_wren;
    wire tp_wren;

        /* map */
    assign r_bram_in = r_data_in;
    assign r_bram_wr = r_wren;

        /* T0, T1: one shared address for reading and writing */
    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_t0 (.clk(clk), .a_addr(t0_addr), .a_wr(t0_wren), .a_in(t0_data_in), .a_out(t0_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_t1 (.clk(clk), .a_addr(t1_addr), .a_wr(t1_wren), .a_in(t1_data_in), .a_out(t1_data_out));

        /* T2: port A is write-only here (a_out unconnected), port B is read by mul_tp */
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_t2 (.clk(clk), .a_addr(t2_addr_wr), .a_wr(t2_wren), .a_in(t2_data_in), .a_out(), .b_addr(t2_addr_rd), .b_out(t2_data_out));

        /* P1, P2, P3: identical write ports, independent read ports */
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_p1 (.clk(clk), .a_addr(p_addr_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(), .b_addr(p1_addr_rd), .b_out(p1_data_out));

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_p2 (.clk(clk), .a_addr(p_addr_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(), .b_addr(p2_addr_rd), .b_out(p2_data_out));

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_p3 (.clk(clk), .a_addr(p_addr_wr), .a_wr(p_wren), .a_in(p_data_in), .a_out(), .b_addr(p3_addr_rd), .b_out(p3_data_out));

        /* PP, TP: port A is written by the multipliers, port B is read here */
    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_pp (.clk(clk), .a_addr(pp_addr_wr), .a_wr(pp_wren), .a_in(pp_data_in), .a_out(), .b_addr(pp_addr_rd), .b_out(pp_data_out));

    bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_tp (.clk(clk), .a_addr(tp_addr_wr), .a_wr(tp_wren), .a_in(tp_data_in), .a_out(), .b_addr(tp_addr_rd), .b_out(tp_data_out));


        //
        // Bit Counter
        //
        // Index of the exponent bit handled by the current loop iteration.
        //
    reg  [OPERAND_ADDR_WIDTH+4:0] bit_cnt;
    wire [OPERAND_ADDR_WIDTH+4:0] bit_cnt_zero = {{OPERAND_ADDR_WIDTH{1'b0}}, {5{1'b0}}};
    wire [OPERAND_ADDR_WIDTH+4:0] bit_cnt_last = d_num_bits_latch;
    wire [OPERAND_ADDR_WIDTH+4:0] bit_cnt_next = bit_cnt + 1'b1;

        /* handy flag */
    wire bit_cnt_done = (bit_cnt == bit_cnt_last) ? 1'b1 : 1'b0;

        /* updated only while entering EXP_LOAD_1: cleared when coming from */
        /* EXP_INIT_4 (first iteration), advanced when coming from EXP_NEXT */
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_EXP_LOAD_1)
            //
            case (fsm_state)
                FSM_STATE_EXP_INIT_4:   bit_cnt <= bit_cnt_zero;
                FSM_STATE_EXP_NEXT:     bit_cnt <= !bit_cnt_done ? bit_cnt_next : bit_cnt;
            endcase


        //
        // Flags
        //
    reg flag_update_r;

        /* capture the current exponent bit D[i] while entering EXP_CALC_3; */
        /* the word of D was addressed on entry to EXP_CALC_1 (see d_addr)  */
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_EXP_CALC_3)
            flag_update_r <= d_bram_out[bit_cnt[4:0]];


        //
        // Memory Address Control Logic
        //
        // Everything below is keyed on fsm_next_state, so each register
        // takes its new value on the same clock edge on which the FSM enters
        // the named state.  In the copy sequences the read address is reset
        // in step *_1 and the write address in step *_3, i.e. the write
        // pointer trails the read pointer by two steps; read pointers stop
        // at the last word instead of wrapping.
        // NOTE(review): the two-step lag presumably covers the memory read
        // latency plus the *_data_in register - confirm against the BRAM
        // primitives.
        //
    always @(posedge clk) begin
        //
        // m_addr
        //
        case (fsm_next_state)
            FSM_STATE_MUL_INIT_1:       m_addr <= bram_addr_zero;
            FSM_STATE_MUL_INIT_2,
            FSM_STATE_MUL_INIT_3,
            FSM_STATE_MUL_INIT_4:       m_addr <= !m_addr_done ? m_addr_next : m_addr;
            //
            FSM_STATE_CRT_INIT_A_1:     m_addr <= bram_addr_zero;
            FSM_STATE_CRT_INIT_A_2,
            FSM_STATE_CRT_INIT_A_3,
            FSM_STATE_CRT_INIT_A_4:     m_addr <= !m_addr_done_crt ? m_addr_next : m_addr;
        endcase
        //
        // d_addr
        //
        // (upper bits of the bit counter select the 32-bit word of D)
        //
        case (fsm_next_state)
            FSM_STATE_EXP_CALC_1:       d_addr <= bit_cnt[OPERAND_ADDR_WIDTH+4:5];
        endcase
        //
        // f_addr
        //
        case (fsm_next_state)
            FSM_STATE_MUL_INIT_1:       f_addr <= bram_addr_zero;
            FSM_STATE_MUL_INIT_2,
            FSM_STATE_MUL_INIT_3,
            FSM_STATE_MUL_INIT_4:       f_addr <= !f_addr_done ? f_addr_next : f_addr;
            //
            FSM_STATE_CRT_INIT_B_1:     f_addr <= bram_addr_zero;
            FSM_STATE_CRT_INIT_B_2,
            FSM_STATE_CRT_INIT_B_3,
            FSM_STATE_CRT_INIT_B_4:     f_addr <= !f_addr_done ? f_addr_next : f_addr;
            //
        endcase
        //
        // r_addr
        //
        case (fsm_next_state)
            FSM_STATE_EXP_SAVE_3:       r_addr <= bram_addr_zero;
            FSM_STATE_EXP_SAVE_4:       r_addr <= r_addr_next;
        endcase
        //
        // p_addr_wr
        //
        case (fsm_next_state)
            //
            FSM_STATE_MUL_INIT_3:       p_addr_wr <= bram_addr_zero;
            FSM_STATE_MUL_INIT_4:       p_addr_wr <= p_addr_wr_next;
            //
            FSM_STATE_CRT_INIT_B_3:     p_addr_wr <= bram_addr_zero;
            FSM_STATE_CRT_INIT_B_4:     p_addr_wr <= p_addr_wr_next;
            //
            FSM_STATE_EXP_INIT_3:       p_addr_wr <= bram_addr_zero;
            FSM_STATE_EXP_INIT_4:       p_addr_wr <= p_addr_wr_next;
            //
            FSM_STATE_EXP_FILL_3:       p_addr_wr <= bram_addr_zero;
            FSM_STATE_EXP_FILL_4:       p_addr_wr <= p_addr_wr_next;
        endcase
        //
        // t0_addr
        //
        // (write pointer during EXP_LOAD, read pointer during EXP_FILL)
        //
        case (fsm_next_state)
            FSM_STATE_EXP_LOAD_3:       t0_addr <= bram_addr_zero;
            FSM_STATE_EXP_LOAD_4:       t0_addr <= t0_addr_next;
            //
            FSM_STATE_EXP_FILL_1:       t0_addr <= bram_addr_zero;
            FSM_STATE_EXP_FILL_2,
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       t0_addr <= !t0_addr_done ? t0_addr_next : t0_addr;
        endcase
        //
        // t1_addr
        //
        // (write pointer during EXP_INIT and EXP_FILL,
        //  read pointer during EXP_LOAD and EXP_SAVE)
        //
        case (fsm_next_state)
            FSM_STATE_EXP_INIT_3:       t1_addr <= bram_addr_zero;
            FSM_STATE_EXP_INIT_4:       t1_addr <= t1_addr_next;
            //
            FSM_STATE_EXP_LOAD_1:       t1_addr <= bram_addr_zero;
            FSM_STATE_EXP_LOAD_2,
            FSM_STATE_EXP_LOAD_3,
            FSM_STATE_EXP_LOAD_4:       t1_addr <= !t1_addr_done ? t1_addr_next : t1_addr;
            //
            FSM_STATE_EXP_FILL_3:       t1_addr <= bram_addr_zero;
            FSM_STATE_EXP_FILL_4:       t1_addr <= t1_addr_next;
            //
            FSM_STATE_EXP_SAVE_1:       t1_addr <= bram_addr_zero;
            FSM_STATE_EXP_SAVE_2,
            FSM_STATE_EXP_SAVE_3,
            FSM_STATE_EXP_SAVE_4:       t1_addr <= !t1_addr_done ? t1_addr_next : t1_addr;
        endcase
        //
        // t2_addr_wr
        //
        case (fsm_next_state)
            //
            FSM_STATE_MUL_INIT_3:       t2_addr_wr <= bram_addr_zero;
            FSM_STATE_MUL_INIT_4:       t2_addr_wr <= t2_addr_wr_next;
            //
            FSM_STATE_CRT_INIT_A_3:     t2_addr_wr <= bram_addr_zero;
            FSM_STATE_CRT_INIT_A_4:     t2_addr_wr <= t2_addr_wr_next;
            //
            FSM_STATE_CRT_INIT_B_3:     t2_addr_wr <= bram_addr_zero;
            FSM_STATE_CRT_INIT_B_4:     t2_addr_wr <= t2_addr_wr_next;
            //
            FSM_STATE_CRT_INIT_C_3:     t2_addr_wr <= bram_addr_zero;
            FSM_STATE_CRT_INIT_C_4:     t2_addr_wr <= t2_addr_wr_next;
            //
            FSM_STATE_EXP_INIT_3:       t2_addr_wr <= bram_addr_zero;
            FSM_STATE_EXP_INIT_4:       t2_addr_wr <= t2_addr_wr_next;
            //
            FSM_STATE_EXP_FILL_3:       t2_addr_wr <= bram_addr_zero;
            FSM_STATE_EXP_FILL_4:       t2_addr_wr <= t2_addr_wr_next;
        endcase
        //
        // pp_addr_rd
        //
        case (fsm_next_state)
            FSM_STATE_EXP_FILL_1:       pp_addr_rd <= bram_addr_zero;
            FSM_STATE_EXP_FILL_2,
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       pp_addr_rd <= !pp_addr_rd_done ? pp_addr_rd_next : pp_addr_rd;
        endcase
        //
        // tp_addr_rd
        //
        case (fsm_next_state)
            FSM_STATE_EXP_INIT_1:       tp_addr_rd <= bram_addr_zero;
            FSM_STATE_EXP_INIT_2,
            FSM_STATE_EXP_INIT_3,
            FSM_STATE_EXP_INIT_4:       tp_addr_rd <= !tp_addr_rd_done ? tp_addr_rd_next : tp_addr_rd;
            //
            FSM_STATE_CRT_INIT_B_1:     tp_addr_rd <= bram_addr_zero;
            FSM_STATE_CRT_INIT_B_2,
            FSM_STATE_CRT_INIT_B_3,
            FSM_STATE_CRT_INIT_B_4:     tp_addr_rd <= !tp_addr_rd_done ? tp_addr_rd_next : tp_addr_rd;
            //
            FSM_STATE_CRT_INIT_C_1:     tp_addr_rd <= bram_addr_zero;
            FSM_STATE_CRT_INIT_C_2,
            FSM_STATE_CRT_INIT_C_3,
            FSM_STATE_CRT_INIT_C_4:     tp_addr_rd <= !tp_addr_rd_done ? tp_addr_rd_next : tp_addr_rd;
            //
            FSM_STATE_EXP_FILL_1:       tp_addr_rd <= bram_addr_zero;
            FSM_STATE_EXP_FILL_2,
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       tp_addr_rd <= !tp_addr_rd_done ? tp_addr_rd_next : tp_addr_rd;
        endcase
        //
    end


        //
        // Memory Write Enable Logic
        //
        // Each write enable is high exactly in the steps in which the
        // matching write address is reset or advanced (*_3 and *_4).
        //
    always @(posedge clk) begin
        //
        // r_wren
        //
        case (fsm_next_state)
            FSM_STATE_EXP_SAVE_3,
            FSM_STATE_EXP_SAVE_4:       r_wren <= 1'b1;
            default:                    r_wren <= 1'b0;
        endcase
        //
        // p_wren
        //
        case (fsm_next_state)
            FSM_STATE_MUL_INIT_3,
            FSM_STATE_MUL_INIT_4,
            FSM_STATE_CRT_INIT_B_3,
            FSM_STATE_CRT_INIT_B_4,
            FSM_STATE_EXP_INIT_3,
            FSM_STATE_EXP_INIT_4,
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       p_wren <= 1'b1;
            default:                    p_wren <= 1'b0;
        endcase
        //
        // t0_wren
        //
        case (fsm_next_state)
            FSM_STATE_EXP_LOAD_3,
            FSM_STATE_EXP_LOAD_4:       t0_wren <= 1'b1;
            default:                    t0_wren <= 1'b0;
        endcase
        //
        // t1_wren
        //
        case (fsm_next_state)
            FSM_STATE_EXP_INIT_3,
            FSM_STATE_EXP_INIT_4,
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       t1_wren <= 1'b1;
            default:                    t1_wren <= 1'b0;
        endcase
        //
        // t2_wren
        //
        case (fsm_next_state)
            FSM_STATE_MUL_INIT_3,
            FSM_STATE_MUL_INIT_4,
            FSM_STATE_CRT_INIT_A_3,
            FSM_STATE_CRT_INIT_A_4,
            FSM_STATE_CRT_INIT_B_3,
            FSM_STATE_CRT_INIT_B_4,
            FSM_STATE_CRT_INIT_C_3,
            FSM_STATE_CRT_INIT_C_4,
            FSM_STATE_EXP_INIT_3,
            FSM_STATE_EXP_INIT_4,
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       t2_wren <= 1'b1;
            default:                    t2_wren <= 1'b0;
        endcase
        //
    end


        //
        // Memory Input Selector
        //
        // Chooses the source word for every memory written by this module.
        // Outside of the write steps the data registers are don't-care (X).
        //
    always @(posedge clk) begin
        //
        // r_data_in
        //
        case (fsm_next_state)
            FSM_STATE_EXP_SAVE_3,
            FSM_STATE_EXP_SAVE_4:       r_data_in <= t1_data_out;
            default:                    r_data_in <= 32'dX;
        endcase
        //
        // p_data_in
        //
        case (fsm_next_state)
            //
            FSM_STATE_MUL_INIT_3,
            FSM_STATE_MUL_INIT_4:       p_data_in <= f_bram_out;
            //
            FSM_STATE_CRT_INIT_B_3,
            FSM_STATE_CRT_INIT_B_4:     p_data_in <= f_bram_out;
            //
            FSM_STATE_EXP_INIT_3,
            FSM_STATE_EXP_INIT_4:       p_data_in <= tp_data_out;
            //
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       p_data_in <= pp_data_out;
            //
            default:                    p_data_in <= 32'dX;
        endcase
        //
        // t0_data_in
        //
        case (fsm_next_state)
            FSM_STATE_EXP_LOAD_3,
            FSM_STATE_EXP_LOAD_4:       t0_data_in <= t1_data_out;
            default:                    t0_data_in <= 32'dX;
        endcase
        //
        // t1_data_in
        //
        // (EXP_INIT writes 1 into the first word and 0 into all following
        //  words, i.e. T1 is set to the multi-word value 1)
        //
        case (fsm_next_state)
            FSM_STATE_EXP_INIT_3:       t1_data_in <= 32'd1;
            FSM_STATE_EXP_INIT_4:       t1_data_in <= 32'd0;
            //
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       t1_data_in <= flag_update_r ? tp_data_out : t0_data_out;
            default:                    t1_data_in <= 32'dX;
        endcase
        //
        // t2_data_in
        //
        case (fsm_next_state)
            //
            FSM_STATE_MUL_INIT_3,
            FSM_STATE_MUL_INIT_4:       t2_data_in <= m_bram_out;
            //
            FSM_STATE_CRT_INIT_A_3,
            FSM_STATE_CRT_INIT_A_4:     t2_data_in <= m_bram_out;
            //
            FSM_STATE_CRT_INIT_B_3,
            FSM_STATE_CRT_INIT_B_4:     t2_data_in <= tp_data_out;
            //
            FSM_STATE_CRT_INIT_C_3,
            FSM_STATE_CRT_INIT_C_4:     t2_data_in <= tp_data_out;
            //
            FSM_STATE_EXP_INIT_3:       t2_data_in <= 32'd1;
            FSM_STATE_EXP_INIT_4:       t2_data_in <= 32'd0;
            //
            FSM_STATE_EXP_FILL_3,
            FSM_STATE_EXP_FILL_4:       t2_data_in <= flag_update_r ? tp_data_out : t0_data_out;
            default:                    t2_data_in <= 32'dX;
        endcase
        //
    end


        //
        // Double Multiplier
        //
        // mul_pp computes PP = P1 * P2, mul_tp computes TP = T2 * P3.  Both
        // share the same enable; only mul_tp can be switched into
        // "reduce only" mode (via mul_crt).
        //
    reg  mul_ena;
    reg  mul_crt;

    wire mul_rdy_pp;
    wire mul_rdy_tp;
    wire mul_rdy_all = mul_rdy_pp & mul_rdy_tp;

    modexpa7_systolic_multiplier #
    (
        .OPERAND_ADDR_WIDTH     (OPERAND_ADDR_WIDTH),
        .SYSTOLIC_ARRAY_POWER   (SYSTOLIC_ARRAY_POWER)
    )
    mul_pp
    (
        .clk                    (clk),
        .rst_n                  (rst_n),

        .ena                    (mul_ena),
        .rdy                    (mul_rdy_pp),

        .reduce_only            (1'b0),

        .a_bram_addr            (p1_addr_rd),
        .b_bram_addr            (p2_addr_rd),
        .n_bram_addr            (n1_bram_addr),
        .n_coeff_bram_addr      (n_coeff1_bram_addr),
        .r_bram_addr            (pp_addr_wr),

        .a_bram_out             (p1_data_out),
        .b_bram_out             (p2_data_out),
        .n_bram_out             (n1_bram_out),
        .n_coeff_bram_out       (n_coeff1_bram_out),

        .r_bram_in              (pp_data_in),
        .r_bram_wr              (pp_wren),

        .n_num_words            (m_num_words_latch)
    );

    modexpa7_systolic_multiplier #
    (
        .OPERAND_ADDR_WIDTH     (OPERAND_ADDR_WIDTH),
        .SYSTOLIC_ARRAY_POWER   (SYSTOLIC_ARRAY_POWER)
    )
    mul_tp
    (
        .clk                    (clk),
        .rst_n                  (rst_n),

        .ena                    (mul_ena),
        .rdy                    (mul_rdy_tp),

        .reduce_only            (mul_crt),

        .a_bram_addr            (t2_addr_rd),
        .b_bram_addr            (p3_addr_rd),
        .n_bram_addr            (n2_bram_addr),
        .n_coeff_bram_addr      (n_coeff2_bram_addr),
        .r_bram_addr            (tp_addr_wr),

        .a_bram_out             (t2_data_out),
        .b_bram_out             (p3_data_out),
        .n_bram_out             (n2_bram_out),
        .n_coeff_bram_out       (n_coeff2_bram_out),

        .r_bram_in              (tp_data_in),
        .r_bram_wr              (tp_wren),

        .n_num_words            (m_num_words_latch)
    );

        /* multiplier enable: high only while the FSM is in one of the *_CALC_1 states */
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_MUL_CALC_1,
            FSM_STATE_CRT_CALC_A_1,
            FSM_STATE_CRT_CALC_B_1,
            FSM_STATE_CRT_CALC_C_1,
            FSM_STATE_EXP_CALC_1:       mul_ena <= 1'b1;
            default:                    mul_ena <= 1'b0;
        endcase

        /* "reduce only" request: raised together with mul_ena for CRT_CALC_A only */
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_CRT_CALC_A_1:     mul_crt <= 1'b1;
            default:                    mul_crt <= 1'b0;
        endcase


        //
        // FSM Process
        //
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  fsm_state <= FSM_STATE_EXP_IDLE;
        else                fsm_state <= fsm_next_state;


        //
        // FSM Transition Logic
        //
        // *_INIT_4, *_LOAD_4, *_FILL_4 and *_SAVE_4 repeat until the
        // relevant write pointer reaches the last word; *_CALC_2 repeats
        // until the multiplier(s) report ready.  Any state value not listed
        // in the case statement falls through to EXP_STOP (the default
        // assignment), which then returns to EXP_IDLE.
        //
        // NOTE(review): mul_ena starts both multipliers in every *_CALC_1
        // state, but MUL_CALC_2 and CRT_CALC_{A,B,C}_2 wait for mul_rdy_tp
        // only; only EXP_CALC_2 waits for mul_rdy_all.  Confirm that mul_pp
        // can never still be busy when the next *_CALC_1 state is entered.
        //
    always @* begin
        //
        fsm_next_state = FSM_STATE_EXP_STOP;
        //
        case (fsm_state)
            //
            //
            FSM_STATE_MUL_INIT_1:       fsm_next_state = FSM_STATE_MUL_INIT_2;
            FSM_STATE_MUL_INIT_2:       fsm_next_state = FSM_STATE_MUL_INIT_3;
            FSM_STATE_MUL_INIT_3:       fsm_next_state = FSM_STATE_MUL_INIT_4;
            FSM_STATE_MUL_INIT_4:       if (t2_addr_wr_done)        fsm_next_state = FSM_STATE_MUL_CALC_1;
                                        else                        fsm_next_state = FSM_STATE_MUL_INIT_4;
            //
            FSM_STATE_MUL_CALC_1:       fsm_next_state = FSM_STATE_MUL_CALC_2;
            FSM_STATE_MUL_CALC_2:       if (mul_rdy_tp)             fsm_next_state = FSM_STATE_MUL_CALC_3;
                                        else                        fsm_next_state = FSM_STATE_MUL_CALC_2;
            FSM_STATE_MUL_CALC_3:       fsm_next_state = FSM_STATE_EXP_INIT_1;
            //
            //
            FSM_STATE_CRT_INIT_A_1:     fsm_next_state = FSM_STATE_CRT_INIT_A_2;
            FSM_STATE_CRT_INIT_A_2:     fsm_next_state = FSM_STATE_CRT_INIT_A_3;
            FSM_STATE_CRT_INIT_A_3:     fsm_next_state = FSM_STATE_CRT_INIT_A_4;
            FSM_STATE_CRT_INIT_A_4:     if (t2_addr_wr_done_crt)    fsm_next_state = FSM_STATE_CRT_CALC_A_1;
                                        else                        fsm_next_state = FSM_STATE_CRT_INIT_A_4;
            //
            FSM_STATE_CRT_CALC_A_1:     fsm_next_state = FSM_STATE_CRT_CALC_A_2;
            FSM_STATE_CRT_CALC_A_2:     if (mul_rdy_tp)             fsm_next_state = FSM_STATE_CRT_CALC_A_3;
                                        else                        fsm_next_state = FSM_STATE_CRT_CALC_A_2;
            FSM_STATE_CRT_CALC_A_3:     fsm_next_state = FSM_STATE_CRT_INIT_B_1;
            //
            FSM_STATE_CRT_INIT_B_1:     fsm_next_state = FSM_STATE_CRT_INIT_B_2;
            FSM_STATE_CRT_INIT_B_2:     fsm_next_state = FSM_STATE_CRT_INIT_B_3;
            FSM_STATE_CRT_INIT_B_3:     fsm_next_state = FSM_STATE_CRT_INIT_B_4;
            FSM_STATE_CRT_INIT_B_4:     if (t2_addr_wr_done)        fsm_next_state = FSM_STATE_CRT_CALC_B_1;
                                        else                        fsm_next_state = FSM_STATE_CRT_INIT_B_4;
            //
            FSM_STATE_CRT_CALC_B_1:     fsm_next_state = FSM_STATE_CRT_CALC_B_2;
            FSM_STATE_CRT_CALC_B_2:     if (mul_rdy_tp)             fsm_next_state = FSM_STATE_CRT_CALC_B_3;
                                        else                        fsm_next_state = FSM_STATE_CRT_CALC_B_2;
            FSM_STATE_CRT_CALC_B_3:     fsm_next_state = FSM_STATE_CRT_INIT_C_1;
            //
            FSM_STATE_CRT_INIT_C_1:     fsm_next_state = FSM_STATE_CRT_INIT_C_2;
            FSM_STATE_CRT_INIT_C_2:     fsm_next_state = FSM_STATE_CRT_INIT_C_3;
            FSM_STATE_CRT_INIT_C_3:     fsm_next_state = FSM_STATE_CRT_INIT_C_4;
            FSM_STATE_CRT_INIT_C_4:     if (t2_addr_wr_done)        fsm_next_state = FSM_STATE_CRT_CALC_C_1;
                                        else                        fsm_next_state = FSM_STATE_CRT_INIT_C_4;
            //
            FSM_STATE_CRT_CALC_C_1:     fsm_next_state = FSM_STATE_CRT_CALC_C_2;
            FSM_STATE_CRT_CALC_C_2:     if (mul_rdy_tp)             fsm_next_state = FSM_STATE_CRT_CALC_C_3;
                                        else                        fsm_next_state = FSM_STATE_CRT_CALC_C_2;
            FSM_STATE_CRT_CALC_C_3:     fsm_next_state = FSM_STATE_EXP_INIT_1;
            //
            //
            FSM_STATE_EXP_IDLE:         if (ena_trig)               fsm_next_state = crt ? FSM_STATE_CRT_INIT_A_1 : FSM_STATE_MUL_INIT_1;
                                        else                        fsm_next_state = FSM_STATE_EXP_IDLE;
            //
            //
            FSM_STATE_EXP_INIT_1:       fsm_next_state = FSM_STATE_EXP_INIT_2;
            FSM_STATE_EXP_INIT_2:       fsm_next_state = FSM_STATE_EXP_INIT_3;
            FSM_STATE_EXP_INIT_3:       fsm_next_state = FSM_STATE_EXP_INIT_4;
            FSM_STATE_EXP_INIT_4:       if (t1_addr_done)           fsm_next_state = FSM_STATE_EXP_LOAD_1;
                                        else                        fsm_next_state = FSM_STATE_EXP_INIT_4;
            //
            FSM_STATE_EXP_LOAD_1:       fsm_next_state = FSM_STATE_EXP_LOAD_2;
            FSM_STATE_EXP_LOAD_2:       fsm_next_state = FSM_STATE_EXP_LOAD_3;
            FSM_STATE_EXP_LOAD_3:       fsm_next_state = FSM_STATE_EXP_LOAD_4;
            FSM_STATE_EXP_LOAD_4:       if (t0_addr_done)           fsm_next_state = FSM_STATE_EXP_CALC_1;
                                        else                        fsm_next_state = FSM_STATE_EXP_LOAD_4;
            //
            FSM_STATE_EXP_CALC_1:       fsm_next_state = FSM_STATE_EXP_CALC_2;
            FSM_STATE_EXP_CALC_2:       if (mul_rdy_all)            fsm_next_state = FSM_STATE_EXP_CALC_3;
                                        else                        fsm_next_state = FSM_STATE_EXP_CALC_2;
            FSM_STATE_EXP_CALC_3:       fsm_next_state = FSM_STATE_EXP_FILL_1;
            //
            FSM_STATE_EXP_FILL_1:       fsm_next_state = FSM_STATE_EXP_FILL_2;
            FSM_STATE_EXP_FILL_2:       fsm_next_state = FSM_STATE_EXP_FILL_3;
            FSM_STATE_EXP_FILL_3:       fsm_next_state = FSM_STATE_EXP_FILL_4;
            FSM_STATE_EXP_FILL_4:       if (p_addr_wr_done)         fsm_next_state = FSM_STATE_EXP_NEXT;
                                        else                        fsm_next_state = FSM_STATE_EXP_FILL_4;
            //
            FSM_STATE_EXP_NEXT:         if (bit_cnt_done)           fsm_next_state = FSM_STATE_EXP_SAVE_1;
                                        else                        fsm_next_state = FSM_STATE_EXP_LOAD_1;
            //
            FSM_STATE_EXP_SAVE_1:       fsm_next_state = FSM_STATE_EXP_SAVE_2;
            FSM_STATE_EXP_SAVE_2:       fsm_next_state = FSM_STATE_EXP_SAVE_3;
            FSM_STATE_EXP_SAVE_3:       fsm_next_state = FSM_STATE_EXP_SAVE_4;
            FSM_STATE_EXP_SAVE_4:       if (r_addr_done)            fsm_next_state = FSM_STATE_EXP_STOP;
                                        else                        fsm_next_state = FSM_STATE_EXP_SAVE_4;
            //
            FSM_STATE_EXP_STOP:         fsm_next_state = FSM_STATE_EXP_IDLE;
            //
        endcase
        //
    end


endmodule