//======================================================================
//
// modexpa7_systolic_multiplier.v
// -----------------------------------------------------------------------------
// Systolic Montgomery multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2017, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Module overview (as sequenced by the FSM at the bottom of this file):
//
//   1. LOAD_B, LOAD_N_COEFF, LOAD_N
//        Operand words are shifted into the loader; the loader has one
//        bank per operand (see LOADER_ADDR_MSB_*).
//   2. MULT_A_B
//        Words taken from pe_p[0] are written into bram_ab.
//   3. MULT_AB_N_COEFF
//        Words taken from pe_p[0] are written into bram_q.
//   4. MULT_Q_N
//        Words taken from pe_p[0] are written into bram_qn; during the same
//        phase the adder forms S = AB + QN (into bram_s) and the subtractor
//        forms SN = S - N (into bram_sn).
//   5. SAVE
//        Either S or SN, chosen by flag_select_s, is driven onto
//        r_bram_in / r_bram_wr.
//
// NOTE(review): this copy of the file appears to have lost text in several
// places (every "for (...; i <" / "for (...; j <" loop header runs straight
// into an unrelated "> ..." expression).  The affected spots are marked with
// NOTE(review) comments below; the code tokens are reproduced exactly as
// found and must be restored from a good copy before the file can be built.
//

module modexpa7_systolic_multiplier #
(
    //
    // This sets the address widths of memory buffers. Internal data
    // width is 32 bits, so for e.g. 2048-bit operands buffers must store
    // 2048 / 32 = 64 words, and these need 6-bit address bus, because
    // 2 ** 6 = 64.
    //
    parameter OPERAND_ADDR_WIDTH = 4,

    //
    // Used below as the width of mult_cnt (which counts from 0 to all-ones)
    // and as the lowest bit index when ab_num_words_latch is sliced to get
    // the last systolic cycle index (syst_cnt_last).
    // Presumably log2 of the number of processing elements; the related
    // SYSTOLIC_ARRAY_LENGTH and SYSTOLIC_CNTR_WIDTH are not defined in this
    // file (they may come from the included settings) -- TODO confirm.
    //
    parameter SYSTOLIC_ARRAY_POWER = 2
)
(
    input                           clk,    // all registers update on the rising edge
    input                           rst_n,  // asynchronous, active-low reset of fsm_state and rdy_reg

    input                           ena,    // a 0->1 transition seen while the FSM is idle starts an operation
    output                          rdy,    // low while an operation is in progress (see "Ready Flag Logic")

    // addresses into the external operand/result memories
    // (NOTE(review): the logic driving these is not visible in this copy)
    output [OPERAND_ADDR_WIDTH-1:0] a_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] b_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] n_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr,
    output [OPERAND_ADDR_WIDTH-1:0] r_bram_addr,

    // 32-bit read data from the external operand memories
    input  [                32-1:0] a_bram_out,
    input  [                32-1:0] b_bram_out,
    input  [                32-1:0] n_bram_out,
    input  [                32-1:0] n_coeff_bram_out,

    // result write port: data and write strobe (mapped from r_data_in / r_wren)
    output [                32-1:0] r_bram_in,
    output                          r_bram_wr,

    // operand size; sampled into ab_num_words_latch when an operation starts
    input  [OPERAND_ADDR_WIDTH-1:0] ab_num_words
);


    //
    // Include Settings
    //
    `include "pe/modexpa7_primitive_switch.v"
    `include "modexpa7_settings.v"


    //
    // FSM Declaration
    //
    // Encoding: the high nibble identifies the phase (1 = load B, 2 = load
    // N_COEFF, 3 = load N, 4 = A*B, 5 = AB*N_COEFF, 6 = Q*N, 7 = save), the
    // low nibble the step inside that phase.
    //
    localparam [ 7: 0] FSM_STATE_IDLE                   = 8'h00;

    localparam [ 7: 0] FSM_STATE_LOAD_B_START           = 8'h11;
    localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT           = 8'h12;
    localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE           = 8'h13;
    localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL           = 8'h14;

    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START     = 8'h21;
    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_SHIFT     = 8'h22;
    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE     = 8'h23;
    localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL     = 8'h24;

    localparam [ 7: 0] FSM_STATE_LOAD_N_START           = 8'h31;
    localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT           = 8'h32;
    localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE           = 8'h33;
    localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL           = 8'h34;

    localparam [ 7: 0] FSM_STATE_MULT_A_B_START         = 8'h41;
    localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH        = 8'h42;
    localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD        = 8'h43;
    localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL         = 8'h44;

    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START  = 8'h51;
    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52;
    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53;
    localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL  = 8'h54;

    localparam [ 7: 0] FSM_STATE_MULT_Q_N_START         = 8'h61;
    localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH        = 8'h62;
    localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD        = 8'h63;
    localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL         = 8'h64;

    localparam [ 7: 0] FSM_STATE_SAVE_START             = 8'h71;
    localparam [ 7: 0] FSM_STATE_SAVE_WRITE             = 8'h72;
    localparam [ 7: 0] FSM_STATE_SAVE_FINAL             = 8'h73;

    localparam [ 7: 0] FSM_STATE_STOP                   = 8'hFF;


    //
    // FSM State / Next State
    //
    // fsm_state is the registered state; fsm_next_state is computed
    // combinationally in "FSM Transition Logic" at the end of the module.
    //
    reg [ 7: 0] fsm_state = FSM_STATE_IDLE;
    reg [ 7: 0] fsm_next_state;


    //
    // Enable Delay and Trigger
    //
    reg ena_dly = 1'b0;

    /* delay enable by one clock cycle */
    /* (this register relies on its initializer only; rst_n does not touch it) */
    always @(posedge clk) ena_dly <= ena;

    /* trigger new operation when enable goes high */
    wire ena_trig = ena && !ena_dly;


    //
    // Ready Flag Logic
    //
    reg rdy_reg = 1'b1;
    assign rdy = rdy_reg;

    always @(posedge clk or negedge rst_n)

        /* reset flag */
        if (rst_n == 1'b0) rdy_reg <= 1'b1;
        else begin

            /* clear flag when operation is started */
            if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig;

            /* set flag after operation is finished */
            if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1;

        end


    //
    // Parameters Latch
    //
    reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch;

    /* save number of words in a and b when new operation starts */
    /* (sampled on the edge that moves the FSM into LOAD_B_START, so later */
    /*  changes of ab_num_words do not affect the running operation)       */
    always @(posedge clk)
        //
        if (fsm_next_state == FSM_STATE_LOAD_B_START)
            ab_num_words_latch <= ab_num_words;


    //
    // Systolic Cycle Counters
    //
    // Three counters share the same range [0, syst_cnt_last], where
    // syst_cnt_last is ab_num_words_latch with its SYSTOLIC_ARRAY_POWER
    // low bits dropped:
    //   syst_cnt_init   - advanced in the LOAD_*_WRITE states,
    //   syst_cnt_load   - advanced while entering a *_CRUNCH state,
    //   syst_cnt_unload - advanced while shreg_now_unloading is set.
    //

    /* handy values */
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}};
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER];

    /* counters */
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init;
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load;
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload;

    /* handy increment values */
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next   = syst_cnt_init   + 1'b1;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next   = syst_cnt_load   + 1'b1;
    wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1;

    /* handy stop flags */
    wire syst_cnt_init_done   = (syst_cnt_init   == syst_cnt_last) ? 1'b1 : 1'b0;
    wire syst_cnt_load_done   = (syst_cnt_load   == syst_cnt_last) ? 1'b1 : 1'b0;
    wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0;

    /* delayed load counter */
    /* (one-cycle-old copy of syst_cnt_load; indexes pe_t_mem / pe_c_out_mem below) */
    reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly;
    always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load;


    //
    // Multiplier Iteration Counter
    //
    // SYSTOLIC_ARRAY_POWER bits wide, so it runs from 0 to all-ones.
    //

    /* handy values */
    wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}};
    wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}};

    /* counter */
    reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt;

    /* handy increment value and stop flag */
    wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1;
    wire                            mult_cnt_done = (mult_cnt == mult_cnt_last) ? 1'b1 : 1'b0;


    //
    // Initialization Counter Control Logic
    //
    // Both counters are cleared in every LOAD_*_START state.  mult_cnt then
    // increments on every LOAD_*_SHIFT cycle (wrapping naturally), while
    // syst_cnt_init increments once per LOAD_*_WRITE cycle and saturates at
    // syst_cnt_last.
    //
    always @(posedge clk) begin
        //
        case (fsm_state)
            FSM_STATE_LOAD_B_START,
            FSM_STATE_LOAD_N_COEFF_START,
            FSM_STATE_LOAD_N_START:         mult_cnt <= mult_cnt_zero;
            FSM_STATE_LOAD_B_SHIFT,
            FSM_STATE_LOAD_N_COEFF_SHIFT,
            FSM_STATE_LOAD_N_SHIFT:         mult_cnt <= mult_cnt_next;
        endcase
        //
        case (fsm_state)
            FSM_STATE_LOAD_B_START,
            FSM_STATE_LOAD_N_COEFF_START,
            FSM_STATE_LOAD_N_START:         syst_cnt_init <= syst_cnt_zero;
            FSM_STATE_LOAD_B_WRITE,
            FSM_STATE_LOAD_N_COEFF_WRITE,
            FSM_STATE_LOAD_N_WRITE:         syst_cnt_init <= !syst_cnt_init_done ? syst_cnt_init_next : syst_cnt_init;
        endcase
        //
    end


    //
    // Operand Loader
    //

    /*
     * TODO (from the original author): explain how the parallelized loader
     * works.  What is visible here: there is one address/write-enable/data
     * set per array position (0 .. SYSTOLIC_ARRAY_LENGTH-1); the address is
     * split into a 2-bit bank select (loader_addr_msb, one of the
     * LOADER_ADDR_MSB_* values) and a SYSTOLIC_CNTR_WIDTH-bit word index
     * (loader_addr_lsb).
     *
     */

    /* loader banks */
    localparam [ 1: 0] LOADER_ADDR_MSB_B       = 2'd0;
    localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1;
    localparam [ 1: 0] LOADER_ADDR_MSB_N       = 2'd2;

    /* loader input */
    reg [                  2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1];
    reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1];
    reg                           loader_wren    [0:SYSTOLIC_ARRAY_LENGTH-1];
    reg [                 32-1:0] loader_din     [0:SYSTOLIC_ARRAY_LENGTH-1];

    /* loader output */
    wire [                32-1:0] loader_dout    [0:SYSTOLIC_ARRAY_LENGTH-1];

    /* generate parallelized loader */

    //
    // Loader currently stores B, N_COEFF and N, it can be coded another way
    // to initially store B, then AB, then Q. Some memory can be saved that way.
    // Maybe later...
    //
    genvar i;

    // NOTE(review): text appears to be missing from this copy between
    // "for (i=0; i" and "{1'b0, bram_addr_last})".  The body of the generate
    // loop, the declarations of the bram_addr_* / *_addr / *_addr_next /
    // *_addr_done signals used throughout the rest of the file, and the
    // opening of the always block that the statements below belong to are
    // not present.  The surviving tokens are kept verbatim.
    generate for (i=0; i {1'b0, bram_addr_last}) n_addr <= n_addr_next;
        endcase
        //
        /* result address: cleared in SAVE_START, advanced in SAVE_WRITE */
        case (fsm_state)
            FSM_STATE_SAVE_START:       r_addr <= bram_addr_zero;
            FSM_STATE_SAVE_WRITE:       r_addr <= r_addr_next;
        endcase
        //
        /* A address: cleared when A*B starts, advanced (saturating) on each reload */
        case (fsm_next_state)
            FSM_STATE_MULT_A_B_START:   a_addr <= bram_addr_zero;
            FSM_STATE_MULT_A_B_RELOAD:  a_addr <= !a_addr_done ? a_addr_next : a_addr;
        endcase
        //
    end


    //
    // Internal Memories
    //
    // Five single-port block memories hold intermediate values.  bram_ab and
    // bram_qn get one extra address bit (OPERAND_ADDR_WIDTH+1) compared with
    // bram_q, bram_s and bram_sn.
    //

    /* memory inputs */
    reg  [31: 0] ab_data_in;
    reg  [31: 0] q_data_in;
    reg  [31: 0] qn_data_in;
    wire [31: 0] s_data_in;     // driven by the adder output (add1_s)
    wire [31: 0] sn_data_in;    // driven by the subtractor output (sub1_d)
    reg  [31: 0] r_data_in;

    /* memory outputs */
    wire [31: 0] ab_data_out;
    wire [31: 0] q_data_out;
    wire [31: 0] qn_data_out;
    wire [31: 0] s_data_out;
    wire [31: 0] sn_data_out;

    /* write enables */
    reg ab_wren;
    reg q_wren;
    reg qn_wren;
    reg s_wren;
    reg sn_wren;
    reg r_wren;

    /* map */
    /* (the result "memory" is external: only its data and write strobe leave the module here) */
    assign r_bram_in = r_data_in;
    assign r_bram_wr = r_wren;

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
    bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1))
    bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_s (.clk(clk), .a_addr(s_addr), .a_wr(s_wren), .a_in(s_data_in), .a_out(s_data_out));

    bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH))
    bram_sn (.clk(clk), .a_addr(sn_addr), .a_wr(sn_wren), .a_in(sn_data_in), .a_out(sn_data_out));


    //
    // Wide Operand Loader
    //
    integer j;

    /* shift logic */
    always @(posedge clk)
        //
        case (fsm_state)
            //
            FSM_STATE_LOAD_B_SHIFT: begin

                /* update the rightmost part of loader buffer */
                /* (zero is shifted in once b_addr_dly has passed bram_addr_last) */
                loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}};

                /* shift the loader buffer to the left */
                // NOTE(review): text appears to be missing from this copy
                // between "for (j=1; j" and "{1'b0, bram_addr_last})".  The
                // rest of the shift loop, the N_COEFF / N shift cases, the
                // loader write logic, and the head of the always block that
                // updates s_addr / sn_addr / ab_addr_ext / q_addr are not
                // present.  The surviving tokens are kept verbatim.
                for (j=1; j {1'b0, bram_addr_last}) && (qn_addr_ext < bram_addr_ext_last)) begin
                    s_addr  <= s_addr_next;
                    sn_addr <= sn_addr_next;
                end
                /* wrap both addresses back to zero once QN reaches its last extended address */
                if (qn_addr_ext == bram_addr_ext_last) begin
                    s_addr  <= bram_addr_zero;
                    sn_addr <= bram_addr_zero;
                end
            end
            /* read-back for the save phase: advance (saturating) through S and SN */
            FSM_STATE_MULT_Q_N_FINAL,
            FSM_STATE_SAVE_START,
            FSM_STATE_SAVE_WRITE: begin
                s_addr  <= !s_addr_done  ? s_addr_next  : s_addr;
                sn_addr <= !sn_addr_done ? sn_addr_next : sn_addr;
            end
        endcase
        //
        /* AB address: cleared when AB*N_COEFF starts, advanced on each reload */
        case (fsm_next_state)
            FSM_STATE_MULT_AB_N_COEFF_START:    ab_addr_ext <= bram_addr_ext_zero;
            FSM_STATE_MULT_AB_N_COEFF_RELOAD:   ab_addr_ext <= ab_addr_ext_next;
        endcase
        //
        /* Q address: cleared when Q*N starts, advanced (saturating) on each reload */
        case (fsm_next_state)
            FSM_STATE_MULT_Q_N_START:   q_addr <= bram_addr_zero;
            FSM_STATE_MULT_Q_N_RELOAD:  q_addr <= !q_addr_done ? q_addr_next : q_addr;
        endcase
        //
    end


    //
    // Internal memory write control
    //
    // In each CRUNCH state the word on pe_p[0] is captured into the matching
    // memory (AB, Q or QN) on cycles where shreg_done_latency_dly is set.
    // Outside those cycles the write enable is low and the data register is
    // deliberately assigned X (don't care).
    //
    always @(posedge clk) begin
        //
        if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin
            ab_wren     <= shreg_done_latency_dly;
            ab_data_in  <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
        end else begin
            ab_wren     <= 1'b0;
            ab_data_in  <= 32'hXXXXXXXX;
        end
        //
        if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin
            q_wren      <= shreg_done_latency_dly;
            q_data_in   <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
        end else begin
            q_wren      <= 1'b0;
            q_data_in   <= 32'hXXXXXXXX;
        end
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin
            qn_wren     <= shreg_done_latency_dly;
            qn_data_in  <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX;
        end else begin
            qn_wren     <= 1'b0;
            qn_data_in  <= 32'hXXXXXXXX;
        end
        //
        /* result write strobe: asserted in SAVE_START, then in SAVE_WRITE until r_addr_done */
        case (fsm_state)
            FSM_STATE_SAVE_START:   r_wren <= 1'b1;
            FSM_STATE_SAVE_WRITE:   r_wren <= ~r_addr_done;
            default:                r_wren <= 1'b0;
        endcase
        //
    end


    //
    // Load counter: cleared on entry to any START or RELOAD state, then
    // advanced (saturating at syst_cnt_last) on every cycle whose next state
    // is a CRUNCH state.
    //
    always @(posedge clk)
        //
        case (fsm_next_state)
            FSM_STATE_MULT_A_B_START,
            FSM_STATE_MULT_AB_N_COEFF_START,
            FSM_STATE_MULT_Q_N_START,
            FSM_STATE_MULT_A_B_RELOAD,
            FSM_STATE_MULT_AB_N_COEFF_RELOAD,
            FSM_STATE_MULT_Q_N_RELOAD:
                //
                syst_cnt_load <= syst_cnt_zero;
            FSM_STATE_MULT_A_B_CRUNCH,
            FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
            FSM_STATE_MULT_Q_N_CRUNCH:
                //
                syst_cnt_load <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_load;
        endcase


    //
    // Unload counter: cleared when shreg_done_latency is set, otherwise
    // advanced (saturating) while shreg_now_unloading is set.  Only active in
    // the CRUNCH states.
    //
    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_MULT_A_B_CRUNCH,
            FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
            FSM_STATE_MULT_Q_N_CRUNCH: begin
                if (shreg_done_latency)         syst_cnt_unload <= syst_cnt_zero;
                else if (shreg_now_unloading)   syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload;
            end
        endcase


    //
    // Capture of PE outputs into pe_t_mem while unloading.
    //
    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_MULT_A_B_CRUNCH,
            FSM_STATE_MULT_AB_N_COEFF_CRUNCH,
            FSM_STATE_MULT_Q_N_CRUNCH: begin
                if (shreg_now_unloading)
                    // NOTE(review): text appears to be missing from this copy
                    // between "for (j=0; j" and "syst_cnt_zero)".  The loop
                    // body and the condition that opens the if/else below are
                    // not present.  The surviving tokens are kept verbatim.
                    for (j=0; j syst_cnt_zero)
                        pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0];
                    else
                        pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0;
                end
            end
        endcase


    //
    // T and C_IN can be moved to a separate code block
    //
    // PE input multiplexing.  For the first operand word (a_addr / q_addr
    // equal to zero) T and C_IN are forced to zero; otherwise they are read
    // back from pe_t_mem / pe_c_out_mem at the delayed load index.
    //
    // NOTE(review): only A*B and AB*N_COEFF branches are visible here; no
    // branch for FSM_STATE_MULT_Q_N_CRUNCH is present in this copy -- confirm
    // whether that is intentional or part of the missing text.
    //
    always @(posedge clk) begin
        //
        if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH)
            //
            // NOTE(review): text appears to be missing from this copy between
            // "for (j=0; j" and "{1'b0, a_addr})" (loop header remainder, the
            // guarding if, and the start of the pe_a[j] assignment).  The
            // surviving tokens are kept verbatim.
            for (j=0; j {1'b0, a_addr}) ? 32'd0 : a_bram_out;
                pe_b[j]     <= loader_dout[j];
                pe_t[j]     <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
                pe_c_in[j]  <= (a_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
            end else begin
                pe_a[j]     <= 32'hXXXXXXXX;
                pe_b[j]     <= 32'hXXXXXXXX;
                pe_t[j]     <= 32'hXXXXXXXX;
                pe_c_in[j]  <= 32'hXXXXXXXX;
            end
        //
        if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH)
            //
            // NOTE(review): same kind of gap as above, between "for (j=0; j"
            // and "{1'b0, q_addr})".  The surviving tokens are kept verbatim.
            for (j=0; j {1'b0, q_addr}) ? 32'd0 : q_data_out;
                pe_b[j]     <= loader_dout[j];
                pe_t[j]     <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j];
                pe_c_in[j]  <= (q_addr == bram_addr_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j];
            end else begin
                pe_a[j]     <= 32'hXXXXXXXX;
                pe_b[j]     <= 32'hXXXXXXXX;
                pe_t[j]     <= 32'hXXXXXXXX;
                pe_c_in[j]  <= 32'hXXXXXXXX;
            end
        //
        //
    end


    //
    // Adder
    //

    /*
     * This adder is used to calculate S = AB + QN.
     *
     * It is a registered 32-bit adder with a 1-bit carry register; the carry
     * is fed back into the next word unless add1_c_in_mask is set.  It only
     * updates on cycles where add1_ce is high.
     *
     */
    reg             add1_ce;            // clock enable
    reg  [31: 0]    add1_s;             // sum output
    wire            add1_c_in;          // carry input
    wire [31: 0]    add1_a;             // A-input
    reg  [31: 0]    add1_b;             // B-input
    reg             add1_c_in_mask;     // flag to not carry anything into the very first word
    reg             add1_c_out;         // carry output

    /* add masking into carry feedback chain */
    assign add1_c_in = add1_c_out & ~add1_c_in_mask;

    /* mask carry for the very first word of N */
    /* NOTE(review): the disabled line below refers to FSM_STATE_INIT_2, which  */
    /* is not among the states declared in this file; it looks like a leftover. */
    //always @(posedge clk) add1_c_in_mask <= (fsm_next_state == FSM_STATE_INIT_2) ? 1'b1 : 1'b0;

    /* 33-bit add: {carry out, sum} = A + B + carry in */
    always @(posedge clk)
        //
        if (add1_ce)
            //
            {add1_c_out, add1_s} <= {{1{1'b0}}, add1_a} + {{1{1'b0}}, add1_b} + {{32{1'b0}}, add1_c_in};

    /* A-input is the QN word currently registered for writing into bram_qn */
    assign add1_a = qn_data_in;

    /* B-input is the matching AB word read from bram_ab */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
            add1_b <= shreg_done_latency_dly ? ab_data_out : 32'hXXXXXXXX;
        else
            add1_b <= 32'hXXXXXXXX;

    /* carry-in is suppressed for the word at extended AB address zero */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
            add1_c_in_mask <= (shreg_done_latency_dly && (ab_addr_ext == bram_addr_ext_zero)) ? 1'b1 : 1'b0;
        else
            add1_c_in_mask <= 1'b0;

    /* the adder is enabled one cycle after shreg_done_latency_dly, in Q*N crunch only */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
            add1_ce <= shreg_done_latency_dly;
        else
            add1_ce <= 1'b0;

    /* NOTE(review): sub1_d and sub1_ce are used here before their declarations */
    /* in the "Subtractor" section below; some tools reject forward references. */
    assign s_data_in  = add1_s;
    assign sn_data_in = sub1_d;

    /* memory write strobes trail the corresponding clock enables by one cycle */
    always @(posedge clk) begin
        //
        s_wren  <= add1_ce;
        sn_wren <= sub1_ce;
    end


    //
    // Subtractor
    //

    /*
     * This subtractor is used to calculate SN = S - N.
     *
     * Same structure as the adder above: registered 32-bit difference with a
     * 1-bit borrow register fed back unless sub1_b_in_mask is set.
     *
     */
    reg             sub1_ce;            // clock enable
    reg  [31: 0]    sub1_d;             // difference output
    wire            sub1_b_in;          // borrow input
    wire [31: 0]    sub1_a;             // A-input
    reg  [31: 0]    sub1_b;             // B-input
    reg             sub1_b_in_mask;     // flag to not borrow anything from the very first word
    reg             sub1_b_out;         // borrow output

    /* add masking into borrow feedback chain */
    assign sub1_b_in = sub1_b_out & ~sub1_b_in_mask;

    /* 33-bit subtract: {borrow out, difference} = A - B - borrow in */
    always @(posedge clk)
        //
        if (sub1_ce)
            //
            {sub1_b_out, sub1_d} <= {{1{1'b0}}, sub1_a} - {{1{1'b0}}, sub1_b} - {{32{1'b0}}, sub1_b_in};

    /* A-input is the adder's sum, i.e. the current word of S */
    assign sub1_a = add1_s;

    /* B-input is the matching word of N from the external N memory */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
            sub1_b <= add1_ce ? n_bram_out : 32'hXXXXXXXX;
        else
            sub1_b <= 32'hXXXXXXXX;

    /* borrow-in is suppressed when (qn_addr_ext - 1) equals the last operand address */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
            sub1_b_in_mask <= (add1_ce && ((qn_addr_ext - 1'b1) == {1'b0, bram_addr_last})) ? 1'b1 : 1'b0;
        else
            sub1_b_in_mask <= 1'b0;

    /* the subtractor only runs while qn_addr_ext is above q_addr */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH)
            sub1_ce <= add1_ce && (qn_addr_ext > {1'b0, q_addr});
        else
            sub1_ce <= 1'b0;

    /* NOTE(review): this assign and the always block after it repeat, with   */
    /* identical right-hand sides, the s_data_in / s_wren drivers already     */
    /* present at the end of the "Adder" section.  The values agree, but the  */
    /* net and the register each end up with two drivers -- consider removing */
    /* one copy.                                                              */
    assign s_data_in = add1_s;

    always @(posedge clk)
        //
        s_wren <= add1_ce;

    /* Result selection flag, sampled in MULT_Q_N_FINAL: set when the final */
    /* subtraction borrowed and the final addition did not carry.           */
    /* NOTE(review): no declaration of flag_select_s is visible in this     */
    /* copy of the file.                                                    */
    always @(posedge clk)
        //
        if (fsm_state == FSM_STATE_MULT_Q_N_FINAL)
            flag_select_s <= sub1_b_out & ~add1_c_out;

    /* during the save phase forward S when flag_select_s is set, SN otherwise */
    always @(posedge clk)
        //
        case (fsm_state)
            FSM_STATE_SAVE_START,
            FSM_STATE_SAVE_WRITE:
                r_data_in <= flag_select_s ? s_data_out : sn_data_out;
        endcase


    //
    // FSM Process
    //
    // State register with asynchronous active-low reset to IDLE.
    //
    always @(posedge clk or negedge rst_n)
        //
        if (rst_n == 1'b0)  fsm_state <= FSM_STATE_IDLE;
        else                fsm_state <= fsm_next_state;


    //
    // FSM Transition Logic
    //
    // Every phase follows the same shape:
    //   load phases:     START -> SHIFT (until mult_cnt_done) -> WRITE
    //                    -> back to SHIFT, or FINAL once syst_cnt_init_done
    //   multiply phases: START -> CRUNCH (until shreg_done_unload) -> RELOAD
    //                    -> back to CRUNCH, or FINAL once the phase's address
    //                       counter is done
    //   save phase:      START -> WRITE (until r_addr_done) -> FINAL -> STOP
    //
    always @* begin
        //
        /* default: any state value not listed below falls through to STOP */
        fsm_next_state = FSM_STATE_STOP;
        //
        case (fsm_state)

            FSM_STATE_IDLE:                     if (ena_trig)           fsm_next_state = FSM_STATE_LOAD_B_START;
                                                else                    fsm_next_state = FSM_STATE_IDLE;
            //
            FSM_STATE_LOAD_B_START:                                     fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
            FSM_STATE_LOAD_B_SHIFT:             if (mult_cnt_done)      fsm_next_state = FSM_STATE_LOAD_B_WRITE;
                                                else                    fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
            FSM_STATE_LOAD_B_WRITE:             if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL;
                                                else                    fsm_next_state = FSM_STATE_LOAD_B_SHIFT;
            FSM_STATE_LOAD_B_FINAL:                                     fsm_next_state = FSM_STATE_LOAD_N_COEFF_START;
            //
            FSM_STATE_LOAD_N_COEFF_START:                               fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
            FSM_STATE_LOAD_N_COEFF_SHIFT:       if (mult_cnt_done)      fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE;
                                                else                    fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
            FSM_STATE_LOAD_N_COEFF_WRITE:       if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL;
                                                else                    fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT;
            FSM_STATE_LOAD_N_COEFF_FINAL:                               fsm_next_state = FSM_STATE_LOAD_N_START;
            //
            FSM_STATE_LOAD_N_START:                                     fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
            FSM_STATE_LOAD_N_SHIFT:             if (mult_cnt_done)      fsm_next_state = FSM_STATE_LOAD_N_WRITE;
                                                else                    fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
            FSM_STATE_LOAD_N_WRITE:             if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL;
                                                else                    fsm_next_state = FSM_STATE_LOAD_N_SHIFT;
            FSM_STATE_LOAD_N_FINAL:                                     fsm_next_state = FSM_STATE_MULT_A_B_START;
            //
            FSM_STATE_MULT_A_B_START:                                   fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
            FSM_STATE_MULT_A_B_CRUNCH:          if (shreg_done_unload)  fsm_next_state = FSM_STATE_MULT_A_B_RELOAD;
                                                else                    fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
            FSM_STATE_MULT_A_B_RELOAD:          if (ab_addr_ext_done)   fsm_next_state = FSM_STATE_MULT_A_B_FINAL;
                                                else                    fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH;
            FSM_STATE_MULT_A_B_FINAL:                                   fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START;
            //
            FSM_STATE_MULT_AB_N_COEFF_START:                            fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
            FSM_STATE_MULT_AB_N_COEFF_CRUNCH:   if (shreg_done_unload)  fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD;
                                                else                    fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
            FSM_STATE_MULT_AB_N_COEFF_RELOAD:   if (q_addr_done)        fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL;
                                                else                    fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH;
            FSM_STATE_MULT_AB_N_COEFF_FINAL:                            fsm_next_state = FSM_STATE_MULT_Q_N_START;
            //
            FSM_STATE_MULT_Q_N_START:                                   fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
            FSM_STATE_MULT_Q_N_CRUNCH:          if (shreg_done_unload)  fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD;
                                                else                    fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
            FSM_STATE_MULT_Q_N_RELOAD:          if (qn_addr_ext_done)   fsm_next_state = FSM_STATE_MULT_Q_N_FINAL;
                                                else                    fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH;
            FSM_STATE_MULT_Q_N_FINAL:                                   fsm_next_state = FSM_STATE_SAVE_START;
            //
            FSM_STATE_SAVE_START:                                       fsm_next_state = FSM_STATE_SAVE_WRITE;
            FSM_STATE_SAVE_WRITE:               if (r_addr_done)        fsm_next_state = FSM_STATE_SAVE_FINAL;
                                                else                    fsm_next_state = FSM_STATE_SAVE_WRITE;
            FSM_STATE_SAVE_FINAL:                                       fsm_next_state = FSM_STATE_STOP;
            //
            FSM_STATE_STOP:                                             fsm_next_state = FSM_STATE_IDLE;

        endcase
        //
    end


endmodule

//======================================================================
// End of file
//======================================================================