aboutsummaryrefslogblamecommitdiff
path: root/rtl/modexpng_mmm_dual.v
blob: 8d8b83d9dc0fd6ddc0363ea4f4d2e383d8f488af (plain) (tree)































                                                                           
                        
 
               

             



                           
                  
                
                  
    

                               





                        



                      



                      

                    


















                         
                      





              
                                     
                                       





                                     
                                       






                                        
                              
                      
                        
        


                                          

                                                  

                                                                

                                                   



                                                

                                               
                                                             
                                    

                                    
 
                                                




                                     
                                                  




                                       
                                           





                                
                           




                      

                                                             
 




                                                              




                  
                                          
          

                                                    






                                       

                                              



                                     
                                            







































                                                                       


                                                          






                                                        


                                                          



































                                                                                                                                  

                                                 

















                                                                   

                                                   


















                                                                       

                                                    















                                                                         
                                          
          
                                          





                                 





                                                                                                                                  
                  





                                                                                                                                                   

                                                  





                                                                                                                                     









                                                                           





                                                                                        
                  





                                                                                                                                                   
                                                        
                  





                                                                                                                                     







                                                                            

















                                                                                                                                                   































                                                                            
                                          
          
                         











                                                                                                                           





                                                                                                                                   
                      





                                                                                                                                     
                      





                                                                                                                                      









                                                                                                                       





                                                                                                                                 
                  





                                                                                                                                   
                  






                                                                                                                 








                                                                                             

















                                                                                                   





                                                                            


















                                                                                                                                          
                                       

                                                                      









                                                                                  

















                                                                               





                                                                           












                                                                                                                                        











































                                                                            

                                                        



















































                                                    
                                          
          
                         


























                                                                      
                             
    
                               

                        

                                                                           
               

                        

                                                                               



                                                                           



                                     


                                                                                                         
  
                                                                       
                                                                                       
  




                                                

                                                 

                                        

                                                                
                                                                    



                                

                                





















































                                                                            











                                                                                                                                             























                                                                                    
                                                









































                                                            
                                          
          

                                                                                            












                                                                                              




                                                                                                                                         
     


                   
                                            

                        
                                                                                                                                                                              
                        


                                                                                                                                                             
            


                                                                                                                                                       
            
                                                                                                                                                                         
 


                                                                                                                                                                          
            


                                                                                                                                                            
            
                                                                                                                                                                                      
 


                                                                                                                                                                              
            


                                                                                                                                                                
            
                                                                                                                                                                            
            
                                                                                                                                                          
            
                                                                                                                                   
            
                                                                                                                                












                                    
                                          
          
                                                                        
                             
                                                                         



                                                                     






                         
                                          
          
                                    

                                            
                                                                            
           











                                                         

         
//======================================================================
//
// Copyright (c) 2019, NORDUnet A/S All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// - Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may
//   be used to endorse or promote products derived from this software
//   without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//======================================================================

//
// Port list of modexpng_mmm_dual. Directions and widths of every port are
// given by the "Ports" declarations that follow the header includes; the
// names are grouped here by the interface they belong to.
//
module modexpng_mmm_dual
(
    // clock and active-low reset
    clk, rst_n,
    
    // NOTE(review): presumably a start/ready handshake -- the logic that
    // reads ena and drives rdy is not in the visible part of the file
    ena, rdy,
        
    // operation parameters
    ladder_mode,
    word_index_last,
    word_index_last_minus1,
    force_unity_b,
    only_reduce,
    just_multiply,
    
    // bank selectors for the wide and narrow operands
    sel_wide_in, sel_narrow_in,
    
    // wide storage read interface (main and auxiliary)
    rd_wide_xy_ena,
    rd_wide_xy_ena_aux,
    rd_wide_xy_bank,
    rd_wide_xy_bank_aux,
    rd_wide_xy_addr,
    rd_wide_xy_addr_aux,
    rd_wide_x_din,
    rd_wide_y_din,
    rd_wide_x_din_aux,
    rd_wide_y_din_aux,
    
    // narrow storage read interface
    rd_narrow_xy_ena,
    rd_narrow_xy_bank,
    rd_narrow_xy_addr,
    rd_narrow_x_din,
    rd_narrow_y_din,
    
    // rcmb_wide_* output group (bank, address, x/y data, valid)
    rcmb_wide_xy_bank,
    rcmb_wide_xy_addr,
    rcmb_wide_x_dout,
    rcmb_wide_y_dout,
    rcmb_wide_xy_valid,
    
    // rcmb_narrow_* output group (bank, address, x/y data, valid)
    rcmb_narrow_xy_bank,
    rcmb_narrow_xy_addr,
    rcmb_narrow_x_dout,
    rcmb_narrow_y_dout,
    rcmb_narrow_xy_valid,
    
    // rcmb_* output group (bank, address, x/y data, valid)
    rcmb_xy_bank,
    rcmb_xy_addr,
    rcmb_x_dout,
    rcmb_y_dout,
    rcmb_xy_valid,
    
    // NOTE(review): presumably an enable/ready pair towards a separate
    // reduction unit -- confirm against the instantiating module
    rdct_ena, rdct_rdy
);


    //
    // Headers
    //
    `include "modexpng_parameters.vh"
    `include "modexpng_mmm_dual_fsm.vh"


    //
    // Ports
    //
    input                        clk;       // all visible sequential logic is clocked on the rising edge
    input                        rst_n;     // active-low asynchronous reset of the FSM state and the storage enables
    
    // NOTE(review): ena/rdy look like a start/ready handshake; the logic
    // using them is outside the visible part of the file -- confirm.
    input                        ena;
    output                       rdy;
    
    input                   ladder_mode;
    input [7:0] word_index_last;            // wrap value for the wide addresses; bits [7:3] give the last column index
    input [7:0] word_index_last_minus1;     // compared against the narrow address to raise the "almost done" flags
    input       force_unity_b;
    input only_reduce;
    input just_multiply;
        
    input [BANK_ADDR_W-1:0] sel_wide_in;    // wide bank (main and aux) used during the SQUARE states
    input [BANK_ADDR_W-1:0] sel_narrow_in;  // narrow bank used during the SQUARE states
    
    // Wide storage read interface. The main port carries one 8-bit address
    // and one 18-bit x/y word per NUM_MULTS/2 lane; the aux port carries a
    // single address and a single x/y word.
    output                     rd_wide_xy_ena;
    output                     rd_wide_xy_ena_aux;
    output  [             BANK_ADDR_W -1:0] rd_wide_xy_bank;
    output  [             BANK_ADDR_W -1:0] rd_wide_xy_bank_aux;
    output  [ 8*NUM_MULTS/2-1:0] rd_wide_xy_addr;
    output  [           8-1:0] rd_wide_xy_addr_aux;
    input  [18*NUM_MULTS/2-1:0] rd_wide_x_din;
    input  [18*NUM_MULTS/2-1:0] rd_wide_y_din;
    input  [          18-1:0] rd_wide_x_din_aux;
    input  [          18-1:0] rd_wide_y_din_aux;

    // Narrow storage read interface: one 8-bit address, one 18-bit x/y word.
    output                    rd_narrow_xy_ena;
    output [             BANK_ADDR_W -1:0] rd_narrow_xy_bank;
    output [ 7:0] rd_narrow_xy_addr;
    input  [18-1:0] rd_narrow_x_din;
    input  [18-1:0] rd_narrow_y_din;

    // rcmb_wide_* outputs: bank, 8-bit address, 18-bit x/y data and a valid flag.
    // NOTE(review): the drivers of all rcmb_* outputs are not in the visible
    // part of the file; presumably they come from a recombination stage.
    output [BANK_ADDR_W -1:0] rcmb_wide_xy_bank;
    output [ 7:0] rcmb_wide_xy_addr;
    output [17:0] rcmb_wide_x_dout;
    output [17:0] rcmb_wide_y_dout;
    output        rcmb_wide_xy_valid;

    // rcmb_narrow_* outputs: same layout as the rcmb_wide_* group.
    output [BANK_ADDR_W -1:0] rcmb_narrow_xy_bank;
    output [ 7:0] rcmb_narrow_xy_addr;
    output [17:0] rcmb_narrow_x_dout;
    output [17:0] rcmb_narrow_y_dout;
    output        rcmb_narrow_xy_valid;

    // rcmb_* outputs: same layout again. rcmb_xy_valid and rcmb_xy_addr are
    // also read back inside the module to steer the wide aux address during
    // the RECTANGLE states.
    output [BANK_ADDR_W -1:0] rcmb_xy_bank;
    output [ 7:0] rcmb_xy_addr;
    output [17:0] rcmb_x_dout;
    output [17:0] rcmb_y_dout;
    output        rcmb_xy_valid;
    
    // NOTE(review): presumably enable/ready towards a separate reduction
    // unit -- the logic is outside the visible part of the file.
    output        rdct_ena;
    input         rdct_rdy;

    
    //
    // FSM Declaration
    //
    // Current state (initialised to IDLE) and the next state that is loaded
    // into it on every rising clock edge. fsm_state_next is declared as a
    // reg; the process that assigns it is outside the visible part of the file.
    reg [MMM_FSM_STATE_W-1:0] fsm_state = MMM_FSM_STATE_IDLE;
    reg [MMM_FSM_STATE_W-1:0] fsm_state_next;

    // Successor-state candidates, one per decision point named in the suffix.
    // NOTE(review): their drivers and their use in the next-state logic are
    // not in the visible part of the file.
    wire [MMM_FSM_STATE_W-1:0] fsm_state_after_idle;    
    wire [MMM_FSM_STATE_W-1:0] fsm_state_after_mult_square;
    wire [MMM_FSM_STATE_W-1:0] fsm_state_after_mult_triangle;
    wire [MMM_FSM_STATE_W-1:0] fsm_state_after_mult_rectangle;
    wire [MMM_FSM_STATE_W-1:0] fsm_state_after_square_holdoff;

    
    //
    // FSM Process
    //
    // State register: forced to IDLE asynchronously while rst_n is low,
    // otherwise takes the precomputed next state on each rising clock edge.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            fsm_state <= MMM_FSM_STATE_IDLE;
        end else begin
            fsm_state <= fsm_state_next;
        end
    end

        
    //
    // Storage Control Interface
    //
    // ---- wide storage, main read port ------------------------------------
    // One enable and one bank for the whole port, one 8-bit address per lane
    // (the per-lane addresses are packed onto rd_wide_xy_addr by a generate
    // loop elsewhere in the module).
    reg                    wide_xy_ena = 1'b0;
    reg  [BANK_ADDR_W-1:0] wide_xy_bank;
    reg  [            7:0] wide_xy_addr [0:3];

    assign rd_wide_xy_ena  = wide_xy_ena;
    assign rd_wide_xy_bank = wide_xy_bank;

    // ---- wide storage, auxiliary read port --------------------------------
    reg                    wide_xy_ena_aux = 1'b0;
    reg  [BANK_ADDR_W-1:0] wide_xy_bank_aux;
    reg  [            7:0] wide_xy_addr_aux;

    assign rd_wide_xy_ena_aux  = wide_xy_ena_aux;
    assign rd_wide_xy_bank_aux = wide_xy_bank_aux;
    assign rd_wide_xy_addr_aux = wide_xy_addr_aux;

    // ---- narrow storage read port -----------------------------------------
    reg                    narrow_xy_ena = 1'b0;
    reg  [BANK_ADDR_W-1:0] narrow_xy_bank;
    reg  [            7:0] narrow_xy_addr;
    reg  [            7:0] narrow_xy_addr_dly;

    assign rd_narrow_xy_ena  = narrow_xy_ena;
    assign rd_narrow_xy_bank = narrow_xy_bank;
    assign rd_narrow_xy_addr = narrow_xy_addr;

    // Pack the per-lane wide addresses onto the flat output bus:
    // lane z occupies bits [8*z+7 : 8*z] of rd_wide_xy_addr.
    genvar z;
    generate
        for (z = 0; z < (NUM_MULTS / 2); z = z + 1) begin : gen_rd_wide_xy_addr
            assign rd_wide_xy_addr[8*z + 7 : 8*z] = wide_xy_addr[z];
        end
    endgenerate
        
    //
    // Column Counter
    //
    reg  [4:0] col_index;       // column being processed right now
    reg  [4:0] col_index_prev;  // col_index as it was one clock cycle ago
    reg  [4:0] col_index_last;  // number of the final column
    reg  [4:0] col_index_next;  // column that will be processed next (kept ready in advance)
    reg        col_is_last;     // high while the final column is being processed

    // One-cycle delayed copy of the column index.
    always @(posedge clk) begin
        col_index_prev <= col_index;
    end

    //
    // Column Counter Increment Logic
    //
    // The column counter only moves when the FSM is about to enter one of
    // the *_INIT states; in every other state all counter registers hold.
    always @(posedge clk) begin
        if ((fsm_state_next == MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT)    ||
            (fsm_state_next == MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT)  ||
            (fsm_state_next == MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT)) begin
            // first column of a pass: restart the counter and capture the
            // last column index from the upper bits of word_index_last
            col_index_last <= word_index_last[7:3];
            col_is_last    <= 1'b0;
            col_index_next <= 5'd1;
            col_index      <= 5'd0;
        end else if ((fsm_state_next == MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT)    ||
                     (fsm_state_next == MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT)  ||
                     (fsm_state_next == MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT)) begin
            // subsequent column: step to the precomputed index, flag the
            // last column, and precompute the following index (wraps to 0)
            col_index_next <= (col_index_next == col_index_last) ? 5'd0 : (col_index_next + 5'd1);
            col_is_last    <= (col_index_next == col_index_last);
            col_index      <= col_index_next;
        end
    end


    //
    // Completion Flags
    //
    // Each multiplication phase has a combinational "almost done" detector
    // derived from the narrow address, followed by a chain of flops that
    // delay it by one cycle per stage (almost -> surely -> tardy).

    // square: narrow address has reached the second-to-last word index
    wire square_almost_done_comb = (narrow_xy_addr == word_index_last_minus1);
    reg  square_almost_done_flop = 1'b0;
    reg  square_surely_done_flop = 1'b0;

    // triangle: low three address bits match those of word_index_last_minus1
    // and the upper five bits equal the current column index
    wire triangle_almost_done_comb = (narrow_xy_addr[2:0] == word_index_last_minus1[2:0]) &&
                                     (narrow_xy_addr[7:3] == col_index);
    reg  triangle_almost_done_flop = 1'b0;
    reg  triangle_surely_done_flop = 1'b0;
    reg  triangle_tardy_done_flop  = 1'b0;

    // rectangle: same condition as for the square phase
    wire rectangle_almost_done_comb = (narrow_xy_addr == word_index_last_minus1);
    reg  rectangle_almost_done_flop = 1'b0;
    reg  rectangle_surely_done_flop = 1'b0;
    reg  rectangle_tardy_done_flop  = 1'b0;

    //
    // Square Completion Flags
    //
    always @(posedge clk) begin
        // second stage trails the first one by a single cycle
        square_surely_done_flop <= square_almost_done_flop;

        // first stage samples the detector only in the square BUSY states
        // and is cleared in every other state
        if ((fsm_state == MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
            (fsm_state == MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY))
            square_almost_done_flop <= square_almost_done_comb;
        else
            square_almost_done_flop <= 1'b0;
    end

    //
    // Triangle Completion Flags
    //
    always @(posedge clk) begin
        // delay chain: almost -> surely -> tardy, one cycle per stage
        triangle_tardy_done_flop  <= triangle_surely_done_flop;
        triangle_surely_done_flop <= triangle_almost_done_flop;

        // first stage samples the detector only in the triangle BUSY states
        // and is cleared in every other state
        if ((fsm_state == MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY) ||
            (fsm_state == MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY))
            triangle_almost_done_flop <= triangle_almost_done_comb;
        else
            triangle_almost_done_flop <= 1'b0;
    end
      
    //
    // Rectangle Completion Flags
    //
    always @(posedge clk) begin
        // delay chain: almost -> surely -> tardy, one cycle per stage
        rectangle_tardy_done_flop  <= rectangle_surely_done_flop;
        rectangle_surely_done_flop <= rectangle_almost_done_flop;

        // first stage samples the detector only in the rectangle BUSY states
        // and is cleared in every other state
        if ((fsm_state == MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY) ||
            (fsm_state == MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY))
            rectangle_almost_done_flop <= rectangle_almost_done_comb;
        else
            rectangle_almost_done_flop <= 1'b0;
    end


    //
    // Narrow Storage Control Logic
    //
    //
    // Registered read controls of the narrow storage. Address, bank and
    // enable are all decoded from fsm_state_next, so each value is loaded on
    // the same clock edge on which fsm_state takes that state.
    //
    // Only narrow_xy_ena has a reset value. narrow_xy_addr and narrow_xy_bank
    // are assigned in the non-reset branch only, so they hold their value
    // while rst_n is low.
    // NOTE(review): sharing an asynchronous-reset process with registers that
    // are not reset usually makes synthesis use rst_n as their clock enable;
    // consider moving address and bank into a plain @(posedge clk) process.
    //
    always @(posedge clk or negedge rst_n)
        //
        if (!rst_n) narrow_xy_ena <= 1'b0;
        else begin
            //
            // Narrow Address
            //
            // INIT states load 0; TRIG/BUSY states count up until the
            // phase-specific completion flags force a restart value
            // (0 for square and triangle, 1 for rectangle).
            //
            case (fsm_state_next)
                //
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT:   narrow_xy_addr <= 8'd0;
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:   narrow_xy_addr <= !square_almost_done_flop ? narrow_xy_addr + 1'b1 : 8'd0;
                //
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT: narrow_xy_addr <= 8'd0;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: narrow_xy_addr <= triangle_almost_done_flop || (col_is_last && triangle_surely_done_flop) ?
                    8'd0 :  narrow_xy_addr + 1'b1;
                //
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT: narrow_xy_addr <= 8'd0;
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: narrow_xy_addr <= rectangle_almost_done_flop || rectangle_surely_done_flop ?
                    8'd1 :  narrow_xy_addr + 1'b1;            
                //
                default:                            narrow_xy_addr <= 8'dX;
                //
            endcase
            //
            // Narrow Bank
            //
            // square: the externally selected bank; triangle and rectangle:
            // a fixed bank that is switched to BANK_NARROW_EXT once the
            // corresponding completion flags are set.
            //
            case (fsm_state_next)
                //
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:   narrow_xy_bank <= sel_narrow_in;
                //
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY: narrow_xy_bank <= col_is_last && (triangle_almost_done_flop || triangle_surely_done_flop) ?
                    BANK_NARROW_EXT : BANK_NARROW_COEFF;
                //
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY: narrow_xy_bank <= rectangle_almost_done_flop || rectangle_surely_done_flop ?
                    BANK_NARROW_EXT : BANK_NARROW_Q;            
                //
                // don't-care of the register's full width: a fixed-size
                // literal narrower than BANK_ADDR_W would be zero-extended
                // and leave the upper bit(s) at 0 instead of X
                default:                            narrow_xy_bank <= {BANK_ADDR_W{1'bX}};
                //
            endcase        
            //
            // Narrow Enable
            //
            // always on in INIT/TRIG states; in BUSY states it is dropped by
            // the phase-specific completion flag; off in all other states
            //
            case (fsm_state_next)
                //
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG:   narrow_xy_ena <= 1'b1;
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:   narrow_xy_ena <= ~square_almost_done_flop;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:   narrow_xy_ena <= 1'b1;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   narrow_xy_ena <= !col_is_last ? ~triangle_almost_done_flop : ~triangle_surely_done_flop; 
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:   narrow_xy_ena <= 1'b1;
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   narrow_xy_ena <= ~rectangle_surely_done_flop;
                //
                default:                              narrow_xy_ena <= 1'b0;
                //
            endcase
            //
        end


    //
    // Wide Storage Control Logic
    //

    // Constant table of per-lane start offsets: entry k holds the low three
    // bits of the k-th odd number (1, 3, 5, ...), built from the odd values
    // of the generate index below NUM_MULTS.
    wire [2:0] wide_offset_rom [0:3];

    generate
        for (z = 1; z < NUM_MULTS; z = z + 2) begin : gen_wide_offset_rom
            assign wide_offset_rom[(z - 1) >> 1] = z[2:0];
        end
    endgenerate    

    // Down-counter step with wrap-around: returns the current address minus
    // one, or wide_xy_addr_last when the current address is not greater
    // than zero.
    function  [7:0] wide_xy_addr_next;
        input [7:0] wide_xy_addr_current;
        input [7:0] wide_xy_addr_last;
        begin
            // start from the wrap value, override while still counting down
            wide_xy_addr_next = wide_xy_addr_last;
            if (wide_xy_addr_current > 8'd0)
                wide_xy_addr_next = wide_xy_addr_current - 8'd1;
        end
    endfunction
    
    integer j;
    always @(posedge clk or negedge rst_n)
        //
        if (!rst_n) begin
            wide_xy_ena <= 1'b0;
            wide_xy_ena_aux <= 1'b0;
        end else begin
            //
            // Wide Address
            //        
            for (j=0; j<(NUM_MULTS/2); j=j+1)
                //
                case (fsm_state_next)
                    //
                    // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions!
                    //
                    MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT:   wide_xy_addr[j] <= {5'd0, wide_offset_rom[j]};
                    MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT:   wide_xy_addr[j] <= {col_index_next, wide_offset_rom[j]};
                    MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                    MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                    MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                    MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:   wide_xy_addr[j] <= wide_xy_addr_next(wide_xy_addr[j], word_index_last);
                    //
                    MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT:   wide_xy_addr[j] <= {5'd0, wide_offset_rom[j]};
                    MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT:   wide_xy_addr[j] <= {col_index_next, wide_offset_rom[j]};
                    MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                    MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                    MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                    MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_addr[j] <= wide_xy_addr_next(wide_xy_addr[j], word_index_last);
                    //
                    MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT:   wide_xy_addr[j] <= {5'd0, wide_offset_rom[j]};
                    MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT:   wide_xy_addr[j] <= {col_index_next, wide_offset_rom[j]};
                    MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                    MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                    MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                    MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   wide_xy_addr[j] <= wide_xy_addr_next(wide_xy_addr[j], word_index_last);
                    //
                    default:                            wide_xy_addr[j] <= 8'dX;
                endcase
            //
            // Wide Aux Address
            //
            case (fsm_state_next)
                //
                // this can be reworked by having 8 address regs instead of 4 and using shifts instead of subtractions!
                //
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:   wide_xy_addr_aux <= wide_xy_addr_next(wide_xy_addr_aux, word_index_last);
                //
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT:   wide_xy_addr_aux <= {5'd0, 3'd1};
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_addr_aux <= wide_xy_addr_next(wide_xy_addr_aux, word_index_last);
                //
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT:   wide_xy_addr_aux <= 8'dX;//{5'd0, 3'd0};
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_HOLDOFF:    wide_xy_addr_aux <= rcmb_xy_valid ? rcmb_xy_addr : 8'dX;
                //recomb_fat_bram_xy_dout_valid && (recomb_fat_bram_xy_bank == BANK_FAT_ML) ?
                    //mac_fat_bram_xy_addr[4] + 1'b1 : mac_fat_bram_xy_addr[4];
                //
                default:                            wide_xy_addr_aux <= 8'dX;
            endcase
            //
            // Wide Bank
            //
            case (fsm_state_next)
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:    wide_xy_bank <= sel_wide_in;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:  wide_xy_bank <= BANK_WIDE_L;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_bank <= BANK_WIDE_L;
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,    
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   wide_xy_bank <= BANK_WIDE_N;            
                default:                             wide_xy_bank <= 3'bXXX;
            endcase
            //
            // Wide Aux Bank
            //
            case (fsm_state_next)
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:   wide_xy_bank_aux <= sel_wide_in;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG: wide_xy_bank_aux <= BANK_WIDE_H;
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:   wide_xy_bank_aux <= BANK_WIDE_L;
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,    
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_HOLDOFF:   if (rcmb_xy_valid) // rewrite using "Kolya-style" here (get rid of too many xxx's)
                    case (rcmb_xy_bank)
                        BANK_RCMB_ML: wide_xy_bank_aux <= BANK_WIDE_L;
                        BANK_RCMB_MH: wide_xy_bank_aux <= BANK_WIDE_H;
                        //BANK_RDCT_EXT: wide_xy_bank_aux <= BANK_WIDE_EXT; '3bXXX
                        default: wide_xy_bank_aux <= 3'bXXX; 
                     endcase
                     else wide_xy_bank_aux <= 3'bXXX;
                default:                            wide_xy_bank_aux <= 3'bXXX;
            endcase
            //
            // Wide Enable
            //
            case (fsm_state_next)
                MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:   wide_xy_ena <= 1'b1;
                default:                               wide_xy_ena <= 1'b0;
            endcase
            //
            // Wide Aux Enable
            //
            case (fsm_state_next)
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:  wide_xy_ena_aux <= 1'b1;
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT: wide_xy_ena_aux <= 1'b0;//1'b1;
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY,
                MMM_FSM_STATE_MULT_RECTANGLE_HOLDOFF:   wide_xy_ena_aux <= rcmb_xy_valid;// && (recomb_fat_bram_xy_bank == BANK_FAT_ML);
                default:                            wide_xy_ena_aux <= 1'b0;
            endcase
            //
        end
        
        
    //
    // Delay Lines
    //
    // One-clock delayed copy of the narrow address. The delayed value is
    // what the mode_z calculation (calc_mac_mode_z_*) compares against.
    always @(posedge clk) begin
        narrow_xy_addr_dly <= narrow_xy_addr;
    end

    
    //
    // DSP Array Logic
    //
    // Clock-enable flags for the DSP array. The same flags drive both the
    // X and the Y array block instances below. All start out as 0.
    reg             dsp_xy_ce_a = 1'b0;
    reg             dsp_xy_ce_b = 1'b0;
    reg             dsp_xy_ce_b_dly = 1'b0;     // dsp_xy_ce_b delayed by one clock
    reg             dsp_xy_ce_m = 1'b0;
    reg             dsp_xy_ce_p = 1'b0;
    reg             dsp_xy_ce_mode = 1'b0;
    
    // 9-bit mode word shared by both array blocks (initially all ones);
    // it is the last stage of the dsp_xy_mode_z_adv* pipeline.
    reg  [9   -1:0] dsp_xy_mode_z = {9{1'b1}};
    
    // "A" operand buses: five 18-bit words per array block
    wire [5*18-1:0] dsp_x_a;
    wire [5*18-1:0] dsp_y_a;

    // "B" operand registers: one 16-bit word per array block
    reg  [1*16-1:0] dsp_x_b;
    reg  [1*16-1:0] dsp_y_b;
    
    // 2-bit carry kept between consecutive narrow words while on-the-fly
    // carry recombination is active
    reg  [ 1:0] dsp_xy_b_carry;

    // Product buses: nine 47-bit words per array block
    wire [9*47-1:0] dsp_x_p;            
    wire [9*47-1:0] dsp_y_p;
        
    // "A" operand wiring. The commented-out generate loop and per-slice
    // assigns are an earlier formulation kept for reference; the live code
    // is the pair of concatenations, which place the auxiliary wide word in
    // the most significant part of the bus and the regular wide words below.
    // NOTE(review): presumably rd_wide_*_din is 4x18 bits and
    // rd_wide_*_din_aux is 1x18 bits so that the result fills all 5*18 bits
    // - confirm against the port declarations.
    //generate for (z=0; z<(NUM_MULTS/2); z=z+1)
        //begin : gen_dsp_xy_a_split
            //assign dsp_x_a[18*z+:18] = rd_wide_x_dout[z];
            //assign dsp_y_a[18*z+:18] = rd_wide_y_dout[z];
        //end
    //endgenerate
    
    assign dsp_x_a = {rd_wide_x_din_aux, rd_wide_x_din};
    assign dsp_y_a = {rd_wide_y_din_aux, rd_wide_y_din};
    
    //assign dsp_x_a[18*4+:18] = rd_wide_x_dout_aux;
    //assign dsp_y_a[18*4+:18] = rd_wide_y_dout_aux;
            
    // One-clock delayed copy of the B-operand clock enable; this delayed
    // flag is what feeds dsp_xy_ce_m and dsp_xy_ce_mode.
    always @(posedge clk) begin
        dsp_xy_ce_b_dly <= dsp_xy_ce_b;
    end
    

    // DSP array for the X channel. Control inputs (clock enables and
    // mode_z) are shared with the Y channel; only the a/b/p data buses
    // are channel-specific.
    modexpng_dsp_array_block dsp_array_block_x
    (
        .clk            (clk),
        
        .ce_a           (dsp_xy_ce_a),
        .ce_b           (dsp_xy_ce_b),
        .ce_m           (dsp_xy_ce_m),
        .ce_p           (dsp_xy_ce_p),
        .ce_mode        (dsp_xy_ce_mode),

        .mode_z         (dsp_xy_mode_z),
        
        .a              (dsp_x_a),
        .b              (dsp_x_b),
        .p              (dsp_x_p)
    );

    // DSP array for the Y channel; identical hookup to the X channel
    // apart from the dsp_y_* data buses.
    modexpng_dsp_array_block dsp_array_block_y
    (
        .clk            (clk),
        
        .ce_a           (dsp_xy_ce_a),
        .ce_b           (dsp_xy_ce_b),
        .ce_m           (dsp_xy_ce_m),
        .ce_p           (dsp_xy_ce_p),
        .ce_mode        (dsp_xy_ce_mode),

        .mode_z         (dsp_xy_mode_z),
        
        .a              (dsp_y_a),
        .b              (dsp_y_b),
        .p              (dsp_y_p)
    );


   

    //
    // DSP Control Logic
    //
    reg narrow_xy_ena_dly1 = 1'b0;      // narrow_xy_ena, one clock late
    reg narrow_xy_ena_dly2 = 1'b0;      // narrow_xy_ena, two clocks late

    // Derives the DSP clock enables from the (delayed) narrow read enable.
    // Everything here is cleared by the asynchronous active-low reset.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            {narrow_xy_ena_dly2, narrow_xy_ena_dly1} <= 2'b00;
            dsp_xy_ce_mode <= 1'b0;
            dsp_xy_ce_p    <= 1'b0;
            dsp_xy_ce_m    <= 1'b0;
            dsp_xy_ce_b    <= 1'b0;
            dsp_xy_ce_a    <= 1'b0;
        end else begin
            // two-stage shift register on the narrow enable
            {narrow_xy_ena_dly2, narrow_xy_ena_dly1} <= {narrow_xy_ena_dly1, narrow_xy_ena};
            // A is enabled while either delayed copy is high, B follows
            // the second copy only
            dsp_xy_ce_a    <= narrow_xy_ena_dly1 | narrow_xy_ena_dly2;
            dsp_xy_ce_b    <= narrow_xy_ena_dly2;
            // M and MODE follow the delayed B enable, P trails M by a clock
            dsp_xy_ce_mode <= dsp_xy_ce_b_dly;
            dsp_xy_ce_m    <= dsp_xy_ce_b_dly;
            dsp_xy_ce_p    <= dsp_xy_ce_m;
        end
    end
        
    //
    // DSP Feed Logic
    //
    reg dsp_merge_xy_b;
    reg dsp_merge_xy_b_first;

    always @(posedge clk) begin
        // Merge flag: set when the first square column is triggered,
        // cleared when the first triangle column is triggered, and held
        // in every other state.
        if (fsm_state == MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG)
            dsp_merge_xy_b <= 1'b1;
        else if (fsm_state == MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG)
            dsp_merge_xy_b <= 1'b0;

        // "First" flag: high for exactly the clock following any square
        // column trigger state, low otherwise.
        if ((fsm_state == MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG) ||
            (fsm_state == MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG))
            dsp_merge_xy_b_first <= 1'b1;
        else
            dsp_merge_xy_b_first <= 1'b0;
    end

    //
    // On-the-fly Carry Recombination
    //
    // Incoming narrow words with the pending 2-bit carry added in
    wire [17:0] rd_narrow_x_din_carry = {16'd0, dsp_xy_b_carry} + rd_narrow_x_din;
    wire [17:0] rd_narrow_y_din_carry = {16'd0, dsp_xy_b_carry} + rd_narrow_y_din;

    // Channel selection: Y when ladder_mode is set, X otherwise
    wire [17:0] rd_narrow_xy_din_carry_mux = ladder_mode ? rd_narrow_y_din_carry : rd_narrow_x_din_carry;

    // Word actually offered to the B registers: either the lower 16 bits of
    // the carry-corrected word, or (when force_unity_b is set) a constant
    // that is WORD_ONE for the first word and WORD_ZERO for the rest.
    wire [15:0] rd_narrow_xy_dout_carry_mux_or_unity = force_unity_b ?
        (dsp_merge_xy_b_first ? WORD_ONE : WORD_ZERO) : rd_narrow_xy_din_carry_mux[15:0];
  
    // B operand and carry registers.
    // Defaults (applied whenever nothing below overrides them): B words are
    // don't-care and the carry is cleared. While the twice-delayed narrow
    // enable is high, B is loaded either with the raw narrow words (merge
    // off) or with the shared carry-corrected / unity word (merge on); only
    // the merge path keeps a carry for the next word.
    always @(posedge clk) begin
        dsp_x_b        <= WORD_DNC;
        dsp_y_b        <= WORD_DNC;
        dsp_xy_b_carry <= 2'b00;

        if (narrow_xy_ena_dly2) begin
            if (!dsp_merge_xy_b) begin
                dsp_x_b <= rd_narrow_x_din[15:0];
                dsp_y_b <= rd_narrow_y_din[15:0];
            end else begin
                dsp_x_b        <= rd_narrow_xy_dout_carry_mux_or_unity;
                dsp_y_b        <= rd_narrow_xy_dout_carry_mux_or_unity;
                dsp_xy_b_carry <= rd_narrow_xy_din_carry_mux[17:16];
            end
        end
    end

        
    // Advance copies of dsp_xy_mode_z. The value is computed into *_adv4
    // and then shifted adv4 -> adv3 -> adv2 -> adv1 -> dsp_xy_mode_z, one
    // stage per clock. All stages start out as all ones.
    reg  [9   -1:0] dsp_xy_mode_z_adv1 = {9{1'b1}};
    reg  [9   -1:0] dsp_xy_mode_z_adv2 = {9{1'b1}};
    reg  [9   -1:0] dsp_xy_mode_z_adv3 = {9{1'b1}};
    reg  [9   -1:0] dsp_xy_mode_z_adv4 = {9{1'b1}};
        
         function [NUM_MULTS:0] calc_mac_mode_z_square;
        //
        // Returns the mode word for the square phase. When the upper five
        // address bits equal the given column index, the bit selected by the
        // lower three address bits is cleared and all other bits are set;
        // otherwise every bit is set. The literals below are 9 bits wide,
        // i.e. they match the return width only for NUM_MULTS == 8.
        //
        input [4:0] col_index_value;
        input [7:0] narrow_xy_addr_value;
        begin
            if (narrow_xy_addr_value[7:3] == col_index_value) begin
                case (narrow_xy_addr_value[2:0])
                    3'd0: calc_mac_mode_z_square = 9'b1_1111_1110;
                    3'd1: calc_mac_mode_z_square = 9'b1_1111_1101;
                    3'd2: calc_mac_mode_z_square = 9'b1_1111_1011;
                    3'd3: calc_mac_mode_z_square = 9'b1_1111_0111;
                    3'd4: calc_mac_mode_z_square = 9'b1_1110_1111;
                    3'd5: calc_mac_mode_z_square = 9'b1_1101_1111;
                    3'd6: calc_mac_mode_z_square = 9'b1_1011_1111;
                    3'd7: calc_mac_mode_z_square = 9'b1_0111_1111;
                endcase
            end else begin
                calc_mac_mode_z_square = {(NUM_MULTS+1){1'b1}};
            end
        end
    endfunction
    
    function [NUM_MULTS:0] calc_mac_mode_z_rectangle;
        //
        // Returns the mode word for the rectangle phase. When the upper five
        // address bits equal the given column index, the bit selected by the
        // lower three address bits is cleared and all other bits are set;
        // otherwise every bit is set. The literals below are 9 bits wide,
        // i.e. they match the return width only for NUM_MULTS == 8.
        //
        input [4:0] col_index_value;
        input [7:0] narrow_xy_addr_value;
        begin
            if (narrow_xy_addr_value[7:3] == col_index_value) begin
                case (narrow_xy_addr_value[2:0])
                    3'd0: calc_mac_mode_z_rectangle = 9'b1_1111_1110;
                    3'd1: calc_mac_mode_z_rectangle = 9'b1_1111_1101;
                    3'd2: calc_mac_mode_z_rectangle = 9'b1_1111_1011;
                    3'd3: calc_mac_mode_z_rectangle = 9'b1_1111_0111;
                    3'd4: calc_mac_mode_z_rectangle = 9'b1_1110_1111;
                    3'd5: calc_mac_mode_z_rectangle = 9'b1_1101_1111;
                    3'd6: calc_mac_mode_z_rectangle = 9'b1_1011_1111;
                    3'd7: calc_mac_mode_z_rectangle = 9'b1_0111_1111;
                endcase
            end else begin
                calc_mac_mode_z_rectangle = {(NUM_MULTS+1){1'b1}};
            end
        end
    endfunction
        
    // First stage of the mode_z pipeline, selected by the upcoming FSM state:
    //   - any *_TRIG state           -> all zeros
    //   - square / rectangle *_BUSY  -> per-address mask from calc_mac_mode_z_*
    //   - triangle *_BUSY and others -> all ones
    always @(posedge clk) begin
        case (fsm_state_next)
            MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG,
            MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,
            MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG,
            MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:
                dsp_xy_mode_z_adv4 <= 9'b0_0000_0000;

            MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY,
            MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, narrow_xy_addr_dly);

            MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY,
            MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                dsp_xy_mode_z_adv4 <= calc_mac_mode_z_rectangle(col_index_prev, narrow_xy_addr_dly);

            MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,
            MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                dsp_xy_mode_z_adv4 <= 9'b1_1111_1111;

            default:
                dsp_xy_mode_z_adv4 <= 9'b1_1111_1111;
        endcase
    end

    // Four-stage shift of the mode word: adv4 -> adv3 -> adv2 -> adv1 -> mode_z.
    // All assignments are non-blocking, so each stage takes the previous
    // stage's old value and the listing order is irrelevant.
    always @(posedge clk) begin
        dsp_xy_mode_z_adv3 <= dsp_xy_mode_z_adv4;
        dsp_xy_mode_z_adv2 <= dsp_xy_mode_z_adv3;
        dsp_xy_mode_z_adv1 <= dsp_xy_mode_z_adv2;
        dsp_xy_mode_z      <= dsp_xy_mode_z_adv1;
    end
        


    
    
    //
    // Recombinator
    //
    // Recombinator handshake: rcmb_ena is driven by the enable logic below,
    // rcmb_rdy is returned by the recombinator and is what the *_HOLDOFF
    // FSM states wait for.
    reg  rcmb_ena = 1'b0;
    wire rcmb_rdy;

    // The recombinator consumes the DSP product buses (qualified by
    // dsp_xy_ce_p) together with FSM/column/word bookkeeping, and produces
    // three result streams: wide, narrow, and a third one on its
    // rdct_narrow_* ports, which this module names rcmb_xy_*.
    modexpng_recombinator_block recombinator_block
    (
        .clk                            (clk),
        .rst_n                          (rst_n),
        
        .ena                            (rcmb_ena),
        .rdy                            (rcmb_rdy),
        
        .fsm_state_next                 (fsm_state_next),
        
        .word_index_last                (word_index_last),
        
        .dsp_xy_ce_p                    (dsp_xy_ce_p),
        .dsp_x_p                        (dsp_x_p),
        .dsp_y_p                        (dsp_y_p),
        
        .col_index                      (col_index),
        .col_index_last                 (col_index_last),
        
        .rd_narrow_xy_addr                 (narrow_xy_addr),
        .rd_narrow_xy_bank                 (narrow_xy_bank),
        
        .rcmb_wide_xy_bank          (rcmb_wide_xy_bank),
        .rcmb_wide_xy_addr          (rcmb_wide_xy_addr),
        .rcmb_wide_x_dout           (rcmb_wide_x_dout),
        .rcmb_wide_y_dout           (rcmb_wide_y_dout),
        .rcmb_wide_xy_valid         (rcmb_wide_xy_valid),
        
        .rcmb_narrow_xy_bank        (rcmb_narrow_xy_bank),
        .rcmb_narrow_xy_addr        (rcmb_narrow_xy_addr),
        .rcmb_narrow_x_dout         (rcmb_narrow_x_dout),
        .rcmb_narrow_y_dout         (rcmb_narrow_y_dout),
        .rcmb_narrow_xy_valid       (rcmb_narrow_xy_valid),
        
        // note the name change: rdct_narrow_* ports <-> rcmb_xy_* nets
        .rdct_narrow_xy_bank        (rcmb_xy_bank),
        .rdct_narrow_xy_addr        (rcmb_xy_addr),
        .rdct_narrow_x_dout         (rcmb_x_dout),
        .rdct_narrow_y_dout         (rcmb_y_dout),
        .rdct_narrow_xy_valid       (rcmb_xy_valid)

    );
    
    
    //
    // Recombinator Enable Logic
    //    
    // rcmb_ena is a registered flag that is high for the clock after
    // dsp_xy_ce_a is the only active enable among ce_a/ce_b/ce_m/ce_p.
    // Cleared by the asynchronous active-low reset.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            rcmb_ena <= 1'b0;
        end else begin
            rcmb_ena <= dsp_xy_ce_a && !(dsp_xy_ce_b || dsp_xy_ce_m || dsp_xy_ce_p);
        end
    end

        
    //
    // Handy Completion Flags
    //    
    // Per-phase "column finished" flags consumed by the FSM transition logic.
    // Square always uses the "surely" flop and rectangle always uses the
    // "tardy" flop; triangle uses the "tardy" flop on the last column and
    // the "surely" flop on every other column.
    wire square_done    = square_surely_done_flop;
    wire triangle_done  = col_is_last ? triangle_tardy_done_flop : triangle_surely_done_flop;
    wire rectangle_done = rectangle_tardy_done_flop;
    

    //
    // FSM Transition Logic
    //
    // Where to go when leaving IDLE: straight to the triangle phase when
    // only_reduce is set, otherwise start with the square phase.
    assign fsm_state_after_idle =
        only_reduce ? MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT
                    : MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT;

    // Where to go after a non-first column of each phase: the phase's
    // holdoff state after the last column, otherwise the next column.
    assign fsm_state_after_mult_square =
        col_is_last ? MMM_FSM_STATE_MULT_SQUARE_HOLDOFF
                    : MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT;
    assign fsm_state_after_mult_triangle =
        col_is_last ? MMM_FSM_STATE_MULT_TRIANGLE_HOLDOFF
                    : MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT;
    assign fsm_state_after_mult_rectangle =
        col_is_last ? MMM_FSM_STATE_MULT_RECTANGLE_HOLDOFF
                    : MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT;

    // Where to go after the square holdoff: stop right away when
    // just_multiply is set, otherwise continue with the triangle phase.
    assign fsm_state_after_square_holdoff =
        just_multiply ? MMM_FSM_STATE_STOP
                      : MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT;
     

    // Combinational next-state function of the main FSM.
    //
    // Every multiplication phase (square, triangle, rectangle) walks through
    // INIT -> TRIG -> BUSY for column 0 and then for the remaining columns,
    // followed by a HOLDOFF state that waits for the recombinator. Waiting
    // states stay put by re-selecting the current state (fsm_state).
    always @* begin
        // fall-back value, also keeps the block free of latches
        fsm_state_next = MMM_FSM_STATE_IDLE;

        case (fsm_state)

            // ---- idle ----
            MMM_FSM_STATE_IDLE:
                fsm_state_next = ena ? fsm_state_after_idle : fsm_state;

            // ---- square phase ----
            MMM_FSM_STATE_MULT_SQUARE_COL_0_INIT:
                fsm_state_next = MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG;
            MMM_FSM_STATE_MULT_SQUARE_COL_0_TRIG:
                fsm_state_next = MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY;
            MMM_FSM_STATE_MULT_SQUARE_COL_0_BUSY:
                fsm_state_next = square_done ? MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT : fsm_state;

            MMM_FSM_STATE_MULT_SQUARE_COL_N_INIT:
                fsm_state_next = MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG;
            MMM_FSM_STATE_MULT_SQUARE_COL_N_TRIG:
                fsm_state_next = MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY;
            MMM_FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                fsm_state_next = square_done ? fsm_state_after_mult_square : fsm_state;

            MMM_FSM_STATE_MULT_SQUARE_HOLDOFF:
                fsm_state_next = rcmb_rdy ? fsm_state_after_square_holdoff : fsm_state;

            // ---- triangle phase ----
            MMM_FSM_STATE_MULT_TRIANGLE_COL_0_INIT:
                fsm_state_next = MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG;
            MMM_FSM_STATE_MULT_TRIANGLE_COL_0_TRIG:
                fsm_state_next = MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY;
            MMM_FSM_STATE_MULT_TRIANGLE_COL_0_BUSY:
                fsm_state_next = triangle_done ? MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT : fsm_state;

            MMM_FSM_STATE_MULT_TRIANGLE_COL_N_INIT:
                fsm_state_next = MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG;
            MMM_FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:
                fsm_state_next = MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY;
            MMM_FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                fsm_state_next = triangle_done ? fsm_state_after_mult_triangle : fsm_state;

            MMM_FSM_STATE_MULT_TRIANGLE_HOLDOFF:
                fsm_state_next = rcmb_rdy ? MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT : fsm_state;

            // ---- rectangle phase ----
            MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT:
                fsm_state_next = MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG;
            MMM_FSM_STATE_MULT_RECTANGLE_COL_0_TRIG:
                fsm_state_next = MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY;
            MMM_FSM_STATE_MULT_RECTANGLE_COL_0_BUSY:
                fsm_state_next = rectangle_done ? MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT : fsm_state;

            MMM_FSM_STATE_MULT_RECTANGLE_COL_N_INIT:
                fsm_state_next = MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG;
            MMM_FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:
                fsm_state_next = MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY;
            MMM_FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                fsm_state_next = rectangle_done ? fsm_state_after_mult_rectangle : fsm_state;

            MMM_FSM_STATE_MULT_RECTANGLE_HOLDOFF:
                fsm_state_next = rcmb_rdy ? MMM_FSM_STATE_WAIT_REDUCTOR : fsm_state;

            // ---- wind-down ----
            MMM_FSM_STATE_WAIT_REDUCTOR:
                fsm_state_next = rdct_rdy ? MMM_FSM_STATE_STOP : fsm_state;

            MMM_FSM_STATE_STOP:
                fsm_state_next = MMM_FSM_STATE_IDLE;

            default:
                fsm_state_next = MMM_FSM_STATE_IDLE;

        endcase
    end


    //
    // Reductor Control Logic
    //
    reg rdct_ena_reg = 1'b0;

    assign rdct_ena = rdct_ena_reg;

    // The reductor enable is a registered flag that is high for the clock
    // after the FSM sits in the first rectangle column's INIT state, and low
    // at all other times. Cleared by the asynchronous active-low reset.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            rdct_ena_reg <= 1'b0;
        end else if (fsm_state == MMM_FSM_STATE_MULT_RECTANGLE_COL_0_INIT) begin
            rdct_ena_reg <= 1'b1;
        end else begin
            rdct_ena_reg <= 1'b0;
        end
    end
    
    
    //
    // Ready Logic
    //
    reg rdy_reg = 1'b1;

    assign rdy = rdy_reg;

    // Ready flag: high after reset, dropped when ena arrives while ready,
    // and raised again once the FSM reaches the STOP state. The two
    // conditions cannot be true together because they test opposite
    // values of rdy.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            rdy_reg <= 1'b1;
        end else if (rdy && ena) begin
            rdy_reg <= 1'b0;
        end else if (!rdy && (fsm_state == MMM_FSM_STATE_STOP)) begin
            rdy_reg <= 1'b1;
        end
    end


    //
    // Debug
    //
    `ifdef MODEXPNG_ENABLE_DEBUG
        // Debug-only counter: number of clocks during which dsp_xy_ce_m
        // was high. Only compiled when MODEXPNG_ENABLE_DEBUG is defined.
        real load_cyc_mult = 0.0;
        always @(posedge clk) begin
            if (dsp_xy_ce_m) begin
                load_cyc_mult <= load_cyc_mult + 1.0;
            end
        end
    `endif
    
endmodule