aboutsummaryrefslogblamecommitdiff
path: root/bench/tb_square.v
blob: d35a5cccdff4d772a63355e656737d646db78d0f (plain) (tree)








































                                                             

                             

                       









































                                                                                       



















                                                                                                           


























                                                                                   




















                                               



                                          





                                          





                                           












                                                           
                                            





                                                                 
                                                  












                                                                       

                                                                                                













                                                                       

                                                                                               








                                                



                                                                 










                                                                 



                                                                 










































                                                                 








                                                     

                                                 





                                            








                                             























                                          
                 
                 













                                   
                                             
    
                            
                            
                            







                                   
                                             
        
                            
                            
                            
        
                                                  










                                                                     



                                                   

















































                                                         















                                                                                           



                                           

                                            

                                                     

                                             





                                                      
    
                                                                                          

                                                                                                                                                     
 
            
    






                                             


                                                                                            
                    

                                                                                                  


               


                                                                               


                         




















                                                                                                      






















                                                                                                      
 




                                                           
                                                             
                                                              

    


                             
              





                                                                                                                                                  








                                                                                                                                                                      







                                                                                                                                                        


                                                                              








                                                         
              
                               

                                         
              
                                 




                                                                                                                         



                                                                                                                                              





                                                                                                                           







                                                                                                                                                 
                  

                                                                                    
















                                                                                                                                          







                                                                                                                                             




                                                                                



                             
              





                                                                                        
              





                                                                                                                                                                      









                                                                                                                                                        










                                                                               






                                                                                                                                 





                                                                                                                                   
                                                                                                                












                                                                                                           





                                                                                                                                                                      





                                                                                                                 
                                                                               









                                             





                                               






                                                                               










                                                                            



                                                   
 








                                                                    

                         







                                                                                                                 







                                                  
                                                    








































                                                                                                 
                                             

                                                      





                                                   

               
                                             

                                                      




                                                                                                        


               


                                                                                                                                 




                                             
                                                                                    
                                             

                                                                                                                                            
                                                                                                  

                                                                                    



                                                                                                                                                
                                                                                    









                                                                


                                                        


                                                                    







                                                                         

                       
                                                                   

               

                                                      




                                                                    







                                                                            

                       
                                                                      
           
               















                                                                                




                                               



                                     























                                                                        


                             
                            
                            
    
                               



                                          




                                                                                                     
                   









                                                                                         






















                                                            

 






















                                                              


















                                                                                                  


















                                                                                              


















                                                                                              
                                                                   
                                                                                                                           

                                                                        









                                                                                                                                         
                                                                                                                                                           


                                                                                                                         










                                                                                                                                                                        
            










                                                                                                                                                                            









                                                                                                                         
`timescale 1ns / 1ps

module tb_square;


    //
    // Headers
    //
    `include "../rtl/modexpng_parameters.vh"
    `include "../rtl/modexpng_parameters_x8.vh"
    `include "../rtl/modexpng_mmm_fsm.vh"


    //
    // Clock
    //
    `define CLK_FREQUENCY_MHZ   100.0
    `define CLK_PERIOD_NS       (1000.0 / `CLK_FREQUENCY_MHZ)
    `define CLK_PERIOD_HALF_NS  (0.5 * `CLK_PERIOD_NS)
    
	reg clk = 1'b0;

    // Free-running clock: drive clk high after one half period and low after
    // the next, repeated for the whole simulation.
    initial
        forever begin
            #(`CLK_PERIOD_HALF_NS);
            clk = 1'b1;
            #(`CLK_PERIOD_HALF_NS);
            clk = 1'b0;
        end
    
    
    //
    // Reset
    //
    // Synchronous reset for the FSM state register.  Starts asserted; the
    // stimulus block clears it after its first ten wait_clock_tick calls.
    reg rst = 1'b1;
    
    

    //
    // Test vectors: T1, T2, AB, N_COEFF, Q, N, M
    //
    // Vector storage, one 18-bit word per element.  Contents are assigned in
    // the initial block that follows.
    reg [17:0] T1[0:31];        // 32 words; written to the X side of the fat and slim T1T2 banks
    reg [17:0] T2[0:31];        // 32 words; written to the Y side of the fat and slim T1T2 banks
    reg [17:0] AB[0:63];        // 64 words; presumably the reference checked by verify_ab -- TODO confirm
    reg [17:0] N_COEFF[0:32];   // 33 words; words 0..31 go to the slim N_COEFF bank, word 32 to the slim EXT bank
    reg [17:0] Q[0:32];         // 33 words; presumably the reference checked by verify_q -- TODO confirm
    reg [17:0] N[0:31];         // 32 words; written to both sides of the fat N bank
    reg [17:0] M[0:64];         // 65 words; presumably the reference checked by verify_m -- TODO confirm


    //
    // Init
    //
    // Fills the vector arrays with fixed constants at time zero.  Every
    // literal is an 18-bit hex value; the groups appear in declaration order.
    initial begin
        // T1: 32 words
        T1[ 0] = 18'h0f13e; T1[ 1] = 18'h0daf6; T1[ 2] = 18'h0aaa9; T1[ 3] = 18'h0c2c2;
        T1[ 4] = 18'h0fc5f; T1[ 5] = 18'h12164; T1[ 6] = 18'h14375; T1[ 7] = 18'h15615;
        T1[ 8] = 18'h0d8e2; T1[ 9] = 18'h0ec15; T1[10] = 18'h17c46; T1[11] = 18'h0c922;
        T1[12] = 18'h08f00; T1[13] = 18'h152f9; T1[14] = 18'h0b0b6; T1[15] = 18'h0ce87;
        T1[16] = 18'h178f2; T1[17] = 18'h09efb; T1[18] = 18'h0409d; T1[19] = 18'h11104;
        T1[20] = 18'h0b4a6; T1[21] = 18'h158a6; T1[22] = 18'h0514e; T1[23] = 18'h0ec55;
        T1[24] = 18'h11e73; T1[25] = 18'h11ddd; T1[26] = 18'h07bd4; T1[27] = 18'h0638b;
        T1[28] = 18'h0e805; T1[29] = 18'h11c4f; T1[30] = 18'h0a2eb; T1[31] = 18'h05454;
        // T2: 32 words
        T2[ 0] = 18'h1a479; T2[ 1] = 18'h102f5; T2[ 2] = 18'h10e72; T2[ 3] = 18'h120b1;
        T2[ 4] = 18'h169cd; T2[ 5] = 18'h1d0c4; T2[ 6] = 18'h11462; T2[ 7] = 18'h12015;
        T2[ 8] = 18'h16fca; T2[ 9] = 18'h1044f; T2[10] = 18'h122b4; T2[11] = 18'h10a5a;
        T2[12] = 18'h12620; T2[13] = 18'h0e01a; T2[14] = 18'h095cd; T2[15] = 18'h1278a;
        T2[16] = 18'h10763; T2[17] = 18'h09fe7; T2[18] = 18'h0d35c; T2[19] = 18'h10e24;
        T2[20] = 18'h1527d; T2[21] = 18'h115b3; T2[22] = 18'h05443; T2[23] = 18'h1190a;
        T2[24] = 18'h0fcc3; T2[25] = 18'h115e2; T2[26] = 18'h0a398; T2[27] = 18'h0608d;
        T2[28] = 18'h13075; T2[29] = 18'h0d816; T2[30] = 18'h0bb4c; T2[31] = 18'h04e8a;
        // AB: 64 words
        AB[ 0] = 18'h0be4e; AB[ 1] = 18'h0fed7; AB[ 2] = 18'h09496; AB[ 3] = 18'h07181;
        AB[ 4] = 18'h0ee73; AB[ 5] = 18'h04692; AB[ 6] = 18'h0141a; AB[ 7] = 18'h0078c;
        AB[ 8] = 18'h030eb; AB[ 9] = 18'h0217c; AB[10] = 18'h0696f; AB[11] = 18'h0a165;
        AB[12] = 18'h0b753; AB[13] = 18'h04af9; AB[14] = 18'h0ed7c; AB[15] = 18'h079ce;
        AB[16] = 18'h0e863; AB[17] = 18'h097df; AB[18] = 18'h07984; AB[19] = 18'h048af;
        AB[20] = 18'h0197f; AB[21] = 18'h0206a; AB[22] = 18'h027e7; AB[23] = 18'h04b3a;
        AB[24] = 18'h03312; AB[25] = 18'h03b56; AB[26] = 18'h04487; AB[27] = 18'h0bd6a;
        AB[28] = 18'h04e4b; AB[29] = 18'h069ca; AB[30] = 18'h0f994; AB[31] = 18'h0dd4e;
        AB[32] = 18'h1b024; AB[33] = 18'h0127f; AB[34] = 18'h02631; AB[35] = 18'h0186b;
        AB[36] = 18'h03adb; AB[37] = 18'h05368; AB[38] = 18'h059a5; AB[39] = 18'h002e0;
        AB[40] = 18'h0b78a; AB[41] = 18'h016f3; AB[42] = 18'h0b58d; AB[43] = 18'h03ddb;
        AB[44] = 18'h078b0; AB[45] = 18'h0073b; AB[46] = 18'h07337; AB[47] = 18'h0c7b0;
        AB[48] = 18'h00668; AB[49] = 18'h0106d; AB[50] = 18'h01a44; AB[51] = 18'h05ee3;
        AB[52] = 18'h0462d; AB[53] = 18'h0fdeb; AB[54] = 18'h05f85; AB[55] = 18'h02af9;
        AB[56] = 18'h0e1c0; AB[57] = 18'h00989; AB[58] = 18'h01201; AB[59] = 18'h0e194;
        AB[60] = 18'h07f93; AB[61] = 18'h0e739; AB[62] = 18'h07cf6; AB[63] = 18'h019df;
        // N_COEFF: 33 words
        N_COEFF[ 0] = 18'h05a97; N_COEFF[ 1] = 18'h0ac69; N_COEFF[ 2] = 18'h0d51e; N_COEFF[ 3] = 18'h07326;
        N_COEFF[ 4] = 18'h01053; N_COEFF[ 5] = 18'h0f68a; N_COEFF[ 6] = 18'h09c70; N_COEFF[ 7] = 18'h064f7;
        N_COEFF[ 8] = 18'h01041; N_COEFF[ 9] = 18'h0c2bf; N_COEFF[10] = 18'h0f01f; N_COEFF[11] = 18'h01842;
        N_COEFF[12] = 18'h0e69a; N_COEFF[13] = 18'h037ea; N_COEFF[14] = 18'h0b4a0; N_COEFF[15] = 18'h0c1ab;
        N_COEFF[16] = 18'h0bd5b; N_COEFF[17] = 18'h09e5e; N_COEFF[18] = 18'h039bd; N_COEFF[19] = 18'h06430;
        N_COEFF[20] = 18'h0b460; N_COEFF[21] = 18'h08bd4; N_COEFF[22] = 18'h09fcd; N_COEFF[23] = 18'h05391;
        N_COEFF[24] = 18'h0fa45; N_COEFF[25] = 18'h08892; N_COEFF[26] = 18'h0732c; N_COEFF[27] = 18'h0baf6;
        N_COEFF[28] = 18'h067a9; N_COEFF[29] = 18'h0b184; N_COEFF[30] = 18'h02089; N_COEFF[31] = 18'h0297b;
        N_COEFF[32] = 18'h01810;
        // Q: 33 words
        Q[ 0] = 18'h0ac02; Q[ 1] = 18'h0a026; Q[ 2] = 18'h06825; Q[ 3] = 18'h08f06;
        Q[ 4] = 18'h03783; Q[ 5] = 18'h04cb5; Q[ 6] = 18'h0e8ea; Q[ 7] = 18'h083d2;
        Q[ 8] = 18'h0fec9; Q[ 9] = 18'h066d9; Q[10] = 18'h0edad; Q[11] = 18'h06c12;
        Q[12] = 18'h0a5fb; Q[13] = 18'h07295; Q[14] = 18'h06a0c; Q[15] = 18'h081a5;
        Q[16] = 18'h03493; Q[17] = 18'h0a393; Q[18] = 18'h03da6; Q[19] = 18'h0beb1;
        Q[20] = 18'h0d138; Q[21] = 18'h02815; Q[22] = 18'h0f191; Q[23] = 18'h03617;
        Q[24] = 18'h08d4f; Q[25] = 18'h0f641; Q[26] = 18'h00e82; Q[27] = 18'h01774;
        Q[28] = 18'h0bf39; Q[29] = 18'h0929d; Q[30] = 18'h05273; Q[31] = 18'h0c30a;
        Q[32] = 18'h0eef3;
        // N: 32 words
        N[ 0] = 18'h03ad9; N[ 1] = 18'h046b4; N[ 2] = 18'h0e181; N[ 3] = 18'h0fac7;
        N[ 4] = 18'h0be72; N[ 5] = 18'h029ab; N[ 6] = 18'h07e51; N[ 7] = 18'h037a8;
        N[ 8] = 18'h0880c; N[ 9] = 18'h05a7d; N[10] = 18'h043c2; N[11] = 18'h038c9;
        N[12] = 18'h01275; N[13] = 18'h0aa0d; N[14] = 18'h0c0c1; N[15] = 18'h0d035;
        N[16] = 18'h04082; N[17] = 18'h0543c; N[18] = 18'h0dcb0; N[19] = 18'h0497c;
        N[20] = 18'h0b12c; N[21] = 18'h013d4; N[22] = 18'h0b80a; N[23] = 18'h051cf;
        N[24] = 18'h0286c; N[25] = 18'h0b600; N[26] = 18'h0d838; N[27] = 18'h0af4b;
        N[28] = 18'h08274; N[29] = 18'h06a07; N[30] = 18'h0beea; N[31] = 18'h0f000;
        // M: 65 words
        M[ 0] = 18'h041b2; M[ 1] = 18'h00128; M[ 2] = 18'h06b69; M[ 3] = 18'h08e7e;
        M[ 4] = 18'h0118c; M[ 5] = 18'h0b96d; M[ 6] = 18'h0ebe5; M[ 7] = 18'h0f873;
        M[ 8] = 18'h0cf14; M[ 9] = 18'h0de83; M[10] = 18'h09690; M[11] = 18'h05e9a;
        M[12] = 18'h048ac; M[13] = 18'h0b506; M[14] = 18'h01283; M[15] = 18'h08631;
        M[16] = 18'h0179c; M[17] = 18'h06820; M[18] = 18'h0867b; M[19] = 18'h0b750;
        M[20] = 18'h0e680; M[21] = 18'h0df95; M[22] = 18'h0d818; M[23] = 18'h0b4c5;
        M[24] = 18'h0cced; M[25] = 18'h0c4a9; M[26] = 18'h0bb78; M[27] = 18'h04295;
        M[28] = 18'h0b1b4; M[29] = 18'h09635; M[30] = 18'h0066b; M[31] = 18'h022b1;
        M[32] = 18'h04fdb; M[33] = 18'h0efc8; M[34] = 18'h00a14; M[35] = 18'h04bef;
        M[36] = 18'h006a1; M[37] = 18'h0f1a6; M[38] = 18'h0fc40; M[39] = 18'h0adb5;
        M[40] = 18'h06e8f; M[41] = 18'h02c60; M[42] = 18'h083e1; M[43] = 18'h0f862;
        M[44] = 18'h0da61; M[45] = 18'h0dd3d; M[46] = 18'h03381; M[47] = 18'h09db0;
        M[48] = 18'h05454; M[49] = 18'h07525; M[50] = 18'h0d9c7; M[51] = 18'h0a361;
        M[52] = 18'h049e0; M[53] = 18'h0a671; M[54] = 18'h0242e; M[55] = 18'h07cb2;
        M[56] = 18'h02021; M[57] = 18'h0bde1; M[58] = 18'h025aa; M[59] = 18'h0c615;
        M[60] = 18'h05645; M[61] = 18'h03b46; M[62] = 18'h065d6; M[63] = 18'h0390d;
        M[64] = 18'h0e005;
        // end of vector data
    end
    

    //
    // BRAMs
    //
    // Fat BRAM write bus driven directly by the stimulus block
    // (enable, 3-bit bank, 8-bit address, separate X and Y data words).
    reg        tb_fat_bram_xy_ena = 1'b0;
    reg [ 2:0] tb_fat_bram_xy_bank;
    reg [ 7:0] tb_fat_bram_xy_addr;
    reg [17:0] tb_fat_bram_x_din;
    reg [17:0] tb_fat_bram_y_din;

    // Fat BRAM write bus that is actually wired to port A of every fat BRAM
    // instance (ena doubles as the write enable).
    // NOTE(review): nothing in this part of the file drives the mgr_* bus from
    // the tb_* bus -- confirm where that connection is made.
    reg        mgr_fat_bram_xy_ena = 1'b0;
    reg [ 2:0] mgr_fat_bram_xy_bank;
    reg [ 7:0] mgr_fat_bram_xy_addr;
    reg [17:0] mgr_fat_bram_x_din;
    reg [17:0] mgr_fat_bram_y_din;
    
    // Fat BRAM read side (port B).  Each generated instance has its own
    // address and data-out word; the bank is shared, except that instances
    // with index >= NUM_MULTS/2 use mac_fat_bram_xy_bank_aux instead.
    // The [0:4] ranges assume (NUM_MULTS/2)+1 == 5 -- TODO confirm.
    reg         mac_fat_bram_xy_ena = 1'b0;
    reg         mac_fat_bram_xy_reg_ena = 1'b0;     // connected to the regceb pins
    reg  [ 2:0] mac_fat_bram_xy_bank;
    reg  [ 2:0] mac_fat_bram_xy_bank_aux;
    reg  [ 7:0] mac_fat_bram_xy_addr[0:4];
    wire [17:0] mac_fat_bram_x_dout[0:4];
    wire [17:0] mac_fat_bram_y_dout[0:4];
    
    // Slim BRAM write bus driven directly by the stimulus block
    // (enable, 2-bit bank, 8-bit address, separate X and Y data words).
    reg        tb_slim_bram_xy_ena = 1'b0;
    reg [ 1:0] tb_slim_bram_xy_bank;
    reg [ 7:0] tb_slim_bram_xy_addr;
    reg [17:0] tb_slim_bram_x_din;
    reg [17:0] tb_slim_bram_y_din;
    
    // Slim BRAM write bus wired to port A of slim_bram_x / slim_bram_y.
    reg        mgr_slim_bram_xy_ena = 1'b0;
    reg [ 1:0] mgr_slim_bram_xy_bank;
    reg [ 7:0] mgr_slim_bram_xy_addr;
    reg [17:0] mgr_slim_bram_x_din;
    reg [17:0] mgr_slim_bram_y_din;

    // Slim BRAM read side (port B): a single shared bank/address and one
    // data-out word per memory.
    reg         mac_slim_bram_xy_ena = 1'b0;
    reg         mac_slim_bram_xy_reg_ena = 1'b0;    // connected to the regceb pins
    reg  [ 1:0] mac_slim_bram_xy_bank;
    reg  [ 7:0] mac_slim_bram_xy_addr;
    reg  [ 7:0] mac_slim_bram_xy_addr_dly;          // mac_slim_bram_xy_addr delayed by one clock
    wire [17:0] mac_slim_bram_x_dout;
    wire [17:0] mac_slim_bram_y_dout;
    
    // One-clock delayed copies of the slim BRAM read address and of
    // mac_slim_bram_xy_reg_ena.
    reg mac_slim_bram_xy_reg_ena_dly = 1'b0;

    always @(posedge clk) begin
        mac_slim_bram_xy_addr_dly    <= mac_slim_bram_xy_addr;
        mac_slim_bram_xy_reg_ena_dly <= mac_slim_bram_xy_reg_ena;
    end
    
    
    
    // Fat operand memories: (NUM_MULTS/2)+1 pairs of 36K BRAMs, one X and one
    // Y memory per generate index.  All instances share a single write port
    // (port A, the mgr_* bus) and are read through port B at a per-instance
    // address.  Instances with index below NUM_MULTS/2 read from
    // mac_fat_bram_xy_bank, the remaining one from mac_fat_bram_xy_bank_aux.
    genvar z;
    generate
        for (z = 0; z < ((NUM_MULTS / 2) + 1); z = z + 1) begin : gen_fat_bram

            // read bank for this instance; the selector is a generate-time constant
            wire [2:0] rd_bank = (z < (NUM_MULTS / 2)) ? mac_fat_bram_xy_bank
                                                       : mac_fat_bram_xy_bank_aux;

            ip_bram_36k fat_bram_x
            (
                // port A: write
                .clka   (clk),
                .ena    (mgr_fat_bram_xy_ena),
                .wea    (mgr_fat_bram_xy_ena),
                .addra  ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
                .dina   (mgr_fat_bram_x_din),

                // port B: read
                .clkb   (clk),
                .enb    (mac_fat_bram_xy_ena),
                .regceb (mac_fat_bram_xy_reg_ena),
                .addrb  ({rd_bank, mac_fat_bram_xy_addr[z]}),
                .doutb  (mac_fat_bram_x_dout[z])
            );

            ip_bram_36k fat_bram_y
            (
                // port A: write
                .clka   (clk),
                .ena    (mgr_fat_bram_xy_ena),
                .wea    (mgr_fat_bram_xy_ena),
                .addra  ({mgr_fat_bram_xy_bank, mgr_fat_bram_xy_addr}),
                .dina   (mgr_fat_bram_y_din),

                // port B: read
                .clkb   (clk),
                .enb    (mac_fat_bram_xy_ena),
                .regceb (mac_fat_bram_xy_reg_ena),
                .addrb  ({rd_bank, mac_fat_bram_xy_addr[z]}),
                .doutb  (mac_fat_bram_y_dout[z])
            );

        end
    endgenerate

    // Slim operand memories: one 18K BRAM for X and one for Y.  Both are
    // written through port A from the mgr_* bus and read through port B at the
    // shared {bank, address} supplied by the mac_* signals.
    ip_bram_18k slim_bram_x
    (
        // port B: read
        .clkb   (clk),
        .enb    (mac_slim_bram_xy_ena),
        .regceb (mac_slim_bram_xy_reg_ena),
        .addrb  ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
        .doutb  (mac_slim_bram_x_dout),

        // port A: write (enable doubles as write enable)
        .clka   (clk),
        .ena    (mgr_slim_bram_xy_ena),
        .wea    (mgr_slim_bram_xy_ena),
        .addra  ({mgr_slim_bram_xy_bank, mgr_slim_bram_xy_addr}),
        .dina   (mgr_slim_bram_x_din)
    );

    ip_bram_18k slim_bram_y
    (
        // port B: read
        .clkb   (clk),
        .enb    (mac_slim_bram_xy_ena),
        .regceb (mac_slim_bram_xy_reg_ena),
        .addrb  ({mac_slim_bram_xy_bank, mac_slim_bram_xy_addr}),
        .doutb  (mac_slim_bram_y_dout),

        // port A: write (enable doubles as write enable)
        .clka   (clk),
        .ena    (mgr_slim_bram_xy_ena),
        .wea    (mgr_slim_bram_xy_ena),
        .addra  ({mgr_slim_bram_xy_bank, mgr_slim_bram_xy_addr}),
        .dina   (mgr_slim_bram_y_din)
    );
    
    
    
    //
    // Enable, Ready
    //
    // Start strobe: pulsed high for one wait_clock_tick by the stimulus below.
    reg ena = 1'b0;

    // Main stimulus sequence: release reset, stream the operand vectors onto
    // the tb_* BRAM write buses, pulse ena, wait, then run the verify_* tasks.
    // wait_clock_tick and the verify_* tasks are defined elsewhere in the file
    // (wait_clock_tick presumably blocks for one clock period -- TODO confirm).
    // NOTE(review): the BRAM instances take their write port from the mgr_*
    // signals, not from the tb_* signals driven here; confirm the tb_* -> mgr_*
    // path in the rest of the file.
    integer i;
    initial begin

        // keep reset asserted for ten ticks
        for (i=0; i<10; i=i+1)
            wait_clock_tick;
        
        rst = 1'b0;

        // let the design idle for ten more ticks after reset is released
        for (i=0; i<10; i=i+1)
            wait_clock_tick;
        
        tb_fat_bram_xy_ena = 1'b1;
        tb_slim_bram_xy_ena = 1'b1;

        // T1 -> X side, T2 -> Y side, into the T1T2 bank of both the fat and
        // the slim write bus, one word per tick
        for (i=0; i<32; i=i+1) begin
            tb_fat_bram_xy_bank = BANK_FAT_T1T2;
            tb_fat_bram_xy_addr = i[7:0];
            tb_fat_bram_x_din = T1[i];
            tb_fat_bram_y_din = T2[i];
            
            tb_slim_bram_xy_bank = BANK_SLIM_T1T2;
            tb_slim_bram_xy_addr = i[7:0];
            tb_slim_bram_x_din = T1[i];
            tb_slim_bram_y_din = T2[i];
            
            wait_clock_tick;
        end

        // N_COEFF words 0..31 -> both sides of the slim N_COEFF bank
        // (the fat bus keeps its last T1T2 values with the enable still high)
        for (i=0; i<32; i=i+1) begin
            tb_slim_bram_xy_bank = BANK_SLIM_N_COEFF;
            tb_slim_bram_xy_addr = i[7:0];
            tb_slim_bram_x_din = N_COEFF[i];
            tb_slim_bram_y_din = N_COEFF[i];
            
            wait_clock_tick;
        end
        // the 33rd N_COEFF word (index 32) is stored separately, at address 0
        // of the slim EXT bank
        for (i=32; i<33; i=i+1) begin
            tb_slim_bram_xy_bank = BANK_SLIM_EXT;
            tb_slim_bram_xy_addr = 0;   // fixed address, not i
            tb_slim_bram_x_din = N_COEFF[i];
            tb_slim_bram_y_din = N_COEFF[i];
            
            wait_clock_tick;
        end

        // N -> both sides of the fat N bank
        for (i=0; i<32; i=i+1) begin
            tb_fat_bram_xy_bank = BANK_FAT_N;
            tb_fat_bram_xy_addr = i[7:0];
            tb_fat_bram_x_din = N[i];
            tb_fat_bram_y_din = N[i];
            
            wait_clock_tick;
        end

        // loading finished: drop both write enables ...
        tb_fat_bram_xy_ena = 1'b0;        
        tb_slim_bram_xy_ena = 1'b0;
        
        // ... and drive X on the idle buses so any stray use shows up in simulation
        tb_fat_bram_xy_bank = {3{1'bX}};
        tb_fat_bram_xy_addr = {8{1'bX}};
        tb_fat_bram_x_din = {18{1'bX}};
        tb_fat_bram_y_din = {18{1'bX}};

        tb_slim_bram_xy_bank = {2{1'bX}};
        tb_slim_bram_xy_addr = {8{1'bX}};
        tb_slim_bram_x_din = {18{1'bX}};
        tb_slim_bram_y_din = {18{1'bX}};

        for (i=0; i<10; i=i+1)
            wait_clock_tick;
            
        // one-tick start pulse
        ena = 1'b1;
        wait_clock_tick;
        ena = 1'b0;
    
        // fixed 10000-tick wait before checking; no completion signal is polled here
        for (i=0; i<10000; i=i+1)
            wait_clock_tick;
            
        verify_ab;
        verify_q;
        verify_m;

    end

    
    //
    // DSPs
    //
    // Control and data signals of the X DSP array (instance dsp_x).
    reg             dsp_x_ce_a;         // -> .ce_a
    reg             dsp_x_ce_b;         // -> .ce_b
    reg             dsp_x_ce_b_dly;     // dsp_x_ce_b delayed by one clock
    reg             dsp_x_ce_m;         // -> .ce_m
    reg             dsp_x_ce_p;         // -> .ce_p
    reg             dsp_x_ce_mode;      // -> .ce_mode
    
    reg  [9   -1:0] dsp_x_mode_z = {9{1'b1}};   // -> .mode_z, starts all-ones
    
    wire [5*18-1:0] dsp_x_a;    // 18-bit fat BRAM X read words packed side by side
    reg  [1*17-1:0] dsp_x_b;    // single 17-bit B operand
    wire [9*47-1:0] dsp_x_p;    // 423-bit .p output bus (declared as 9*47 bits)

    // Control and data signals of the Y DSP array (instance dsp_y); same
    // layout as the X set above.
    reg             dsp_y_ce_a;         // -> .ce_a
    reg             dsp_y_ce_b;         // -> .ce_b
    reg             dsp_y_ce_b_dly;     // dsp_y_ce_b delayed by one clock
    reg             dsp_y_ce_m;         // -> .ce_m
    reg             dsp_y_ce_p;         // -> .ce_p
    reg             dsp_y_ce_mode;      // -> .ce_mode
    
    reg  [9   -1:0] dsp_y_mode_z = {9{1'b1}};   // -> .mode_z, starts all-ones
        
    wire [5*18-1:0] dsp_y_a;    // 18-bit fat BRAM Y read words packed side by side
    reg  [1*17-1:0] dsp_y_b;    // single 17-bit B operand
    wire [9*47-1:0] dsp_y_p;    // 423-bit .p output bus (declared as 9*47 bits)
        
    // Pack the fat BRAM read words into the flat A-operand buses of the DSP
    // arrays: the word of instance z occupies bits [18*z+17 : 18*z].
    generate
        for (z = 0; z < ((NUM_MULTS / 2) + 1); z = z + 1) begin : gen_dsp_xy_a_split
            assign dsp_x_a[18*z + 17 : 18*z] = mac_fat_bram_x_dout[z];
            assign dsp_y_a[18*z + 17 : 18*z] = mac_fat_bram_y_dout[z];
        end
    endgenerate
    
    // One-clock delayed copies of the B-operand clock enables.
    always @(posedge clk) begin
        dsp_x_ce_b_dly <= dsp_x_ce_b;
        dsp_y_ce_b_dly <= dsp_y_ce_b;
    end
    

    // Four 9-bit registers, all initialised to all-ones like dsp_x_mode_z and
    // dsp_y_mode_z.  Presumably mode_z values computed 1..4 cycles in advance
    // -- TODO confirm against the logic that drives them.
    reg  [9   -1:0] dsp_xy_mode_z_adv1 = {9{1'b1}};
    reg  [9   -1:0] dsp_xy_mode_z_adv2 = {9{1'b1}};
    reg  [9   -1:0] dsp_xy_mode_z_adv3 = {9{1'b1}};
    reg  [9   -1:0] dsp_xy_mode_z_adv4 = {9{1'b1}};
    
    // DSP array fed from the X memories.
    dsp_array dsp_x
    (
        .clk            (clk),

        // operands and result
        .a              (dsp_x_a),
        .b              (dsp_x_b),
        .p              (dsp_x_p),

        // mode word
        .mode_z         (dsp_x_mode_z),

        // clock enables
        .ce_a           (dsp_x_ce_a),
        .ce_b           (dsp_x_ce_b),
        .ce_m           (dsp_x_ce_m),
        .ce_p           (dsp_x_ce_p),
        .ce_mode        (dsp_x_ce_mode)
    );

    // DSP array fed from the Y memories; wired exactly like dsp_x.
    dsp_array dsp_y
    (
        .clk            (clk),

        // operands and result
        .a              (dsp_y_a),
        .b              (dsp_y_b),
        .p              (dsp_y_p),

        // mode word
        .mode_z         (dsp_y_mode_z),

        // clock enables
        .ce_a           (dsp_y_ce_a),
        .ce_b           (dsp_y_ce_b),
        .ce_m           (dsp_y_ce_m),
        .ce_p           (dsp_y_ce_p),
        .ce_mode        (dsp_y_ce_mode)
    );


    //
    // FSM State and Next States
    //
    // Current state (starts in IDLE) and the value it takes on the next clock
    // edge whenever reset is not asserted.
    reg [FSM_STATE_WIDTH-1:0] fsm_state_next;
    reg [FSM_STATE_WIDTH-1:0] fsm_state = FSM_STATE_IDLE;

    // State register with synchronous reset to IDLE.
    always @(posedge clk) begin
        if (rst) begin
            fsm_state <= FSM_STATE_IDLE;
        end else begin
            fsm_state <= fsm_state_next;
        end
    end


    // Word-index constants compared against the slim BRAM read address:
    // index_last is 31, index_last_minus1 is 30.
    localparam [7:0] index_last = 8'd31;
    localparam [7:0] index_last_minus1 = index_last - 1'b1;


    //
    // Column
    //
    reg  [4:0] col_index;       // current column index
    reg  [4:0] col_index_prev;  // delayed column index value
    reg  [4:0] col_index_last;  // index of the very last column
    reg  [4:0] col_index_next1;  // precomputed next column index
    //reg  [4:0] col_index_next2;  // precomputed next column index after next column index
    reg        col_is_last;     // flag set during the very last column

    // col_index_prev is col_index delayed by one clock.
    always @(posedge clk)
        //
        col_index_prev <= col_index;
    

    // Address-progress flags, one set per multiplication shape.  Each *_comb
    // wire decodes the current slim BRAM read address; the *_flop registers
    // hold successively delayed copies of it.

    // square: read address has reached index_last - 1
    wire mult_square_addr_almost_done_comb =
        (mac_slim_bram_xy_addr == index_last_minus1);
    reg  mult_square_addr_almost_done_flop;
    reg  mult_square_addr_surely_done_flop;

    // triangle: low three address bits match those of index_last - 1 and the
    // upper five address bits equal the current column index
    wire mult_triangle_addr_almost_done_comb =
        (mac_slim_bram_xy_addr[2:0] == index_last_minus1[2:0]) &&
        (mac_slim_bram_xy_addr[7:3] == col_index);
    reg  mult_triangle_addr_almost_done_flop;
    reg  mult_triangle_addr_surely_done_flop;
    reg  mult_triangle_addr_tardy_done_flop;

    // rectangle: same decode as square
    wire mult_rectangle_addr_almost_done_comb =
        (mac_slim_bram_xy_addr == index_last_minus1);
    reg  mult_rectangle_addr_almost_done_flop;
    reg  mult_rectangle_addr_surely_done_flop;
    reg  mult_rectangle_addr_tardy_done_flop;

            
    
    
    // Square multiplication: the "almost done" decode is registered only while
    // the FSM is in one of the two square BUSY states and is forced low
    // otherwise; "surely done" is the same flag one clock later.
    always @(posedge clk) begin
        if ((fsm_state == FSM_STATE_MULT_SQUARE_COL_0_BUSY) ||
            (fsm_state == FSM_STATE_MULT_SQUARE_COL_N_BUSY))
            mult_square_addr_almost_done_flop <= mult_square_addr_almost_done_comb;
        else
            mult_square_addr_almost_done_flop <= 1'b0;

        mult_square_addr_surely_done_flop <= mult_square_addr_almost_done_flop;
    end

    // Triangle multiplication: the "almost done" decode is registered only
    // while the FSM is in one of the two triangle BUSY states and is forced
    // low otherwise; "surely done" and "tardy done" follow one and two clocks
    // later.
    always @(posedge clk) begin
        if ((fsm_state == FSM_STATE_MULT_TRIANGLE_COL_0_BUSY) ||
            (fsm_state == FSM_STATE_MULT_TRIANGLE_COL_N_BUSY))
            mult_triangle_addr_almost_done_flop <= mult_triangle_addr_almost_done_comb;
        else
            mult_triangle_addr_almost_done_flop <= 1'b0;

        mult_triangle_addr_surely_done_flop <= mult_triangle_addr_almost_done_flop;
        mult_triangle_addr_tardy_done_flop  <= mult_triangle_addr_surely_done_flop;
    end
        
        
     always @(posedge clk)
        // The rectangle comparator is sampled only in the two rectangle BUSY states;
        // in every other state the flag is forced low.
        if ((fsm_state == FSM_STATE_MULT_RECTANGLE_COL_0_BUSY) ||
            (fsm_state == FSM_STATE_MULT_RECTANGLE_COL_N_BUSY))
            mult_rectangle_addr_almost_done_flop <= mult_rectangle_addr_almost_done_comb;
        else
            mult_rectangle_addr_almost_done_flop <= 1'b0;

    // Delay chain behind the "almost done" flag: almost -> surely -> tardy,
    // each stage one clock later than the previous one.
    always @(posedge clk) begin
        mult_rectangle_addr_tardy_done_flop  <= mult_rectangle_addr_surely_done_flop;
        mult_rectangle_addr_surely_done_flop <= mult_rectangle_addr_almost_done_flop;
    end


    //
    // FSM Transition Logic
    //
    // Next-state candidates, one per multiplication phase (square, triangle,
    // rectangle). Each is FSM_STATE_WIDTH bits wide; the drivers are continuous
    // assignments located elsewhere in this module.
    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square;
    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_triangle;
    wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_rectangle;
    
    
    always @(posedge clk)
        // Slim BRAM read address, selected by the state the FSM is about to enter.
        case (fsm_state_next)

            // Every column of every phase starts reading at address zero.
            FSM_STATE_MULT_SQUARE_COL_0_INIT,    FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,  FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT, FSM_STATE_MULT_RECTANGLE_COL_N_INIT:
                mac_slim_bram_xy_addr <= 8'd0;

            // Square: count upwards; return to zero once "almost done" is flagged.
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                mac_slim_bram_xy_addr <= mult_square_addr_almost_done_flop
                                       ? 8'd0
                                       : mac_slim_bram_xy_addr + 1'b1;

            // Triangle: return to zero on "almost done", and also on "surely done"
            // when the current column is the last one; otherwise count upwards.
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                mac_slim_bram_xy_addr <= (mult_triangle_addr_almost_done_flop ||
                                          (col_is_last && mult_triangle_addr_surely_done_flop))
                                       ? 8'd0
                                       : mac_slim_bram_xy_addr + 1'b1;

            // Rectangle: address one is driven while either done flag is set;
            // otherwise count upwards.
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                mac_slim_bram_xy_addr <= (mult_rectangle_addr_almost_done_flop ||
                                          mult_rectangle_addr_surely_done_flop)
                                       ? 8'd1
                                       : mac_slim_bram_xy_addr + 1'b1;

            // Any other state: the address is a don't-care.
            default:
                mac_slim_bram_xy_addr <= 8'dX;

        endcase


    // Constant table of 3-bit offsets used as the low address bits when the fat
    // BRAM address registers are initialised.
    wire [2:0] fat_bram_offset_rom[0:3];
    
    // z steps through the odd values 1, 3, 5, ... below NUM_MULTS; entry (z-1)/2
    // of the table is tied to the low three bits of z.
    // NOTE(review): the table is declared with four entries, so this presumably
    // expects NUM_MULTS == 8 -- confirm against the parameter definition.
    generate for (z=1; z<NUM_MULTS; z=z+2)
        begin : gen_fat_bram_offset
            assign fat_bram_offset_rom[(z-1)/2] = z[2:0];
        end
    endgenerate    
        
    integer j;
    always @(posedge clk) begin

        // Address registers 0 .. NUM_MULTS/2-1. All three phases (square, triangle,
        // rectangle) treat these registers the same way, so their states share items.
        // (Possible rework noted by the original author: eight address registers and
        // shifts instead of subtractions.)
        for (j = 0; j < (NUM_MULTS / 2); j = j + 1)
            case (fsm_state_next)

                // First column: upper five bits zero, low bits from the offset table.
                FSM_STATE_MULT_SQUARE_COL_0_INIT,
                FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
                FSM_STATE_MULT_RECTANGLE_COL_0_INIT:
                    mac_fat_bram_xy_addr[j] <= {5'd0, fat_bram_offset_rom[j]};

                // Later columns: upper five bits taken from col_index_next1.
                FSM_STATE_MULT_SQUARE_COL_N_INIT,
                FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
                FSM_STATE_MULT_RECTANGLE_COL_N_INIT:
                    mac_fat_bram_xy_addr[j] <= {col_index_next1, fat_bram_offset_rom[j]};

                // While triggering / running: step to the next address.
                FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
                FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY,
                FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
                FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY,
                FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
                FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                    mac_fat_bram_xy_addr[j] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[j], index_last);

                default:
                    mac_fat_bram_xy_addr[j] <= 8'dX;

            endcase

        // Address register 4 is handled outside the loop: every INIT state loads
        // the same constant, regardless of the column.
        case (fsm_state_next)

            FSM_STATE_MULT_SQUARE_COL_0_INIT,    FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,  FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT, FSM_STATE_MULT_RECTANGLE_COL_N_INIT:
                mac_fat_bram_xy_addr[4] <= {5'd0, 3'd1};

            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY,
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY,
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                mac_fat_bram_xy_addr[4] <= mac_fat_bram_xy_addr_next(mac_fat_bram_xy_addr[4], index_last);

            default:
                mac_fat_bram_xy_addr[4] <= 8'dX;

        endcase

    end

    always @(posedge clk)
        // Bank select for the slim BRAM read port, keyed on the upcoming FSM state.
        case (fsm_state_next)

            // Square phase: always the T1T2 bank.
            FSM_STATE_MULT_SQUARE_COL_0_INIT,    FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                mac_slim_bram_xy_bank <= BANK_SLIM_T1T2;

            // Triangle phase: the N_COEFF bank, except on the last column once
            // either done flag is set, when the EXT bank is selected instead.
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,  FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                mac_slim_bram_xy_bank <= (col_is_last && (mult_triangle_addr_almost_done_flop ||
                                                          mult_triangle_addr_surely_done_flop))
                                       ? BANK_SLIM_EXT
                                       : BANK_SLIM_N_COEFF;

            // Rectangle phase: the Q bank, switching to the EXT bank while either
            // done flag is set (on every column).
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT, FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                mac_slim_bram_xy_bank <= (mult_rectangle_addr_almost_done_flop ||
                                          mult_rectangle_addr_surely_done_flop)
                                       ? BANK_SLIM_EXT
                                       : BANK_SLIM_Q;

            // Outside the multiplication phases the bank is a don't-care.
            default:
                mac_slim_bram_xy_bank <= 2'bXX;

        endcase

    always @(posedge clk)
        // Bank selects for the fat BRAM read ports (auxiliary and regular),
        // keyed on the upcoming FSM state.
        case (fsm_state_next)

            // Square phase: both selects point at the T1T2 bank.
            FSM_STATE_MULT_SQUARE_COL_0_INIT,    FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{BANK_FAT_T1T2}};

            // Triangle phase, INIT and TRIG: auxiliary select gets ABH, regular gets ABL.
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,  FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:
                {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {BANK_FAT_ABH, BANK_FAT_ABL};

            // Triangle phase, BUSY: both selects point at ABL.
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{BANK_FAT_ABL}};

            // Rectangle phase: both selects point at the N bank.
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT, FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{BANK_FAT_N}};

            // Any other state: both selects are don't-care.
            default:
                {mac_fat_bram_xy_bank_aux, mac_fat_bram_xy_bank} <= {2{3'bXXX}};

        endcase



    always @(posedge clk)
        // Read enable for the slim BRAM port, keyed on the upcoming FSM state.
        case (fsm_state_next)

            // INIT and TRIG states of every phase: enabled unconditionally.
            FSM_STATE_MULT_SQUARE_COL_0_INIT,    FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,  FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT, FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:
                mac_slim_bram_xy_ena <= 1'b1;

            // Square BUSY: enabled until "almost done" is flagged.
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                mac_slim_bram_xy_ena <= ~mult_square_addr_almost_done_flop;

            // Triangle BUSY: the last column stays enabled one stage longer
            // (until "surely done"); other columns stop at "almost done".
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                mac_slim_bram_xy_ena <= col_is_last ? ~mult_triangle_addr_surely_done_flop
                                                    : ~mult_triangle_addr_almost_done_flop;

            // Rectangle BUSY: enabled until "surely done" is flagged.
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                mac_slim_bram_xy_ena <= ~mult_rectangle_addr_surely_done_flop;

            // Everywhere else the port is disabled.
            default:
                mac_slim_bram_xy_ena <= 1'b0;

        endcase

    always @(posedge clk)
        // Read enable for the fat BRAM ports: high in every INIT/TRIG/BUSY state of
        // the three multiplication phases, low in all other states.
        case (fsm_state_next)

            FSM_STATE_MULT_SQUARE_COL_0_INIT,    FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY,
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,  FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY,
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT, FSM_STATE_MULT_RECTANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                mac_fat_bram_xy_ena <= 1'b1;

            default:
                mac_fat_bram_xy_ena <= 1'b0;

        endcase


    // Register-enable strobes: each is its port's read enable delayed by one clock.
    always @(posedge clk) begin
        mac_fat_bram_xy_reg_ena  <= mac_fat_bram_xy_ena;
        mac_slim_bram_xy_reg_ena <= mac_slim_bram_xy_ena;
    end
          
    // ladder_mode selects the operand pairing:
    //   0 -> X: T1*T2, Y: T2*T2
    //   1 -> X: T1*T2, Y: T2*T1
    reg ladder_mode = 1'b0;

    // Operand-swap flag: set when the first square column is triggered, cleared
    // when the first triangle column is triggered, unchanged in any other state.
    reg dsp_swap_xy;

    always @(posedge clk)
        if      (fsm_state == FSM_STATE_MULT_SQUARE_COL_0_TRIG)   dsp_swap_xy <= 1'b1;
        else if (fsm_state == FSM_STATE_MULT_TRIANGLE_COL_0_TRIG) dsp_swap_xy <= 1'b0;
  
    always @(posedge clk)
        // B operands of the X and Y DSP chains, taken from the low 17 bits of the
        // slim BRAM outputs while the delayed register enable is high.
        if (mac_slim_bram_xy_reg_ena_dly) begin
            if (!dsp_swap_xy)
                // No swap requested: Y gets Y data, X gets X data.
                {dsp_y_b, dsp_x_b} <= {mac_slim_bram_y_dout[16:0], mac_slim_bram_x_dout[16:0]};
            else if (!ladder_mode)
                // Swap requested and ladder mode off: the two words are exchanged.
                {dsp_y_b, dsp_x_b} <= {mac_slim_bram_x_dout[16:0], mac_slim_bram_y_dout[16:0]};
            else
                // Swap requested but ladder mode on: operands are passed straight through.
                {dsp_y_b, dsp_x_b} <= {mac_slim_bram_y_dout[16:0], mac_slim_bram_x_dout[16:0]};
        end else
            // Nothing valid to load: drive don't-care.
            {dsp_y_b, dsp_x_b} <= {2{{17{1'bX}}}};


    // mac_fat_bram_xy_addr_next
    //   Computes the successor of a fat BRAM address: the address counts down by
    //   one, and wraps to mac_fat_bram_xy_addr_last after reaching zero.
    //     mac_fat_bram_xy_addr_current - address currently held
    //     mac_fat_bram_xy_addr_last    - value to reload on wrap-around
    function  [7:0] mac_fat_bram_xy_addr_next;
        input [7:0] mac_fat_bram_xy_addr_current;
        input [7:0] mac_fat_bram_xy_addr_last;
        begin
            // Start from the wrap-around value, then override it with the
            // decrement whenever the current address is still above zero.
            mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_last;
            if (mac_fat_bram_xy_addr_current > 8'd0)
                mac_fat_bram_xy_addr_next = mac_fat_bram_xy_addr_current - 1'b1;
        end
    endfunction
        

    
    // Clock-enable chain for the X and Y DSP blocks. The X and Y members of each
    // pair always receive the same source value.
    always @(posedge clk) begin
        // A stage: enabled when either the register enable or its delayed copy is high.
        {dsp_y_ce_a, dsp_x_ce_a} <= {2{mac_slim_bram_xy_reg_ena | mac_slim_bram_xy_reg_ena_dly}};

        // B stage: follows the delayed register enable only.
        {dsp_y_ce_b, dsp_x_ce_b} <= {2{mac_slim_bram_xy_reg_ena_dly}};

        // M stage and mode register enable: both load the delayed B enables.
        {dsp_y_ce_m,    dsp_x_ce_m}    <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};
        {dsp_y_ce_mode, dsp_x_ce_mode} <= {dsp_y_ce_b_dly, dsp_x_ce_b_dly};

        // P stage: one clock behind the M stage.
        {dsp_y_ce_p, dsp_x_ce_p} <= {dsp_y_ce_m, dsp_x_ce_m};
    end

    // wait_clock_tick
    //   Suspends the caller for `CLK_PERIOD_NS time units.
    //   NOTE(review): presumably one full clock period -- confirm against the
    //   macro definition and the timescale in effect.
    task wait_clock_tick;
        begin
            #`CLK_PERIOD_NS;
        end
    endtask
    
    //
    // Column Index Increment Logic
    //
    always @(posedge clk)
        case (fsm_state_next)

            // First column of any phase: restart the counters and latch the index
            // of the final column from the upper five bits of index_last.
            FSM_STATE_MULT_SQUARE_COL_0_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT: begin
                col_is_last     <= 1'b0;
                col_index       <= 5'd0;
                col_index_next1 <= 5'd1;
                col_index_last  <= index_last[7:3];
            end

            // Any later column: step to the pre-computed next index, flag the last
            // column, and advance the look-ahead index (wrapping to zero after the
            // last column).
            FSM_STATE_MULT_SQUARE_COL_N_INIT,
            FSM_STATE_MULT_TRIANGLE_COL_N_INIT,
            FSM_STATE_MULT_RECTANGLE_COL_N_INIT: begin
                col_index       <= col_index_next1;
                col_index_next1 <= (col_index_next1 != col_index_last) ? col_index_next1 + 5'd1
                                                                       : 5'd0;
                col_is_last     <= (col_index_next1 == col_index_last);
            end

            // All other states leave the counters untouched.
        endcase
    
    // State entered when a column's BUSY state of the given phase completes:
    // the phase's HOLDOFF state after the last column, otherwise the phase's
    // COL_N_INIT state to start another column.
    assign fsm_state_after_mult_square    = col_is_last ? FSM_STATE_MULT_SQUARE_HOLDOFF   : FSM_STATE_MULT_SQUARE_COL_N_INIT;
    assign fsm_state_after_mult_triangle  = col_is_last ? FSM_STATE_MULT_TRIANGLE_HOLDOFF : FSM_STATE_MULT_TRIANGLE_COL_N_INIT;
    assign fsm_state_after_mult_rectangle = col_is_last ? FSM_STATE_MULT_RECTANGLE_HOLDOFF : FSM_STATE_MULT_RECTANGLE_COL_N_INIT;
    
    always @(posedge clk)
        // Head of the mode_z delay line, keyed on the upcoming FSM state.
        case (fsm_state_next)

            // TRIG state of every phase: all nine bits low.
            FSM_STATE_MULT_SQUARE_COL_0_TRIG,    FSM_STATE_MULT_SQUARE_COL_N_TRIG,
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG,  FSM_STATE_MULT_TRIANGLE_COL_N_TRIG,
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG, FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:
                dsp_xy_mode_z_adv4 <= {9{1'b0}};

            // Square BUSY: word computed from the previous column index and the
            // delayed slim address.
            FSM_STATE_MULT_SQUARE_COL_0_BUSY,    FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                dsp_xy_mode_z_adv4 <= calc_mac_mode_z_square(col_index_prev, mac_slim_bram_xy_addr_dly);

            // Triangle BUSY: all nine bits high.
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY,  FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                dsp_xy_mode_z_adv4 <= {9{1'b1}};

            // Rectangle BUSY: computed like the square case, via the rectangle helper.
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY, FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                dsp_xy_mode_z_adv4 <= calc_mac_mode_z_rectangle(col_index_prev, mac_slim_bram_xy_addr_dly);

            // Any other state: all nine bits high.
            default:
                dsp_xy_mode_z_adv4 <= {9{1'b1}};

        endcase

    // mode_z delay line: adv4 -> adv3 -> adv2 -> adv1 -> {Y, X} mode_z outputs,
    // one register per clock. The X and Y outputs receive the same word.
    always @(posedge clk) begin
        dsp_xy_mode_z_adv3 <= dsp_xy_mode_z_adv4;
        dsp_xy_mode_z_adv2 <= dsp_xy_mode_z_adv3;
        dsp_xy_mode_z_adv1 <= dsp_xy_mode_z_adv2;
        //
        {dsp_y_mode_z, dsp_x_mode_z} <= {2{dsp_xy_mode_z_adv1}};
    end
    
    // calc_mac_mode_z_square
    //   Builds the (NUM_MULTS+1)-bit mode_z word used during the square phase.
    //     col_index_value             - column index to compare against
    //     mac_slim_bram_xy_addr_value - slim BRAM address being evaluated
    //   When the upper five address bits equal col_index_value, the result is all
    //   ones except for a single zero at the bit position selected by the low
    //   three address bits. Otherwise every bit of the result is one.
    //   The literal patterns below are nine bits wide.
    function  [NUM_MULTS:0] calc_mac_mode_z_square;
        input [        4:0] col_index_value;
        input [        7:0] mac_slim_bram_xy_addr_value;
        begin
            if (mac_slim_bram_xy_addr_value[7:3] == col_index_value)
                case (mac_slim_bram_xy_addr_value[2:0])
                    3'b000: calc_mac_mode_z_square = {1'b1, 8'b11111110};
                    3'b001: calc_mac_mode_z_square = {1'b1, 8'b11111101};
                    3'b010: calc_mac_mode_z_square = {1'b1, 8'b11111011};
                    3'b011: calc_mac_mode_z_square = {1'b1, 8'b11110111};
                    3'b100: calc_mac_mode_z_square = {1'b1, 8'b11101111};
                    3'b101: calc_mac_mode_z_square = {1'b1, 8'b11011111};
                    3'b110: calc_mac_mode_z_square = {1'b1, 8'b10111111};
                    3'b111: calc_mac_mode_z_square = {1'b1, 8'b01111111};
                    // Only reachable when the select bits contain X/Z. Without this
                    // item a non-automatic function would silently return whatever
                    // the previous call left in its result variable; propagate X
                    // instead, like the other don't-care branches in this file.
                    default: calc_mac_mode_z_square = {(NUM_MULTS+1){1'bX}};
                endcase
            else
                calc_mac_mode_z_square = {1'b1, {NUM_MULTS{1'b1}}};
        end
    endfunction
    
    // calc_mac_mode_z_rectangle
    //   Builds the (NUM_MULTS+1)-bit mode_z word used during the rectangle phase.
    //     col_index_value             - column index to compare against
    //     mac_slim_bram_xy_addr_value - slim BRAM address being evaluated
    //   When the upper five address bits equal col_index_value, the result is all
    //   ones except for a single zero at the bit position selected by the low
    //   three address bits. Otherwise every bit of the result is one.
    //   The literal patterns below are nine bits wide.
    function  [NUM_MULTS:0] calc_mac_mode_z_rectangle;
        input [        4:0] col_index_value;
        input [        7:0] mac_slim_bram_xy_addr_value;
        begin
            if (mac_slim_bram_xy_addr_value[7:3] == col_index_value)
                case (mac_slim_bram_xy_addr_value[2:0])
                    3'b000: calc_mac_mode_z_rectangle = {1'b1, 8'b11111110};
                    3'b001: calc_mac_mode_z_rectangle = {1'b1, 8'b11111101};
                    3'b010: calc_mac_mode_z_rectangle = {1'b1, 8'b11111011};
                    3'b011: calc_mac_mode_z_rectangle = {1'b1, 8'b11110111};
                    3'b100: calc_mac_mode_z_rectangle = {1'b1, 8'b11101111};
                    3'b101: calc_mac_mode_z_rectangle = {1'b1, 8'b11011111};
                    3'b110: calc_mac_mode_z_rectangle = {1'b1, 8'b10111111};
                    3'b111: calc_mac_mode_z_rectangle = {1'b1, 8'b01111111};
                    // Only reachable when the select bits contain X/Z. Without this
                    // item a non-automatic function would silently return whatever
                    // the previous call left in its result variable; propagate X
                    // instead, like the other don't-care branches in this file.
                    default: calc_mac_mode_z_rectangle = {(NUM_MULTS+1){1'bX}};
                endcase
            else
                calc_mac_mode_z_rectangle = {1'b1, {NUM_MULTS{1'b1}}};
        end
    endfunction

    // Enables handed to the recombinator (ena_x / ena_y ports). Each is registered
    // and goes high when the channel's A-stage clock enable is set while its B, M
    // and P stage enables are all clear.
    reg recomb_x_ena = 1'b0;
    reg recomb_y_ena = 1'b0;

    always @(posedge clk) begin
        recomb_y_ena <= dsp_y_ce_a && !(dsp_y_ce_b || dsp_y_ce_m || dsp_y_ce_p);
        recomb_x_ena <= dsp_x_ce_a && !(dsp_x_ce_b || dsp_x_ce_m || dsp_x_ce_p);
    end
    
    // Nets driven by the recombinator instance below: a write-back channel towards
    // the fat BRAM (bank, address, X/Y data, valid), the same for the slim BRAM,
    // and a ready flag that the FSM polls in its HOLDOFF states.
    wire [ 2:0] recomb_fat_bram_xy_bank;
    wire [ 7:0] recomb_fat_bram_xy_addr;
    wire [17:0] recomb_fat_bram_x_dout;
    wire [17:0] recomb_fat_bram_y_dout;
    wire        recomb_fat_bram_xy_dout_valid;
    wire [ 2:0] recomb_slim_bram_xy_bank;
    wire [ 7:0] recomb_slim_bram_xy_addr;
    wire [17:0] recomb_slim_bram_x_dout;
    wire [17:0] recomb_slim_bram_y_dout;
    wire        recomb_slim_bram_xy_dout_valid;
    wire        recomb_rdy;
    
    // Partial-product recombinator. It observes the upcoming FSM state, the column
    // counters, the slim BRAM read address/bank and the DSP P-stage enables and
    // products; its internals are defined in a separate module.
    modexpng_part_recombinator recomb
    (
        .clk                            (clk),
        .rdy                            (recomb_rdy),
        .fsm_state_next                 (fsm_state_next),
        .index_last                     (index_last),
        .dsp_x_ce_p                     (dsp_x_ce_p),
        .dsp_y_ce_p                     (dsp_y_ce_p),
        .ena_x                          (recomb_x_ena),
        .ena_y                          (recomb_y_ena),
        .dsp_x_p                        (dsp_x_p),
        .dsp_y_p                        (dsp_y_p),
        .col_index                      (col_index),
        .col_index_last                 (col_index_last),
        .slim_bram_xy_addr              (mac_slim_bram_xy_addr),
        .slim_bram_xy_bank              (mac_slim_bram_xy_bank),
        .rcmb_fat_bram_xy_bank          (recomb_fat_bram_xy_bank),
        .rcmb_fat_bram_xy_addr          (recomb_fat_bram_xy_addr),
        .rcmb_fat_bram_x_dout           (recomb_fat_bram_x_dout),
        .rcmb_fat_bram_y_dout           (recomb_fat_bram_y_dout),
        .rcmb_fat_bram_xy_dout_valid    (recomb_fat_bram_xy_dout_valid),
        .rcmb_slim_bram_xy_bank         (recomb_slim_bram_xy_bank),
        .rcmb_slim_bram_xy_addr         (recomb_slim_bram_xy_addr),
        .rcmb_slim_bram_x_dout          (recomb_slim_bram_x_dout),
        .rcmb_slim_bram_y_dout          (recomb_slim_bram_y_dout),
        .rcmb_slim_bram_xy_dout_valid   (recomb_slim_bram_xy_dout_valid)
    );
    
    // Shadow copies of the words written back by the recombinator, captured from
    // the X data outputs so the verify_* tasks can compare them with the expected
    // values.
    reg [17:0] AB_READ[0:63];
    reg [17:0] Q_READ[0:32];
    reg [17:0] M_READ[0:64];
    
    always @(posedge clk) begin
        //
        // Fat write-back channel: AB (low/high halves) and M (low/high halves plus
        // one extra word at index 64). Only the low five address bits select the
        // word within a half.
        //
        if (recomb_fat_bram_xy_dout_valid)
            //
            case (recomb_fat_bram_xy_bank)
                BANK_FAT_ABL: AB_READ[recomb_fat_bram_xy_addr % 32] <= recomb_fat_bram_x_dout;
                BANK_FAT_ABH: AB_READ[32 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout;
                BANK_FAT_ML:  M_READ[recomb_fat_bram_xy_addr % 32] <= recomb_fat_bram_x_dout;
                BANK_FAT_MH:  M_READ[32 + (recomb_fat_bram_xy_addr % 32)] <= recomb_fat_bram_x_dout;
                // M_READ ends at index 64, so only an address whose low five bits
                // are zero maps to a valid word. The explicit guard replaces the
                // former 64 + (addr % 32) index, which ran past the array bounds
                // for every other address (a silently dropped write).
                BANK_FAT_EXT: if ((recomb_fat_bram_xy_addr % 32) == 0)
                                  M_READ[64] <= recomb_fat_bram_x_dout;
            endcase
            //
        //
        // Slim write-back channel: Q words are indexed directly by address; the one
        // extra word (index 32) is taken from the EXT bank at address 1.
        //
        if (recomb_slim_bram_xy_dout_valid)
            //
            case (recomb_slim_bram_xy_bank)
                BANK_SLIM_Q: Q_READ[recomb_slim_bram_xy_addr] <= recomb_slim_bram_x_dout;
                BANK_SLIM_EXT: if (recomb_slim_bram_xy_addr == 8'd1)
                             Q_READ[32] <= recomb_slim_bram_x_dout;
            endcase
            //
    end
            

    // Fat BRAM manager port. The testbench request has priority; otherwise a valid
    // recombinator write-back is forwarded; otherwise the port idles.
    always @(posedge clk) begin

        // Idle defaults: port disabled, every other field don't-care.
        mgr_fat_bram_xy_ena  <= 1'b0;
        mgr_fat_bram_xy_bank <= 3'bXXX;
        mgr_fat_bram_xy_addr <= 8'hXX;
        mgr_fat_bram_x_din   <= {18{1'bX}};
        mgr_fat_bram_y_din   <= {18{1'bX}};

        // The later non-blocking assignments below override the defaults.
        if (tb_fat_bram_xy_ena) begin
            mgr_fat_bram_xy_ena  <= 1'b1;
            mgr_fat_bram_xy_bank <= tb_fat_bram_xy_bank;
            mgr_fat_bram_xy_addr <= tb_fat_bram_xy_addr;
            mgr_fat_bram_x_din   <= tb_fat_bram_x_din;
            mgr_fat_bram_y_din   <= tb_fat_bram_y_din;
        end
        else if (recomb_fat_bram_xy_dout_valid) begin
            mgr_fat_bram_xy_ena  <= 1'b1;
            mgr_fat_bram_xy_bank <= recomb_fat_bram_xy_bank;
            mgr_fat_bram_xy_addr <= recomb_fat_bram_xy_addr;
            mgr_fat_bram_x_din   <= recomb_fat_bram_x_dout;
            mgr_fat_bram_y_din   <= recomb_fat_bram_y_dout;
        end

    end


    // Slim BRAM manager port. The testbench request has priority; otherwise a valid
    // recombinator write-back is forwarded; otherwise the port idles.
    always @(posedge clk) begin

        // Idle defaults: port disabled, every other field don't-care.
        mgr_slim_bram_xy_ena  <= 1'b0;
        mgr_slim_bram_xy_bank <= 3'bXXX;
        mgr_slim_bram_xy_addr <= 8'hXX;
        mgr_slim_bram_x_din   <= {18{1'bX}};
        mgr_slim_bram_y_din   <= {18{1'bX}};

        // The later non-blocking assignments below override the defaults.
        if (tb_slim_bram_xy_ena) begin
            mgr_slim_bram_xy_ena  <= 1'b1;
            mgr_slim_bram_xy_bank <= tb_slim_bram_xy_bank;
            mgr_slim_bram_xy_addr <= tb_slim_bram_xy_addr;
            mgr_slim_bram_x_din   <= tb_slim_bram_x_din;
            mgr_slim_bram_y_din   <= tb_slim_bram_y_din;
        end
        else if (recomb_slim_bram_xy_dout_valid) begin
            mgr_slim_bram_xy_ena  <= 1'b1;
            mgr_slim_bram_xy_bank <= recomb_slim_bram_xy_bank;
            mgr_slim_bram_xy_addr <= recomb_slim_bram_xy_addr;
            mgr_slim_bram_x_din   <= recomb_slim_bram_x_dout;
            mgr_slim_bram_y_din   <= recomb_slim_bram_y_dout;
        end

    end


    // verify_ab
    //   Prints all 64 expected/captured AB word pairs, tagging every mismatch with
    //   "<???>", followed by an overall verdict line. The comparison is 4-state
    //   exact, so X/Z bits count as mismatches unless they match exactly.
    task verify_ab;
        reg all_words_match;
        begin
            all_words_match = 1'b1;
            for (i=0; i<64; i=i+1)
                if (AB_READ[i] !== AB[i]) begin
                    $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x <???>", i, AB[i], AB_READ[i]);
                    all_words_match = 1'b0;
                end
                else
                    $display("AB / AB_READ [%02d] = 0x%05x / 0x%05x", i, AB[i], AB_READ[i]);
            if (!all_words_match)
                $display("AB is WRONG!");
            else
                $display("AB is OK.");
        end
    endtask


    // verify_q
    //   Prints all 33 expected/captured Q word pairs, tagging every mismatch with
    //   "<???>", followed by an overall verdict line. The comparison is 4-state
    //   exact, so X/Z bits count as mismatches unless they match exactly.
    task verify_q;
        reg all_words_match;
        begin
            all_words_match = 1'b1;
            for (i=0; i<33; i=i+1)
                if (Q_READ[i] !== Q[i]) begin
                    $display("Q / Q_READ [%02d] = 0x%05x / 0x%05x <???>", i, Q[i], Q_READ[i]);
                    all_words_match = 1'b0;
                end
                else
                    $display("Q / Q_READ [%02d] = 0x%05x / 0x%05x", i, Q[i], Q_READ[i]);
            if (!all_words_match)
                $display("Q is WRONG!");
            else
                $display("Q is OK.");
        end
    endtask


    // verify_m
    //   Prints all 65 expected/captured M word pairs, tagging every mismatch with
    //   "<???>", followed by an overall verdict line. The comparison is 4-state
    //   exact, so X/Z bits count as mismatches unless they match exactly.
    task verify_m;
        reg all_words_match;
        begin
            all_words_match = 1'b1;
            for (i=0; i<65; i=i+1)
                if (M_READ[i] !== M[i]) begin
                    $display("M / M_READ [%02d] = 0x%05x / 0x%05x <???>", i, M[i], M_READ[i]);
                    all_words_match = 1'b0;
                end
                else
                    $display("M / M_READ [%02d] = 0x%05x / 0x%05x", i, M[i], M_READ[i]);
            if (!all_words_match)
                $display("M is WRONG!");
            else
                $display("M is OK.");
        end
    endtask


    // Per-phase "column finished" flags consumed by the next-state logic.
    //   square    : the "surely done" stage
    //   triangle  : the "surely done" stage, or the later "tardy done" stage when
    //               the current column is the last one
    //   rectangle : always the "tardy done" stage
    wire mult_square_addr_done = mult_square_addr_surely_done_flop;
    wire mult_triangle_addr_done = !col_is_last ? mult_triangle_addr_surely_done_flop : mult_triangle_addr_tardy_done_flop;
    wire mult_rectangle_addr_done = mult_rectangle_addr_tardy_done_flop;
    

    // Next-state function of the multiplier FSM.
    //
    // Each phase (square, triangle, rectangle) runs INIT -> TRIG -> BUSY once for
    // column 0 and then repeatedly for the following columns. A BUSY state holds
    // until the phase's done flag is set. COL_0_BUSY always continues with
    // COL_N_INIT; COL_N_BUSY continues with the fsm_state_after_mult_* selection.
    // Every HOLDOFF state waits for recomb_rdy before moving on.
    always @* begin

        // Fallback value; every case item below overrides it.
        fsm_state_next = FSM_STATE_IDLE;

        case (fsm_state)

            FSM_STATE_IDLE:
                fsm_state_next = ena ? FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_IDLE;

            // ---- square phase ----
            FSM_STATE_MULT_SQUARE_COL_0_INIT:
                fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_TRIG;
            FSM_STATE_MULT_SQUARE_COL_0_TRIG:
                fsm_state_next = FSM_STATE_MULT_SQUARE_COL_0_BUSY;
            FSM_STATE_MULT_SQUARE_COL_0_BUSY:
                fsm_state_next = mult_square_addr_done ? FSM_STATE_MULT_SQUARE_COL_N_INIT
                                                       : FSM_STATE_MULT_SQUARE_COL_0_BUSY;

            FSM_STATE_MULT_SQUARE_COL_N_INIT:
                fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_TRIG;
            FSM_STATE_MULT_SQUARE_COL_N_TRIG:
                fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY;
            FSM_STATE_MULT_SQUARE_COL_N_BUSY:
                fsm_state_next = mult_square_addr_done ? fsm_state_after_mult_square
                                                       : FSM_STATE_MULT_SQUARE_COL_N_BUSY;

            FSM_STATE_MULT_SQUARE_HOLDOFF:
                fsm_state_next = recomb_rdy ? FSM_STATE_MULT_TRIANGLE_COL_0_INIT
                                            : FSM_STATE_MULT_SQUARE_HOLDOFF;

            // ---- triangle phase ----
            FSM_STATE_MULT_TRIANGLE_COL_0_INIT:
                fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_0_TRIG;
            FSM_STATE_MULT_TRIANGLE_COL_0_TRIG:
                fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_0_BUSY;
            FSM_STATE_MULT_TRIANGLE_COL_0_BUSY:
                fsm_state_next = mult_triangle_addr_done ? FSM_STATE_MULT_TRIANGLE_COL_N_INIT
                                                         : FSM_STATE_MULT_TRIANGLE_COL_0_BUSY;

            FSM_STATE_MULT_TRIANGLE_COL_N_INIT:
                fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_N_TRIG;
            FSM_STATE_MULT_TRIANGLE_COL_N_TRIG:
                fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_N_BUSY;
            FSM_STATE_MULT_TRIANGLE_COL_N_BUSY:
                fsm_state_next = mult_triangle_addr_done ? fsm_state_after_mult_triangle
                                                         : FSM_STATE_MULT_TRIANGLE_COL_N_BUSY;

            FSM_STATE_MULT_TRIANGLE_HOLDOFF:
                fsm_state_next = recomb_rdy ? FSM_STATE_MULT_RECTANGLE_COL_0_INIT
                                            : FSM_STATE_MULT_TRIANGLE_HOLDOFF;

            // ---- rectangle phase ----
            FSM_STATE_MULT_RECTANGLE_COL_0_INIT:
                fsm_state_next = FSM_STATE_MULT_RECTANGLE_COL_0_TRIG;
            FSM_STATE_MULT_RECTANGLE_COL_0_TRIG:
                fsm_state_next = FSM_STATE_MULT_RECTANGLE_COL_0_BUSY;
            FSM_STATE_MULT_RECTANGLE_COL_0_BUSY:
                fsm_state_next = mult_rectangle_addr_done ? FSM_STATE_MULT_RECTANGLE_COL_N_INIT
                                                          : FSM_STATE_MULT_RECTANGLE_COL_0_BUSY;

            FSM_STATE_MULT_RECTANGLE_COL_N_INIT:
                fsm_state_next = FSM_STATE_MULT_RECTANGLE_COL_N_TRIG;
            FSM_STATE_MULT_RECTANGLE_COL_N_TRIG:
                fsm_state_next = FSM_STATE_MULT_RECTANGLE_COL_N_BUSY;
            FSM_STATE_MULT_RECTANGLE_COL_N_BUSY:
                fsm_state_next = mult_rectangle_addr_done ? fsm_state_after_mult_rectangle
                                                          : FSM_STATE_MULT_RECTANGLE_COL_N_BUSY;

            FSM_STATE_MULT_RECTANGLE_HOLDOFF:
                fsm_state_next = recomb_rdy ? FSM_STATE_STOP
                                            : FSM_STATE_MULT_RECTANGLE_HOLDOFF;

            // Any state not listed above returns to idle.
            default:
                fsm_state_next = FSM_STATE_IDLE;

        endcase

    end
    
    
endmodule