aboutsummaryrefslogblamecommitdiff
path: root/rtl/modexpng_general_worker.v
blob: d82a120ecea2c34876a1f3955920f29d17daacb2 (plain) (tree)
1
2
3
4
5
6
7

                              



                                  
           








                                                                                                                             

  
 























                                                             
                                                                  




















































                                                                     
                                                            
    




                                                                                                                                         
    
                                                            
    




















                                                            
    





                                                                                                                                                        

 


                                                                                                                            


























































































                                                          







                                                


                                                  

                                                  

                                                  













                                                     

                                                     



                                                     

                                                     


                               

                                                                                                                                                                                                                
          

                                                                                                                                                                                                                        




                                                                                                                                                                                


                                                                                                                                                                                
          
       


        
                               
















                                                                                                          
              




                                    
                       
              
                                              
                  


                                           
                      
                                 
                          


                                                       

                                                   
                              





                                                                               
                          



                                                 
                           
                  











                                                      
                      
                                 
                          

                                                           
                              
                                                 
                                                   



                              











                                              
                      

                                                    
                              




                                                             
                          
                           






                   
                                     
















                                                                                                          





                                    
                       
              
                                
                  


                                            
                      
                                 
                          

                                                     
                              
                                                   
                              
                                                      

                                                 
                                                                               


                              
                                                       

                                                 

                           









                                               
                      
                                 
                          

                                                           


                                                   






                              
























                                                       



           




                                       
      
 




                                               
                                                                                                                       



                                                                                                                       




                                                                                                    




                                                                                                                     




















































































































                                                                                                                       

                               

                                        
          



                                                  
              
                                
                  











                                                     
                      




















































                                                                                   
                      
                                                  
                          



                                                              
                          



                                                                          


                          






                                                                






                                                                
                       









                                           
                  
                             
                      
                                                      
                          



                                                                   
                          



                                                                       


                          
                                                       
                          



                                                                   
                          



                                                                       



                          











                                           
                  
                             
                      
                                                
                          





                                                                                                                                     
                          
                       






               
                                
      



                                                   

                                          





















































                                                                                                                          
    















                                                                                         

                               

                                                          
          


                                          
              
                                       
                  
                             
                      



                                                         
                          

                                                                                                                         


                          




                                                                                                                         

                       

                                       
                  
                             
                      





                                                                                                                          
                          


                                                         
                          

                                                                                                                          


                          








                                                                                                                                                                  
                       













































                                                                                                                           
                  
                             
                      
                                                
                          
















                                                                                                                              
                          





                                                                                                                               


                       


               

 
      
                                      
      


                                                                                               
 

                                                                                                                            
 









                                                                       
    











                                                                           
          

                                                                                 
          


                            
              


                                        
                  
                             
                      



                                                                                                                                       

                       

                                                                                                                                                                                     
                      


                                                                                                                                       

                       














                                                                                                                                       
                       

                       










                                           
                  
                             
                      
                                                
                          





                                                                                                                                                                
                          
                           
                  
                   

          
 



                  
 





                                                     
                                    

                                                                                         


                                                                                                 






                                                                                

                                             
                                             
                                             


                               

                                              
                                              




                                          
                                    

                                           
                  



                                                                                             
                  

                                         
                  







                                                                                                     
                  



                                                                                             
                  
              



               












                                                                                                               











                                                                                                                                                 
























                                                                                                                                                                                     



               




















                                                                                                                                                                      
















                                                  
module modexpng_general_worker
(
    clk, rst,
    ena, rdy,
    sel_narrow_in, sel_narrow_out,
    sel_wide_in,   sel_wide_out,
    opcode,
    word_index_last, word_index_last_half,
    wrk_rd_wide_xy_ena_x,   wrk_rd_wide_xy_bank_x,   wrk_rd_wide_xy_addr_x,   wrk_rd_wide_x_din_x,    wrk_rd_wide_y_din_x,
    wrk_rd_narrow_xy_ena_x, wrk_rd_narrow_xy_bank_x, wrk_rd_narrow_xy_addr_x, wrk_rd_narrow_x_din_x,  wrk_rd_narrow_y_din_x,
    wrk_rd_wide_xy_ena_y,   wrk_rd_wide_xy_bank_y,   wrk_rd_wide_xy_addr_y,   wrk_rd_wide_x_din_y,    wrk_rd_wide_y_din_y,
    wrk_rd_narrow_xy_ena_y, wrk_rd_narrow_xy_bank_y, wrk_rd_narrow_xy_addr_y, wrk_rd_narrow_x_din_y,  wrk_rd_narrow_y_din_y,
    wrk_wr_wide_xy_ena_x,   wrk_wr_wide_xy_bank_x,   wrk_wr_wide_xy_addr_x,   wrk_wr_wide_x_dout_x,   wrk_wr_wide_y_dout_x,
    wrk_wr_narrow_xy_ena_x, wrk_wr_narrow_xy_bank_x, wrk_wr_narrow_xy_addr_x, wrk_wr_narrow_x_dout_x, wrk_wr_narrow_y_dout_x,
    wrk_wr_wide_xy_ena_y,   wrk_wr_wide_xy_bank_y,   wrk_wr_wide_xy_addr_y,   wrk_wr_wide_x_dout_y,   wrk_wr_wide_y_dout_y,
    wrk_wr_narrow_xy_ena_y, wrk_wr_narrow_xy_bank_y, wrk_wr_narrow_xy_addr_y, wrk_wr_narrow_x_dout_y, wrk_wr_narrow_y_dout_y
);


    //
    // Headers
    //
    `include "modexpng_parameters.vh"
    `include "modexpng_microcode.vh"

    
    //
    // Ports
    //
    // Clock and reset; the enable logic in this module samples rst synchronously on posedge clk.
    input                                    clk;
    input                                    rst;

    // NOTE(review): presumably a start request / ready flag pair -- their logic is outside this section, confirm.
    input                                    ena;
    output                                   rdy;
    
    // Bank selectors, BANK_ADDR_W bits each.
    // NOTE(review): presumably source ("in") and destination ("out") banks for the narrow and wide storage -- confirm against the bank muxing logic.
    input  [              BANK_ADDR_W  -1:0] sel_narrow_in; 
    input  [              BANK_ADDR_W  -1:0] sel_narrow_out; 
    input  [              BANK_ADDR_W  -1:0] sel_wide_in; 
    input  [              BANK_ADDR_W  -1:0] sel_wide_out; 
    
    // Micro-operation code; compared against the UOP_OPCODE_* constants in the enable logic.
    input  [              UOP_OPCODE_W -1:0] opcode;
    
    // NOTE(review): presumably the index of the last word of the operand and of its first half -- not used in this section, confirm.
    input  [              OP_ADDR_W    -1:0] word_index_last;
    input  [              OP_ADDR_W    -1:0] word_index_last_half;
    
    // "Wide" read port, suffix _x: enable, bank and word address go out, two data words (x and y) come back.
    output                                   wrk_rd_wide_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_rd_wide_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_rd_wide_xy_addr_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_x_din_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_y_din_x;

    // "Narrow" read port, suffix _x: same signal layout as the wide read port.
    output                                   wrk_rd_narrow_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_rd_narrow_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_rd_narrow_xy_addr_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_x_din_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_y_din_x;
    
    // "Wide" read port, suffix _y.
    output                                   wrk_rd_wide_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_rd_wide_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_rd_wide_xy_addr_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_x_din_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_y_din_y;

    // "Narrow" read port, suffix _y.
    output                                   wrk_rd_narrow_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_rd_narrow_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_rd_narrow_xy_addr_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_x_din_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_y_din_y;

    // "Wide" write port, suffix _x: enable, bank, word address and two data words (x and y) all go out.
    output                                   wrk_wr_wide_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_wr_wide_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_wr_wide_xy_addr_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_x_dout_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_y_dout_x;

    // "Narrow" write port, suffix _x.
    output                                   wrk_wr_narrow_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_wr_narrow_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_wr_narrow_xy_addr_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_x_dout_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_y_dout_x;
    
    // "Wide" write port, suffix _y.
    output                                   wrk_wr_wide_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_wr_wide_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_wr_wide_xy_addr_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_x_dout_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_y_dout_y;

    // "Narrow" write port, suffix _y.
    output                                   wrk_wr_narrow_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_wr_narrow_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_wr_narrow_xy_addr_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_x_dout_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_y_dout_y;


    //
    // FSM Declaration
    //
    // State encodings (6 bits). The states fall into three families, one per
    // next-state register declared below; the enable logic decodes each family
    // through its own next-state register.
    localparam [5:0] WRK_FSM_STATE_IDLE             = 6'h00;
    
    // States without suffix: decoded via wrk_fsm_state_next_one_pass
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1     = 6'h01;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2     = 6'h02;
    localparam [5:0] WRK_FSM_STATE_BUSY             = 6'h03;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1    = 6'h05;    // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug!
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2    = 6'h06;
    
    localparam [5:0] WRK_FSM_STATE_STOP             = 6'h07;
    
    // States with _M1/_M2 suffix: decoded via wrk_fsm_state_next_one_pass_meander
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M1  = 6'h10;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M2  = 6'h11;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M1  = 6'h12;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M2  = 6'h13;
    localparam [5:0] WRK_FSM_STATE_BUSY_M1          = 6'h14;
    localparam [5:0] WRK_FSM_STATE_BUSY_M2          = 6'h15;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 6'h16;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 6'h17;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 6'h18;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 6'h19;

    // States with _TP suffix: decoded via wrk_fsm_state_next_two_pass
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_TP  = 6'h20;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_TP  = 6'h21;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE3_TP  = 6'h22;
    localparam [5:0] WRK_FSM_STATE_LATENCY_PRE4_TP  = 6'h23;
    localparam [5:0] WRK_FSM_STATE_BUSY_TP          = 6'h24;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_TP = 6'h25;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_TP = 6'h26;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST3_TP = 6'h27;
    localparam [5:0] WRK_FSM_STATE_LATENCY_POST4_TP = 6'h28;
    localparam [5:0] WRK_FSM_STATE_HOLDOFF_TP       = 6'h29;
    
    // Current state (initialised to IDLE) and one next-state value per family.
    reg [5:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;
    reg [5:0] wrk_fsm_state_next_one_pass;         // single address space sweep
    reg [5:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y)
    reg [5:0] wrk_fsm_state_next_two_pass;         // two address space sweeps
    reg       wrk_fsm_two_pass_pass;               // 0=first pass, 1=second pass
    reg       wrk_fsm_two_pass_pass_dly;           // 0=first pass, 1=second pass; NOTE(review): presumably a delayed copy of wrk_fsm_two_pass_pass (driver not visible here) -- confirm


    // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps)
    

    //
    // Control Signals
    //
    // Internal registers behind the wrk_rd_* / wrk_wr_* outputs.
    // Only the enable flags carry an initial value (1'b0); the bank, address
    // and data registers are declared without one.

    // read side, wide, _x
    reg                    rd_wide_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_x;
    reg [  OP_ADDR_W -1:0] rd_wide_xy_addr_x; 

    // read side, narrow, _x
    reg                    rd_narrow_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_x;
    reg [  OP_ADDR_W -1:0] rd_narrow_xy_addr_x; 

    // read side, wide, _y
    reg                    rd_wide_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_y;
    reg [  OP_ADDR_W -1:0] rd_wide_xy_addr_y; 

    // read side, narrow, _y
    reg                    rd_narrow_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_y;
    reg [  OP_ADDR_W -1:0] rd_narrow_xy_addr_y; 
    
    // write side, wide, _x (write ports additionally carry the x and y data words)
    reg                    wr_wide_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_wide_xy_bank_x;
    reg [  OP_ADDR_W -1:0] wr_wide_xy_addr_x;
    reg [ WORD_EXT_W -1:0] wr_wide_x_dout_x;
    reg [ WORD_EXT_W -1:0] wr_wide_y_dout_x;

    // write side, narrow, _x
    reg                    wr_narrow_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_narrow_xy_bank_x;
    reg [  OP_ADDR_W -1:0] wr_narrow_xy_addr_x;
    reg [ WORD_EXT_W -1:0] wr_narrow_x_dout_x;
    reg [ WORD_EXT_W -1:0] wr_narrow_y_dout_x;

    // write side, wide, _y
    reg                    wr_wide_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_wide_xy_bank_y;
    reg [  OP_ADDR_W -1:0] wr_wide_xy_addr_y;
    reg [ WORD_EXT_W -1:0] wr_wide_x_dout_y;
    reg [ WORD_EXT_W -1:0] wr_wide_y_dout_y;

    // write side, narrow, _y
    reg                    wr_narrow_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_narrow_xy_bank_y;
    reg [  OP_ADDR_W -1:0] wr_narrow_xy_addr_y;
    reg [ WORD_EXT_W -1:0] wr_narrow_x_dout_y;
    reg [ WORD_EXT_W -1:0] wr_narrow_y_dout_y;


    //
    // Mapping
    //
    // Every module output is a plain continuous assignment from the internal
    // register of the same name without the "wrk_" prefix; no logic is inserted here.

    // read ports
    assign wrk_rd_wide_xy_ena_x     = rd_wide_xy_ena_x;
    assign wrk_rd_wide_xy_bank_x    = rd_wide_xy_bank_x;
    assign wrk_rd_wide_xy_addr_x    = rd_wide_xy_addr_x;

    assign wrk_rd_narrow_xy_ena_x   = rd_narrow_xy_ena_x;
    assign wrk_rd_narrow_xy_bank_x  = rd_narrow_xy_bank_x;
    assign wrk_rd_narrow_xy_addr_x  = rd_narrow_xy_addr_x;
    
    assign wrk_rd_wide_xy_ena_y     = rd_wide_xy_ena_y;
    assign wrk_rd_wide_xy_bank_y    = rd_wide_xy_bank_y;
    assign wrk_rd_wide_xy_addr_y    = rd_wide_xy_addr_y;

    assign wrk_rd_narrow_xy_ena_y   = rd_narrow_xy_ena_y;
    assign wrk_rd_narrow_xy_bank_y  = rd_narrow_xy_bank_y;
    assign wrk_rd_narrow_xy_addr_y  = rd_narrow_xy_addr_y;

    // write ports
    assign wrk_wr_wide_xy_ena_x     = wr_wide_xy_ena_x;
    assign wrk_wr_wide_xy_bank_x    = wr_wide_xy_bank_x;
    assign wrk_wr_wide_xy_addr_x    = wr_wide_xy_addr_x;
    assign wrk_wr_wide_x_dout_x     = wr_wide_x_dout_x;
    assign wrk_wr_wide_y_dout_x     = wr_wide_y_dout_x;

    assign wrk_wr_narrow_xy_ena_x   = wr_narrow_xy_ena_x;
    assign wrk_wr_narrow_xy_bank_x  = wr_narrow_xy_bank_x;
    assign wrk_wr_narrow_xy_addr_x  = wr_narrow_xy_addr_x;
    assign wrk_wr_narrow_x_dout_x   = wr_narrow_x_dout_x;
    assign wrk_wr_narrow_y_dout_x   = wr_narrow_y_dout_x;
    
    assign wrk_wr_wide_xy_ena_y     = wr_wide_xy_ena_y;
    assign wrk_wr_wide_xy_bank_y    = wr_wide_xy_bank_y;
    assign wrk_wr_wide_xy_addr_y    = wr_wide_xy_addr_y;
    assign wrk_wr_wide_x_dout_y     = wr_wide_x_dout_y;
    assign wrk_wr_wide_y_dout_y     = wr_wide_y_dout_y;

    assign wrk_wr_narrow_xy_ena_y   = wr_narrow_xy_ena_y;
    assign wrk_wr_narrow_xy_bank_y  = wr_narrow_xy_bank_y;
    assign wrk_wr_narrow_xy_addr_y  = wr_narrow_xy_addr_y;
    assign wrk_wr_narrow_x_dout_y   = wr_narrow_x_dout_y;
    assign wrk_wr_narrow_y_dout_y   = wr_narrow_y_dout_y;
   
   
    //
    // Delays
    //    
    // Shift-register stages filled by the unconditional always block that follows.
    // "_dlyN" holds the value its source had N clock edges earlier.

    // wide read addresses: 4 stages each
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly1;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly2;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly3;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly4;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly1;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly2;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly3;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly4;

    // narrow read addresses: 4 stages each
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly1;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly2;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly3;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly4;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly1;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly2;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly3;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly4;
    
    // wide read data, x word only: 3 stages (the wide y words have no delay line here)
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly3;
    
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly3;
    
    // narrow read data: x word 3 stages, y word 2 stages
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly2;
    
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly2;
    
    // Free-running delay lines (no reset, no enable): on every rising clock edge
    // each stage takes the previous stage's value and stage 1 samples the source.
    // All assignments are non-blocking, so their textual order is irrelevant.
    always @(posedge clk) begin
        //
        // wide read addresses, 4 stages
        //
        rd_wide_xy_addr_x_dly1 <= rd_wide_xy_addr_x;
        rd_wide_xy_addr_x_dly2 <= rd_wide_xy_addr_x_dly1;
        rd_wide_xy_addr_x_dly3 <= rd_wide_xy_addr_x_dly2;
        rd_wide_xy_addr_x_dly4 <= rd_wide_xy_addr_x_dly3;
        //
        rd_wide_xy_addr_y_dly1 <= rd_wide_xy_addr_y;
        rd_wide_xy_addr_y_dly2 <= rd_wide_xy_addr_y_dly1;
        rd_wide_xy_addr_y_dly3 <= rd_wide_xy_addr_y_dly2;
        rd_wide_xy_addr_y_dly4 <= rd_wide_xy_addr_y_dly3;
        //
        // narrow read addresses, 4 stages
        //
        rd_narrow_xy_addr_x_dly1 <= rd_narrow_xy_addr_x;
        rd_narrow_xy_addr_x_dly2 <= rd_narrow_xy_addr_x_dly1;
        rd_narrow_xy_addr_x_dly3 <= rd_narrow_xy_addr_x_dly2;
        rd_narrow_xy_addr_x_dly4 <= rd_narrow_xy_addr_x_dly3;
        //
        rd_narrow_xy_addr_y_dly1 <= rd_narrow_xy_addr_y;
        rd_narrow_xy_addr_y_dly2 <= rd_narrow_xy_addr_y_dly1;
        rd_narrow_xy_addr_y_dly3 <= rd_narrow_xy_addr_y_dly2;
        rd_narrow_xy_addr_y_dly4 <= rd_narrow_xy_addr_y_dly3;
        //
        // wide read data (x word), 3 stages
        //
        wrk_rd_wide_x_din_x_dly1 <= wrk_rd_wide_x_din_x;
        wrk_rd_wide_x_din_x_dly2 <= wrk_rd_wide_x_din_x_dly1;
        wrk_rd_wide_x_din_x_dly3 <= wrk_rd_wide_x_din_x_dly2;
        //
        wrk_rd_wide_x_din_y_dly1 <= wrk_rd_wide_x_din_y;
        wrk_rd_wide_x_din_y_dly2 <= wrk_rd_wide_x_din_y_dly1;
        wrk_rd_wide_x_din_y_dly3 <= wrk_rd_wide_x_din_y_dly2;
        //
        // narrow read data: x word 3 stages, y word 2 stages
        //
        wrk_rd_narrow_x_din_x_dly1 <= wrk_rd_narrow_x_din_x;
        wrk_rd_narrow_x_din_x_dly2 <= wrk_rd_narrow_x_din_x_dly1;
        wrk_rd_narrow_x_din_x_dly3 <= wrk_rd_narrow_x_din_x_dly2;
        //
        wrk_rd_narrow_y_din_x_dly1 <= wrk_rd_narrow_y_din_x;
        wrk_rd_narrow_y_din_x_dly2 <= wrk_rd_narrow_y_din_x_dly1;
        //
        wrk_rd_narrow_x_din_y_dly1 <= wrk_rd_narrow_x_din_y;
        wrk_rd_narrow_x_din_y_dly2 <= wrk_rd_narrow_x_din_y_dly1;
        wrk_rd_narrow_x_din_y_dly3 <= wrk_rd_narrow_x_din_y_dly2;
        //
        wrk_rd_narrow_y_din_y_dly1 <= wrk_rd_narrow_y_din_y;
        wrk_rd_narrow_y_din_y_dly2 <= wrk_rd_narrow_y_din_y_dly1;
        //
    end
        

    //
    // Source Read Enable Logic
    //
    
    // Drive both wide read enables (_x and _y) to the same value.
    task _update_wide_xy_rd_en;
        input _en;
        begin
            rd_wide_xy_ena_x <= _en;
            rd_wide_xy_ena_y <= _en;
        end
    endtask

    // Drive both narrow read enables (_x and _y) to the same value.
    task _update_narrow_xy_rd_en;
        input _en;
        begin
            rd_narrow_xy_ena_x <= _en;
            rd_narrow_xy_ena_y <= _en;
        end
    endtask
    
    // Convenience wrappers: assert / deassert the wide read enables.
    task enable_wide_xy_rd_en;
        begin
            _update_wide_xy_rd_en(1'b1);
        end
    endtask

    task disable_wide_xy_rd_en;
        begin
            _update_wide_xy_rd_en(1'b0);
        end
    endtask
    
    // Convenience wrappers: assert / deassert the narrow read enables.
    task enable_narrow_xy_rd_en;
        begin
            _update_narrow_xy_rd_en(1'b1);
        end
    endtask

    task disable_narrow_xy_rd_en;
        begin
            _update_narrow_xy_rd_en(1'b0);
        end
    endtask
    
    // Registered read enables for the wide and narrow read ports.
    //
    // Every clock both enable pairs are first scheduled to 0; the case statements
    // below may then schedule them to 1. Because these are non-blocking assignments
    // to the same registers inside one always block, the last one executed wins,
    // so the leading "disable" calls act as the default value. Do not reorder them.
    //
    // The decode uses the wrk_fsm_state_next_* values rather than wrk_fsm_state,
    // so the registered enables are derived from the next-state value of each
    // FSM family. The three case statements are evaluated independently; each
    // can only set enables, never clear them.
    always @(posedge clk)
        //
        if (rst) begin
            //
            // synchronous reset: all read enables low
            //
            disable_wide_xy_rd_en;
            disable_narrow_xy_rd_en;
            //
        end else begin
            //
            // default: all read enables low unless overridden below
            //
            disable_wide_xy_rd_en;
            disable_narrow_xy_rd_en;
            //
            // one_pass
            //
            case (wrk_fsm_state_next_one_pass)
                //
                WRK_FSM_STATE_LATENCY_PRE1,
                WRK_FSM_STATE_LATENCY_PRE2,
                WRK_FSM_STATE_BUSY:
                    //
                    case (opcode)
                        //
                        // these opcodes read the narrow ports only
                        //
                        UOP_OPCODE_PROPAGATE_CARRIES,
                        UOP_OPCODE_OUTPUT_FROM_NARROW,
                        UOP_OPCODE_MODULAR_REDUCE_INIT:
                            //
                            enable_narrow_xy_rd_en;
                            //
                        // this opcode reads both the wide and the narrow ports
                        //
                        UOP_OPCODE_COPY_CRT_Y2X: begin
                            //
                            enable_wide_xy_rd_en;
                            enable_narrow_xy_rd_en;                            
                            //
                        end
                        //
                        // this opcode reads the wide ports only
                        //
                        UOP_OPCODE_MERGE_LH:
                            //
                            enable_wide_xy_rd_en;
                            //
                    endcase
                //
            endcase
            //
            // one_pass_meander
            //
            case (wrk_fsm_state_next_one_pass_meander)
                //
                WRK_FSM_STATE_LATENCY_PRE1_M1,
                WRK_FSM_STATE_LATENCY_PRE1_M2,
                WRK_FSM_STATE_LATENCY_PRE2_M1,
                WRK_FSM_STATE_LATENCY_PRE2_M2,
                WRK_FSM_STATE_BUSY_M1,
                WRK_FSM_STATE_BUSY_M2:
                    //
                    case (opcode)
                        //
                        // both ladder opcodes read the wide and the narrow ports
                        //
                        UOP_OPCODE_COPY_LADDERS_X2Y,
                        UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                            //
                            enable_wide_xy_rd_en;
                            enable_narrow_xy_rd_en;
                            //
                        end
                        //
                    endcase
                //
            endcase
            //
            // two_pass
            //
            case (wrk_fsm_state_next_two_pass)
                //
                WRK_FSM_STATE_LATENCY_PRE1_TP,
                WRK_FSM_STATE_LATENCY_PRE2_TP,
                WRK_FSM_STATE_LATENCY_PRE3_TP,
                WRK_FSM_STATE_LATENCY_PRE4_TP,
                WRK_FSM_STATE_BUSY_TP:
                    //
                    case (opcode)
                        UOP_OPCODE_MODULAR_SUBTRACT:
                            //
                            // first pass (wrk_fsm_two_pass_pass = 0) reads wide and narrow,
                            // second pass (wrk_fsm_two_pass_pass = 1) reads narrow only
                            //
                            if (!wrk_fsm_two_pass_pass) begin
                                enable_wide_xy_rd_en;
                                enable_narrow_xy_rd_en;
                            end else
                                enable_narrow_xy_rd_en;
                        //
                    endcase
                //
            endcase
            //
        end


    //
    // Destination Write Enable Logic
    //
    
    // Write-enable helpers.
    // The two "_update_*" primitives load the same 1-bit value into the X and Y
    // write-enable registers of the wide or narrow pair; the enable_*/disable_*
    // wrappers call them with constant 1 or 0.
    task _update_wide_xy_wr_en;
        input _flag;
        begin
            {wr_wide_xy_ena_x, wr_wide_xy_ena_y} <= {_flag, _flag};
        end
    endtask

    task _update_narrow_xy_wr_en;
        input _flag;
        begin
            {wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {_flag, _flag};
        end
    endtask

    task enable_wide_xy_wr_en;
        begin
            _update_wide_xy_wr_en(1'b1);
        end
    endtask

    task disable_wide_xy_wr_en;
        begin
            _update_wide_xy_wr_en(1'b0);
        end
    endtask

    task enable_narrow_xy_wr_en;
        begin
            _update_narrow_xy_wr_en(1'b1);
        end
    endtask

    task disable_narrow_xy_wr_en;
        begin
            _update_narrow_xy_wr_en(1'b0);
        end
    endtask
    
    // Registered destination write enables.
    // Both enable pairs are cleared while rst is high and, by default, on every
    // other clock edge as well. The three case statements below (one per FSM
    // flavour) re-assert them for the states/opcodes that write; because the
    // assignments are non-blocking, a later enable overrides the earlier
    // default disable within the same clock edge.
    always @(posedge clk)
        //
        if (rst) begin
            //
            disable_wide_xy_wr_en;
            disable_narrow_xy_wr_en;
            //
        end else begin
            //
            // defaults: no write unless a case below enables it
            disable_wide_xy_wr_en;
            disable_narrow_xy_wr_en;
            //
            // one_pass
            //
            case (wrk_fsm_state)
                //
                WRK_FSM_STATE_BUSY,
                WRK_FSM_STATE_LATENCY_POST1,
                WRK_FSM_STATE_LATENCY_POST2:
                    //
                    case (opcode)
                        //
                        // these two opcodes write the narrow pair only
                        UOP_OPCODE_PROPAGATE_CARRIES,
                        UOP_OPCODE_MERGE_LH:
                            //
                            enable_narrow_xy_wr_en;
                            //
                        // writes both the wide and the narrow pair
                        UOP_OPCODE_COPY_CRT_Y2X: begin
                            //
                            enable_wide_xy_wr_en;
                            enable_narrow_xy_wr_en;
                            //
                        end
                        //
                        // writes the wide pair only
                        UOP_OPCODE_MODULAR_REDUCE_INIT:
                            //
                            enable_wide_xy_wr_en;
                        //
                    endcase
                //
            endcase
            //
            // one_pass_meander
            //
            // only the *_M2 states are listed here, so nothing is written
            // while the FSM is in any other meander state
            case (wrk_fsm_state)
                //
                WRK_FSM_STATE_BUSY_M2,
                WRK_FSM_STATE_LATENCY_POST1_M2,
                WRK_FSM_STATE_LATENCY_POST2_M2:
                    //
                    case (opcode)
                        //
                        UOP_OPCODE_COPY_LADDERS_X2Y,
                        UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                            //
                            enable_wide_xy_wr_en;
                            enable_narrow_xy_wr_en;
                            //
                        end
                        //
                    endcase
                //
            endcase
            //
            // two_pass
            //
            case (wrk_fsm_state)
                //
                WRK_FSM_STATE_BUSY_TP,
                WRK_FSM_STATE_LATENCY_POST1_TP,
                WRK_FSM_STATE_LATENCY_POST2_TP,
                WRK_FSM_STATE_LATENCY_POST3_TP,
                WRK_FSM_STATE_LATENCY_POST4_TP:
                    //
                    case (opcode)
                        //
                        // wrk_fsm_two_pass_pass low: narrow pair only;
                        // wrk_fsm_two_pass_pass high: wide and narrow pairs
                        UOP_OPCODE_MODULAR_SUBTRACT:
                            //
                            if (!wrk_fsm_two_pass_pass)
                                enable_narrow_xy_wr_en;
                            else begin
                                enable_wide_xy_wr_en;
                                enable_narrow_xy_wr_en;
                            end
                        //
                    endcase
                //
            endcase
            //
        end


    //
    // Source to Destination Data Logic
    //
    
    //
    // UOP_OPCODE_PROPAGATE_CARRIES
    //

    // Carry registers, one per narrow read data word (x/y operand, x/y half).
    reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r;
    reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r;
    reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r;
    reg [CARRY_W -1:0] rd_narrow_y_din_y_cry_r;
    
    // Narrow read word plus the stored carry, the carry being zero-extended
    // by WORD_W bits on the left.
    // NOTE(review): this presumes WORD_EXT_W == WORD_W + CARRY_W -- confirm
    // against the parameter definitions.
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry = wrk_rd_narrow_y_din_x + {{WORD_W{1'b0}}, rd_narrow_y_din_x_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r};
    
    // Upper bits [WORD_EXT_W-1:WORD_W] of each sum: the new carry value.
    wire [CARRY_W -1:0] rd_narrow_x_din_x_w_cry_msb = rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W];
    wire [CARRY_W -1:0] rd_narrow_y_din_x_w_cry_msb = rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W];
    wire [CARRY_W -1:0] rd_narrow_x_din_y_w_cry_msb = rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W];
    wire [CARRY_W -1:0] rd_narrow_y_din_y_w_cry_msb = rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W];
    
    // Lower WORD_W bits of each sum, padded on the left with CARRY_W zeroes.
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]};
    
    // Load the four wide write-data registers.
    // Argument order: x_x, y_x, x_y, y_y, matching the register order
    // wr_wide_x_dout_x, wr_wide_y_dout_x, wr_wide_x_dout_y, wr_wide_y_dout_y.
    task update_wide_dout;
        input [WORD_EXT_W-1:0] x_x;
        input [WORD_EXT_W-1:0] y_x;
        input [WORD_EXT_W-1:0] x_y;
        input [WORD_EXT_W-1:0] y_y;
        begin
            {wr_wide_x_dout_x,
             wr_wide_y_dout_x,
             wr_wide_x_dout_y,
             wr_wide_y_dout_y} <= {x_x, y_x, x_y, y_y};
        end
    endtask

    // Load the four narrow write-data registers (same argument order as
    // update_wide_dout).
    task update_narrow_dout;
        input [WORD_EXT_W-1:0] x_x;
        input [WORD_EXT_W-1:0] y_x;
        input [WORD_EXT_W-1:0] x_y;
        input [WORD_EXT_W-1:0] y_y;
        begin
            {wr_narrow_x_dout_x,
             wr_narrow_y_dout_x,
             wr_narrow_x_dout_y,
             wr_narrow_y_dout_y} <= {x_x, y_x, x_y, y_y};
        end
    endtask

    // Load the four carry registers used by UOP_OPCODE_PROPAGATE_CARRIES.
    // Each register and each argument is CARRY_W bits wide, so the registers
    // are assigned one by one.
    task update_narrow_carries;
        input [CARRY_W-1:0] x_x_cry;
        input [CARRY_W-1:0] y_x_cry;
        input [CARRY_W-1:0] x_y_cry;
        input [CARRY_W-1:0] y_y_cry;
        begin
            rd_narrow_x_din_x_cry_r <= x_x_cry;
            rd_narrow_y_din_x_cry_r <= y_x_cry;
            rd_narrow_x_din_y_cry_r <= x_y_cry;
            rd_narrow_y_din_y_cry_r <= y_y_cry;
        end
    endtask
        
    
    // Carry register update for UOP_OPCODE_PROPAGATE_CARRIES.
    // The carries are zeroed in LATENCY_PRE2 and then reloaded from the upper
    // bits of the current sums during BUSY and LATENCY_POST1. In every other
    // state, and for every other opcode, they keep their value.
    always @(posedge clk) begin
        if (opcode == UOP_OPCODE_PROPAGATE_CARRIES) begin
            case (wrk_fsm_state)
                WRK_FSM_STATE_LATENCY_PRE2: begin
                    update_narrow_carries(CARRY_ZERO,
                                          CARRY_ZERO,
                                          CARRY_ZERO,
                                          CARRY_ZERO);
                end
                WRK_FSM_STATE_BUSY,
                WRK_FSM_STATE_LATENCY_POST1: begin
                    update_narrow_carries(rd_narrow_x_din_x_w_cry_msb,
                                          rd_narrow_y_din_x_w_cry_msb,
                                          rd_narrow_x_din_y_w_cry_msb,
                                          rd_narrow_y_din_y_w_cry_msb);
                end
                default: ; // hold
            endcase
        end
    end


    //
    // UOP_OPCODE_MODULAR_SUBTRACT
    //
    
    // Difference registers, WORD_W+1 bits (see update_modsub_ab).
    reg [WORD_W:0] modsub_x_ab; 
    reg [WORD_W:0] modsub_y_ab;

    // One-clock delayed copies of the difference registers.
    reg [WORD_W:0] modsub_x_ab_dly; 
    reg [WORD_W:0] modsub_y_ab_dly;

    // Sum registers, WORD_W+1 bits (see update_modsub_abn).
    reg [WORD_W:0] modsub_x_abn; 
    reg [WORD_W:0] modsub_y_abn;    
    
    // When set, bit WORD_W of the matching register is replaced with zero
    // in the *_masked wires below.
    reg            modsub_x_ab_mask_now;
    reg            modsub_y_ab_mask_now;

    reg            modsub_x_abn_mask_now;
    reg            modsub_y_abn_mask_now;

    // Latched copy of bit WORD_W of the delayed difference; selects the
    // source in modsub_*_mux.
    reg            modsub_x_borrow_r;
    reg            modsub_y_borrow_r;
    
    // Bit WORD_W of ab / abn, forced to zero while the mask flag is set.
    wire           modsub_x_ab_masked = modsub_x_ab_mask_now ? 1'b0 : modsub_x_ab[WORD_W];  
    wire           modsub_y_ab_masked = modsub_y_ab_mask_now ? 1'b0 : modsub_y_ab[WORD_W];

    wire           modsub_x_abn_masked = modsub_x_abn_mask_now ? 1'b0 : modsub_x_abn[WORD_W];  
    wire           modsub_y_abn_masked = modsub_y_abn_mask_now ? 1'b0 : modsub_y_abn[WORD_W];

    // Lower WORD_W bits of the narrow read data with one zero bit prepended.
    wire [WORD_W:0] modsub_x_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_x_din_x[WORD_W-1:0]};
    wire [WORD_W:0] modsub_y_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_y_din_x[WORD_W-1:0]};
    wire [WORD_W:0] modsub_x_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_x_din_y[WORD_W-1:0]};
    wire [WORD_W:0] modsub_y_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_y_din_y[WORD_W-1:0]};
    
    // Lower WORD_W bits of the once-delayed wide_x read data, zero-prepended.
    wire [WORD_W:0] modsub_x_wide_x_lsb_pad = {1'b0, wrk_rd_wide_x_din_x_dly1[WORD_W-1:0]};
    wire [WORD_W:0] modsub_x_wide_y_lsb_pad = {1'b0, wrk_rd_wide_x_din_y_dly1[WORD_W-1:0]};
    
    // Lower WORD_W bits of ab_dly / abn, padded on the left with CARRY_W zeroes.
    wire [WORD_EXT_W -1:0] modsub_x_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_x_ab_dly[WORD_W-1:0]};  
    wire [WORD_EXT_W -1:0] modsub_y_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_y_ab_dly[WORD_W-1:0]};

    wire [WORD_EXT_W -1:0] modsub_x_abn_trunc = {{CARRY_W{1'b0}}, modsub_x_abn[WORD_W-1:0]};  
    wire [WORD_EXT_W -1:0] modsub_y_abn_trunc = {{CARRY_W{1'b0}}, modsub_y_abn[WORD_W-1:0]};
    
    // Result select: borrow flag clear -> narrow_x read data (dly2),
    // borrow flag set -> narrow_y read data (dly2).
    wire [WORD_EXT_W -1:0] modsub_x_mux = !modsub_x_borrow_r ? wrk_rd_narrow_x_din_x_dly2 : wrk_rd_narrow_y_din_x_dly2;
    wire [WORD_EXT_W -1:0] modsub_y_mux = !modsub_y_borrow_r ? wrk_rd_narrow_x_din_y_dly2 : wrk_rd_narrow_y_din_y_dly2;

    // Lower WORD_W bits of the current difference, zero-prepended.
    wire [WORD_W:0] modsub_x_ab_lsb_pad = {1'b0, modsub_x_ab[WORD_W-1:0]};
    wire [WORD_W:0] modsub_y_ab_lsb_pad = {1'b0, modsub_y_ab[WORD_W-1:0]};
    
    // ab <= narrow_x word - narrow_y word - (masked bit WORD_W of the
    // previous ab), i.e. the top bit of the previous result is fed back as
    // the subtrahend's extra term unless it is masked off.
    task update_modsub_ab;
        begin
            modsub_x_ab <= modsub_x_narrow_x_lsb_pad - modsub_y_narrow_x_lsb_pad - modsub_x_ab_masked;
            modsub_y_ab <= modsub_x_narrow_y_lsb_pad - modsub_y_narrow_y_lsb_pad - modsub_y_ab_masked;
        end
    endtask

    // abn <= (low bits of ab) + (delayed wide_x word) + (masked bit WORD_W of
    // the previous abn), i.e. the top bit of the previous sum is fed back as
    // an extra addend unless it is masked off.
    task update_modsub_abn;
        begin
            modsub_x_abn <= modsub_x_ab_lsb_pad + modsub_x_wide_x_lsb_pad + modsub_x_abn_masked;
            modsub_y_abn <= modsub_y_ab_lsb_pad + modsub_x_wide_y_lsb_pad + modsub_y_abn_masked;
        end
    endtask
    
    // Borrow capture for UOP_OPCODE_MODULAR_SUBTRACT.
    // In LATENCY_POST4_TP, while wrk_fsm_two_pass_pass is low, bit WORD_W of
    // each delayed difference is latched; otherwise the flags hold.
    always @(posedge clk) begin
        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) begin
            case (wrk_fsm_state)
                WRK_FSM_STATE_LATENCY_POST4_TP: begin
                    if (!wrk_fsm_two_pass_pass) begin
                        modsub_x_borrow_r <= modsub_x_ab_dly[WORD_W];
                        modsub_y_borrow_r <= modsub_y_ab_dly[WORD_W];
                    end
                end
                default: ; // hold
            endcase
        end
    end
    
    // One-clock delay line for both difference registers.
    always @(posedge clk)
        {modsub_x_ab_dly, modsub_y_ab_dly} <= {modsub_x_ab, modsub_y_ab};
    
    // Difference/sum register control for UOP_OPCODE_MODULAR_SUBTRACT.
    // All four registers default to don't-care every clock; the two case
    // statements below override that default in the states where each
    // register is actually computed:
    //   ab  : LATENCY_PRE3_TP .. LATENCY_POST2_TP
    //   abn : LATENCY_PRE4_TP .. LATENCY_POST3_TP
    always @(posedge clk) begin
        // defaults (don't care)
        modsub_x_ab  <= {1'bX, WORD_DNC};
        modsub_y_ab  <= {1'bX, WORD_DNC};
        modsub_x_abn <= {1'bX, WORD_DNC};
        modsub_y_abn <= {1'bX, WORD_DNC};

        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) begin
            // difference registers
            case (wrk_fsm_state)
                WRK_FSM_STATE_LATENCY_PRE3_TP,
                WRK_FSM_STATE_LATENCY_PRE4_TP,
                WRK_FSM_STATE_BUSY_TP,
                WRK_FSM_STATE_LATENCY_POST1_TP,
                WRK_FSM_STATE_LATENCY_POST2_TP: begin
                    update_modsub_ab;
                end
                default: ; // keep the don't-care default
            endcase

            // sum registers
            case (wrk_fsm_state)
                WRK_FSM_STATE_LATENCY_PRE4_TP,
                WRK_FSM_STATE_BUSY_TP,
                WRK_FSM_STATE_LATENCY_POST1_TP,
                WRK_FSM_STATE_LATENCY_POST2_TP,
                WRK_FSM_STATE_LATENCY_POST3_TP: begin
                    update_modsub_abn;
                end
                default: ; // keep the don't-care default
            endcase
        end
    end

    // Mask flag generation for UOP_OPCODE_MODULAR_SUBTRACT.
    // All four flags are low by default. The "ab" pair is raised for one
    // clock when the FSM is in LATENCY_PRE2_TP, the "abn" pair when it is in
    // LATENCY_PRE3_TP.
    always @(posedge clk) begin
        // defaults: nothing masked
        {modsub_x_ab_mask_now,  modsub_y_ab_mask_now}  <= 2'b00;
        {modsub_x_abn_mask_now, modsub_y_abn_mask_now} <= 2'b00;

        if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) begin
            case (wrk_fsm_state)
                WRK_FSM_STATE_LATENCY_PRE2_TP: begin
                    {modsub_x_ab_mask_now, modsub_y_ab_mask_now} <= 2'b11;
                end
                WRK_FSM_STATE_LATENCY_PRE3_TP: begin
                    {modsub_x_abn_mask_now, modsub_y_abn_mask_now} <= 2'b11;
                end
                default: ; // keep defaults
            endcase
        end
    end
    
    // Registered write data for the wide and narrow destinations.
    // Both sets of data registers default to don't-care (WORD_EXT_DNC) every
    // clock; the case statements below override the default for the
    // states/opcodes that produce data. Task argument order is
    // (x_dout_x, y_dout_x, x_dout_y, y_dout_y).
    always @(posedge clk) begin
        //
        // defaults (don't care)
        update_wide_dout  (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
        update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC);
        //
        // one_pass
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_BUSY,
            WRK_FSM_STATE_LATENCY_POST1,
            WRK_FSM_STATE_LATENCY_POST2:
                //
                case (opcode)
                    //
                    // narrow data = low WORD_W bits of (read word + carry)
                    UOP_OPCODE_PROPAGATE_CARRIES:
                        //
                        update_narrow_dout(rd_narrow_x_din_x_w_cry_reduced,
                                           rd_narrow_y_din_x_w_cry_reduced,
                                           rd_narrow_x_din_y_w_cry_reduced,
                                           rd_narrow_y_din_y_w_cry_reduced);
                    //
                    // both the *_dout_x and *_dout_y registers are loaded
                    // from the *_din_y read data
                    UOP_OPCODE_COPY_CRT_Y2X: begin
                        //
                        update_wide_dout(wrk_rd_wide_x_din_y,
                                         wrk_rd_wide_y_din_y,
                                         wrk_rd_wide_x_din_y,
                                         wrk_rd_wide_y_din_y);
                        //
                        update_narrow_dout(wrk_rd_narrow_x_din_y,
                                           wrk_rd_narrow_y_din_y,
                                           wrk_rd_narrow_x_din_y,
                                           wrk_rd_narrow_y_din_y);
                        //
                    end
                    //
                    // wide write data taken from the narrow read data
                    UOP_OPCODE_MODULAR_REDUCE_INIT:
                        //
                        update_wide_dout(wrk_rd_narrow_x_din_x,
                                         wrk_rd_narrow_y_din_x,
                                         wrk_rd_narrow_x_din_y,
                                         wrk_rd_narrow_y_din_y);
                    //
                    // narrow write data taken from the wide read data
                    UOP_OPCODE_MERGE_LH:
                        //
                        update_narrow_dout(wrk_rd_wide_x_din_x,
                                           wrk_rd_wide_y_din_x,
                                           wrk_rd_wide_x_din_y,
                                           wrk_rd_wide_y_din_y);
                    //
                endcase
            //
        endcase
        //
        // one_pass_meander
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_BUSY_M2,
            WRK_FSM_STATE_LATENCY_POST1_M2,
            WRK_FSM_STATE_LATENCY_POST2_M2:
                //
                case (opcode)
                    //
                    // all four outputs come from the "x" read port:
                    // *_x_dout_* take the dly3 copy, *_y_dout_* the dly2 copy
                    UOP_OPCODE_COPY_LADDERS_X2Y: begin
                        //
                        update_wide_dout(wrk_rd_wide_x_din_x_dly3,
                                         wrk_rd_wide_x_din_x_dly2,
                                         wrk_rd_wide_x_din_y_dly3,
                                         wrk_rd_wide_x_din_y_dly2);
                        //
                        update_narrow_dout(wrk_rd_narrow_x_din_x_dly3,
                                           wrk_rd_narrow_x_din_x_dly2,
                                           wrk_rd_narrow_x_din_y_dly3,
                                           wrk_rd_narrow_x_din_y_dly2);
                        //
                    end
                    //
                    // same as COPY_LADDERS_X2Y, except that the dly2 sources
                    // are swapped: din_y feeds *_y_dout_x and din_x feeds
                    // *_y_dout_y
                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                        //
                        update_wide_dout(wrk_rd_wide_x_din_x_dly3,
                                         wrk_rd_wide_x_din_y_dly2,
                                         wrk_rd_wide_x_din_y_dly3,
                                         wrk_rd_wide_x_din_x_dly2);
                        //
                        update_narrow_dout(wrk_rd_narrow_x_din_x_dly3,
                                           wrk_rd_narrow_x_din_y_dly2,
                                           wrk_rd_narrow_x_din_y_dly3,
                                           wrk_rd_narrow_x_din_x_dly2);
                        //
                    end
                    //
                endcase
            //
        endcase
        //
        // two_pass
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_BUSY_TP,
            WRK_FSM_STATE_LATENCY_POST1_TP,
            WRK_FSM_STATE_LATENCY_POST2_TP,
            WRK_FSM_STATE_LATENCY_POST3_TP,
            WRK_FSM_STATE_LATENCY_POST4_TP:
                //
                case (opcode)
                    //
                    // wrk_fsm_two_pass_pass low: narrow x-operand gets the
                    // delayed difference, narrow y-operand gets the sum;
                    // wrk_fsm_two_pass_pass high: every output gets the
                    // borrow-selected mux value
                    UOP_OPCODE_MODULAR_SUBTRACT:
                        //
                        if (!wrk_fsm_two_pass_pass)
                            update_narrow_dout(modsub_x_ab_dly_trunc, modsub_x_abn_trunc, modsub_y_ab_dly_trunc, modsub_y_abn_trunc);
                        else begin
                            update_wide_dout  (modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux);
                            update_narrow_dout(modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux);
                        end
                        //
                endcase
            //
        endcase
        //
    end


    //
    // Source Read Address Logic
    //
    
    // Address that will be issued on the following read (one counter for the
    // wide side, one for the narrow side).
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_xy_next;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_xy_next;

    // Sticky flag: set once the wide "next" counter has been seen at its last
    // value (see the one_pass process that drives it).
    reg rd_wide_xy_addr_xy_next_last_seen;

    // The wide counter is compared against word_index_last_half, the narrow
    // counter against word_index_last.
    wire rd_wide_xy_addr_xy_next_is_last = rd_wide_xy_addr_xy_next == word_index_last_half;
    wire rd_narrow_xy_addr_xy_next_is_last = rd_narrow_xy_addr_xy_next == word_index_last;
    
    // Read-address helper tasks.

    // Load the same bank and word address into the X and Y wide read ports.
    task update_rd_wide_bank_addr;
        input [BANK_ADDR_W -1:0] bank_sel;
        input [  OP_ADDR_W -1:0] word_addr;
        begin
            {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank_sel, word_addr};
            {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank_sel, word_addr};
        end
    endtask

    // Change only the wide read bank; each address register is re-loaded
    // with its own current value.
    task update_rd_wide_bank;
        input [BANK_ADDR_W -1:0] bank_sel;
        begin
            {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank_sel, rd_wide_xy_addr_x};
            {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank_sel, rd_wide_xy_addr_y};
        end
    endtask

    // Load the same bank and word address into the X and Y narrow read ports.
    task update_rd_narrow_bank_addr;
        input [BANK_ADDR_W -1:0] bank_sel;
        input [  OP_ADDR_W -1:0] word_addr;
        begin
            {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank_sel, word_addr};
            {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank_sel, word_addr};
        end
    endtask

    // Change only the narrow read bank; each address register is re-loaded
    // with its own current value.
    task update_rd_narrow_bank;
        input [BANK_ADDR_W -1:0] bank_sel;
        begin
            {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank_sel, rd_narrow_xy_addr_x};
            {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank_sel, rd_narrow_xy_addr_y};
        end
    endtask

    // Preset the wide "next address" counter.
    task update_rd_wide_addr_next;
        input [OP_ADDR_W -1:0] word_addr;
        begin
            rd_wide_xy_addr_xy_next <= word_addr;
        end
    endtask

    // Preset the narrow "next address" counter.
    task update_rd_narrow_addr_next;
        input [OP_ADDR_W -1:0] word_addr;
        begin
            rd_narrow_xy_addr_xy_next <= word_addr;
        end
    endtask

    // Step the wide counter: wrap to zero after the last value, else add one.
    task advance_rd_wide_addr_next;
        begin
            rd_wide_xy_addr_xy_next <= rd_wide_xy_addr_xy_next_is_last
                                     ? OP_ADDR_ZERO
                                     : rd_wide_xy_addr_xy_next + 1'b1;
        end
    endtask

    // Step the narrow counter: wrap to zero after the last value, else add one.
    task advance_rd_narrow_addr_next;
        begin
            rd_narrow_xy_addr_xy_next <= rd_narrow_xy_addr_xy_next_is_last
                                       ? OP_ADDR_ZERO
                                       : rd_narrow_xy_addr_xy_next + 1'b1;
        end
    endtask
    
    // Sticky "wide counter reached its last value" flag.
    // Cleared when the next one-pass state is LATENCY_PRE1; while the next
    // state is LATENCY_PRE2 or BUSY it follows the is_last comparator until
    // it has been set once, after which it stays set.
    always @(posedge clk) begin
        case (wrk_fsm_state_next_one_pass)
            WRK_FSM_STATE_LATENCY_PRE1: begin
                rd_wide_xy_addr_xy_next_last_seen <= 1'b0;
            end
            WRK_FSM_STATE_LATENCY_PRE2,
            WRK_FSM_STATE_BUSY: begin
                if (!rd_wide_xy_addr_xy_next_last_seen) begin
                    rd_wide_xy_addr_xy_next_last_seen <= rd_wide_xy_addr_xy_next_is_last;
                end
            end
            default: ; // hold
        endcase
    end

    // Registered source read bank/address generation.
    // Bank and address registers default to don't-care every clock; the case
    // statements below (keyed on the *next* FSM state of each flavour)
    // override the default. The general pattern is: in the first PRE state
    // issue address zero and preset the "next" counter to one, then in the
    // following states issue the "next" counter and advance it.
    always @(posedge clk) begin
        //
        // defaults (don't care)
        update_rd_wide_bank_addr  (BANK_DNC, OP_ADDR_DNC);
        update_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC);
        //
        // one_pass
        //
        case (wrk_fsm_state_next_one_pass)
            //
            WRK_FSM_STATE_LATENCY_PRE1:
                //
                case (opcode)
                    //
                    UOP_OPCODE_PROPAGATE_CARRIES,
                    UOP_OPCODE_OUTPUT_FROM_NARROW,
                    UOP_OPCODE_COPY_CRT_Y2X,
                    UOP_OPCODE_MODULAR_REDUCE_INIT: begin
                        //
                        update_rd_wide_bank_addr  (sel_wide_in,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
                        update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
                        //
                    end
                    //
                    // the wide side starts in BANK_WIDE_L rather than sel_wide_in
                    UOP_OPCODE_MERGE_LH: begin
                        update_rd_wide_bank_addr  (BANK_WIDE_L,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
                        update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
                    end
                    //
                endcase
                //
            WRK_FSM_STATE_LATENCY_PRE2,
            WRK_FSM_STATE_BUSY:
                //
                case (opcode)
                    //
                    // note: the wide read address is driven from the *narrow*
                    // counter here, while the wide counter is still advanced
                    UOP_OPCODE_PROPAGATE_CARRIES,
                    UOP_OPCODE_OUTPUT_FROM_NARROW,
                    UOP_OPCODE_COPY_CRT_Y2X: begin
                        //
                        update_rd_wide_bank_addr  (sel_wide_in,   rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
                        update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                        //
                    end
                    //
                    // here each side uses its own counter
                    UOP_OPCODE_MODULAR_REDUCE_INIT: begin
                        //
                        update_rd_wide_bank_addr  (sel_wide_in,   rd_wide_xy_addr_xy_next  ); advance_rd_wide_addr_next  ;
                        update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                        //
                    end
                    //
                    // wide side reads BANK_WIDE_L until the last_seen flag is
                    // set, BANK_WIDE_H afterwards
                    UOP_OPCODE_MERGE_LH: begin
                        //
                        if (!rd_wide_xy_addr_xy_next_last_seen) update_rd_wide_bank_addr  (BANK_WIDE_L,   rd_wide_xy_addr_xy_next  ); 
                        else                                    update_rd_wide_bank_addr  (BANK_WIDE_H,   rd_wide_xy_addr_xy_next  );
                                                                                                                                      advance_rd_wide_addr_next  ;
                                                                update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                        //
                    end
                    //
                endcase
            //
        endcase
        //
        // one_pass_meander
        //
        case (wrk_fsm_state_next_one_pass_meander)
            //
            // M1 states read from the sel_*_out banks
            WRK_FSM_STATE_LATENCY_PRE1_M1:
                case (opcode)
                    UOP_OPCODE_COPY_LADDERS_X2Y,
                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                        update_rd_wide_bank_addr  (sel_wide_out,   OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
                        update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
                    end
                endcase
            //
            WRK_FSM_STATE_LATENCY_PRE2_M1,
            WRK_FSM_STATE_BUSY_M1:
                case (opcode)
                    UOP_OPCODE_COPY_LADDERS_X2Y,
                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                        update_rd_wide_bank_addr  (sel_wide_out,   rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
                        update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                        //
                    end
                    //
                endcase
            //
            // M2 states keep the current read address (overriding the
            // don't-care default) and only switch the bank to sel_*_in
            WRK_FSM_STATE_LATENCY_PRE1_M2,
            WRK_FSM_STATE_LATENCY_PRE2_M2,
            WRK_FSM_STATE_BUSY_M2:
                case (opcode)
                    UOP_OPCODE_COPY_LADDERS_X2Y,
                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                        update_rd_wide_bank  (sel_wide_in  );
                        update_rd_narrow_bank(sel_narrow_in);
                    end
                endcase
            //
        endcase
        //
        // two_pass
        //
        case (wrk_fsm_state_next_two_pass)
            //
            WRK_FSM_STATE_LATENCY_PRE1_TP:
                //
                case (opcode)
                    //
                    // wrk_fsm_two_pass_pass low: read BANK_WIDE_N and sel_narrow_in;
                    // wrk_fsm_two_pass_pass high: read sel_narrow_out only
                    UOP_OPCODE_MODULAR_SUBTRACT:
                        //
                        if (!wrk_fsm_two_pass_pass) begin
                            update_rd_wide_bank_addr  (BANK_WIDE_N,    OP_ADDR_ZERO); update_rd_wide_addr_next  (OP_ADDR_ONE);
                            update_rd_narrow_bank_addr(sel_narrow_in,  OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
                        end else begin
                            update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE);
                        end
                    //
                endcase
                //
            WRK_FSM_STATE_LATENCY_PRE2_TP,
            WRK_FSM_STATE_LATENCY_PRE3_TP,
            WRK_FSM_STATE_LATENCY_PRE4_TP,
            WRK_FSM_STATE_BUSY_TP:
                //
                case (opcode)
                    //
                    // as above; the wide address again follows the narrow counter
                    UOP_OPCODE_MODULAR_SUBTRACT:
                        //
                        if (!wrk_fsm_two_pass_pass) begin
                            update_rd_wide_bank_addr  (BANK_WIDE_N,    rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next  ;
                            update_rd_narrow_bank_addr(sel_narrow_in,  rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                        end else begin
                            update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next;
                        end
                    //
                endcase
                //
        endcase
        //
    end


    //
    // Destination Write Address Logic
    //
    
    // High while the twice-delayed narrow read address is less than or equal
    // to word_index_last_half (unsigned compare, one flag per X/Y port).
    wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half;
    wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half;

    // Destination bank for UOP_OPCODE_MODULAR_REDUCE_INIT: BANK_WIDE_L while
    // the flag above is high, BANK_WIDE_H otherwise.
    wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_x = uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H;
    wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_y = uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H;

    // Write-address helper tasks. Unlike the read-side helpers, the X and Y
    // ports get independent bank and address arguments.

    // Load bank/address of the wide X and Y write ports.
    task update_wr_wide_bank_addr;
        input [BANK_ADDR_W -1:0] x_bank, y_bank;
        input [  OP_ADDR_W -1:0] x_addr, y_addr;
        begin
            {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {y_bank, y_addr};
            {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {x_bank, x_addr};
        end
    endtask

    // Load bank/address of the narrow X and Y write ports.
    task update_wr_narrow_bank_addr;
        input [BANK_ADDR_W -1:0] x_bank, y_bank;
        input [  OP_ADDR_W -1:0] x_addr, y_addr;
        begin
            {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {y_bank, y_addr};
            {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {x_bank, x_addr};
        end
    endtask
    
    // Registered destination write bank/address generation.
    // Bank and address registers default to don't-care every clock; the case
    // statements below override the default. Write addresses are delayed
    // copies of the read addresses: dly2 for the one_pass opcodes, dly4 for
    // the meander and two_pass opcodes.
    always @(posedge clk) begin
        //
        // defaults (don't care)
        update_wr_wide_bank_addr  (BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC);
        update_wr_narrow_bank_addr(BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC);
        //
        // one_pass
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_BUSY,
            WRK_FSM_STATE_LATENCY_POST1,
            WRK_FSM_STATE_LATENCY_POST2:
                //
                case (opcode)
                    //
                    // both sides are addressed with the delayed narrow read address
                    UOP_OPCODE_PROPAGATE_CARRIES,
                    UOP_OPCODE_COPY_CRT_Y2X: begin
                        update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
                    end
                    //
                    // bank chosen per port by uop_modular_reduce_init_bank_*,
                    // address is the delayed wide read address
                    UOP_OPCODE_MODULAR_REDUCE_INIT:
                        update_wr_wide_bank_addr(uop_modular_reduce_init_bank_x, uop_modular_reduce_init_bank_y, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_y_dly2);                    
                    //
                    UOP_OPCODE_MERGE_LH:
                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2);
                    //
                endcase
                //
        endcase
        //
        // one_pass_meander
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_BUSY_M2,
            WRK_FSM_STATE_LATENCY_POST1_M2,
            WRK_FSM_STATE_LATENCY_POST2_M2:
                //        
                case (opcode)
                    UOP_OPCODE_COPY_LADDERS_X2Y,
                    UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                        update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
                        update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
                    end
                endcase
                //
        endcase
        //
        // two_pass
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_BUSY_TP,
            WRK_FSM_STATE_LATENCY_POST1_TP,
            WRK_FSM_STATE_LATENCY_POST2_TP,
            WRK_FSM_STATE_LATENCY_POST3_TP,
            WRK_FSM_STATE_LATENCY_POST4_TP:
                //
                case (opcode)
                    //
                    // wrk_fsm_two_pass_pass low: narrow address only;
                    // wrk_fsm_two_pass_pass high: wide and narrow addresses
                    UOP_OPCODE_MODULAR_SUBTRACT:
                        //
                        if (!wrk_fsm_two_pass_pass) begin
                            update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);                     
                        end else begin
                            update_wr_wide_bank_addr  (sel_wide_out,   sel_wide_out,   rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
                            update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4);
                        end 
                        //
                    endcase
                //
            endcase
        //
    end


    //
    // FSM Process
    //

    // FSM state register.
    // Synchronous reset to IDLE. Otherwise the opcode selects which of the
    // three next-state signals (one_pass, one_pass_meander, two_pass) is
    // loaded; any other opcode sends the FSM to IDLE.
    always @(posedge clk) begin
        if (rst) begin
            wrk_fsm_state <= WRK_FSM_STATE_IDLE;
        end else begin
            case (opcode)
                // one_pass opcodes
                UOP_OPCODE_PROPAGATE_CARRIES,
                UOP_OPCODE_OUTPUT_FROM_NARROW,
                UOP_OPCODE_COPY_CRT_Y2X,
                UOP_OPCODE_MODULAR_REDUCE_INIT,
                UOP_OPCODE_MERGE_LH: begin
                    wrk_fsm_state <= wrk_fsm_state_next_one_pass;
                end
                // one_pass_meander opcodes
                UOP_OPCODE_COPY_LADDERS_X2Y,
                UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                    wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander;
                end
                // two_pass opcode
                UOP_OPCODE_MODULAR_SUBTRACT: begin
                    wrk_fsm_state <= wrk_fsm_state_next_two_pass;
                end
                default: begin
                    wrk_fsm_state <= WRK_FSM_STATE_IDLE;
                end
            endcase
        end
    end
    
  
    //
    // Busy Exit Logic
    //
    
    // One "done" flag per FSM flavour; each is consumed by the matching
    // next-state function to leave its busy state.
    reg wrk_fsm_done_one_pass         = 1'b0;
    reg wrk_fsm_done_one_pass_meander = 1'b0;
    reg wrk_fsm_done_two_pass         = 1'b0;

    always @(posedge clk) begin
        //
        // Defaults: all flags are cleared every cycle. The assignments below
        // are later non-blocking writes in the same block and therefore win.
        //
        wrk_fsm_done_one_pass         <= 1'b0;
        wrk_fsm_done_one_pass_meander <= 1'b0;
        wrk_fsm_done_two_pass         <= 1'b0;
        //
        case (opcode)
            //
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_OUTPUT_FROM_NARROW,
            UOP_OPCODE_COPY_CRT_Y2X,
            UOP_OPCODE_MODULAR_REDUCE_INIT,
            UOP_OPCODE_MERGE_LH: begin
                // raise the flag while busy once the next read address is the last one
                if (wrk_fsm_state == WRK_FSM_STATE_BUSY) begin
                    if (rd_narrow_xy_addr_xy_next_is_last)
                        wrk_fsm_done_one_pass <= 1'b1;
                end
            end
            //
            UOP_OPCODE_COPY_LADDERS_X2Y,
            UOP_OPCODE_CROSS_LADDERS_X2Y: begin
                if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M2) begin
                    // the flag is only (re)evaluated in the M2 half of the busy pair
                    if (rd_narrow_xy_addr_xy_next_is_last)
                        wrk_fsm_done_one_pass_meander <= 1'b1;
                end else if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M1) begin
                    // hold through M1, so that the value is still present when
                    // the meander next-state logic tests it in the following M2
                    wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander;
                end
            end
            //
            UOP_OPCODE_MODULAR_SUBTRACT: begin
                if (wrk_fsm_state == WRK_FSM_STATE_BUSY_TP) begin
                    if (rd_narrow_xy_addr_xy_next_is_last)
                        wrk_fsm_done_two_pass <= 1'b1;
                end
            end
            //
        endcase
        //
    end
    
    
    //
    // FSM Helper Logic
    //    
    always @(posedge clk) begin
        //
        // Pass bookkeeping for the two-pass sequence. Both flags have no reset
        // term here; they are cleared when an operation is started from IDLE
        // and keep their value in every state not listed below.
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_IDLE:
                if (ena) begin
                    wrk_fsm_two_pass_pass     <= 1'b0;
                    wrk_fsm_two_pass_pass_dly <= 1'b0;
                end
            //
            // set when the POST4_TP state is reached
            WRK_FSM_STATE_LATENCY_POST4_TP:
                wrk_fsm_two_pass_pass <= 1'b1;
            //
            // set one state later, in HOLDOFF_TP; the two-pass next-state logic
            // tests this delayed flag to choose between STOP and another pass
            WRK_FSM_STATE_HOLDOFF_TP:
                wrk_fsm_two_pass_pass_dly <= 1'b1;
            //
        endcase
        //
    end


    //
    // FSM Transition Logic
    //
    always @* begin
        //
        // Next-state function of the one-pass sequence:
        // IDLE -> PRE1 -> PRE2 -> BUSY (until done) -> POST1 -> POST2 -> STOP -> IDLE
        //
        // Fall-back value: any state that is not part of this sequence leads to IDLE.
        //
        wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE;
        //
        case (wrk_fsm_state)
            WRK_FSM_STATE_IDLE:
                wrk_fsm_state_next_one_pass = ena ? WRK_FSM_STATE_LATENCY_PRE1 : WRK_FSM_STATE_IDLE;
            WRK_FSM_STATE_LATENCY_PRE1:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_PRE2;
            WRK_FSM_STATE_LATENCY_PRE2:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_BUSY;
            WRK_FSM_STATE_BUSY:
                wrk_fsm_state_next_one_pass = wrk_fsm_done_one_pass ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY;
            WRK_FSM_STATE_LATENCY_POST1:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_POST2;
            WRK_FSM_STATE_LATENCY_POST2:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_STOP;
            WRK_FSM_STATE_STOP:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE;
        endcase
        //
    end
    
    always @* begin
        //
        // Next-state function of the "meander" sequence. Every step of the
        // sequence is split into an _M1 state followed by an _M2 state:
        // IDLE -> PRE1_M1/M2 -> PRE2_M1/M2 -> BUSY_M1/M2 (repeated until done)
        //      -> POST1_M1/M2 -> POST2_M1/M2 -> STOP -> IDLE
        //
        // Fall-back value: any state outside this sequence leads to IDLE.
        //
        wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE;
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_IDLE:
                wrk_fsm_state_next_one_pass_meander = ena ? WRK_FSM_STATE_LATENCY_PRE1_M1 : WRK_FSM_STATE_IDLE;
            //
            WRK_FSM_STATE_LATENCY_PRE1_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE1_M2;
            WRK_FSM_STATE_LATENCY_PRE1_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M1;
            WRK_FSM_STATE_LATENCY_PRE2_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M2;
            WRK_FSM_STATE_LATENCY_PRE2_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M1;
            //
            // the done flag is only examined in BUSY_M2; BUSY_M1 always advances
            //
            WRK_FSM_STATE_BUSY_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M2;
            WRK_FSM_STATE_BUSY_M2:
                wrk_fsm_state_next_one_pass_meander = wrk_fsm_done_one_pass_meander ? WRK_FSM_STATE_LATENCY_POST1_M1 : WRK_FSM_STATE_BUSY_M1;
            //
            WRK_FSM_STATE_LATENCY_POST1_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST1_M2;
            WRK_FSM_STATE_LATENCY_POST1_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M1;
            WRK_FSM_STATE_LATENCY_POST2_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M2;
            WRK_FSM_STATE_LATENCY_POST2_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_STOP;
            //
            WRK_FSM_STATE_STOP:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE;
            //
        endcase
        //
    end
    
    always @* begin
        //
        // Next-state function of the two-pass sequence:
        // IDLE -> PRE1..PRE4_TP -> BUSY_TP (until done) -> POST1..POST4_TP -> HOLDOFF_TP
        // From HOLDOFF_TP the sequence either restarts at PRE1_TP or, once
        // wrk_fsm_two_pass_pass_dly is set, finishes via STOP -> IDLE.
        //
        // Fall-back value: any state outside this sequence leads to IDLE.
        //
        wrk_fsm_state_next_two_pass = WRK_FSM_STATE_IDLE;
        //
        case (wrk_fsm_state)
            //
            WRK_FSM_STATE_IDLE:
                wrk_fsm_state_next_two_pass = ena ? WRK_FSM_STATE_LATENCY_PRE1_TP : WRK_FSM_STATE_IDLE;
            //
            WRK_FSM_STATE_LATENCY_PRE1_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE2_TP;
            WRK_FSM_STATE_LATENCY_PRE2_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE3_TP;
            WRK_FSM_STATE_LATENCY_PRE3_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE4_TP;
            WRK_FSM_STATE_LATENCY_PRE4_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_BUSY_TP;
            //
            WRK_FSM_STATE_BUSY_TP:
                wrk_fsm_state_next_two_pass = wrk_fsm_done_two_pass ? WRK_FSM_STATE_LATENCY_POST1_TP : WRK_FSM_STATE_BUSY_TP;
            //
            WRK_FSM_STATE_LATENCY_POST1_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST2_TP;
            WRK_FSM_STATE_LATENCY_POST2_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST3_TP;
            WRK_FSM_STATE_LATENCY_POST3_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST4_TP;
            WRK_FSM_STATE_LATENCY_POST4_TP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_HOLDOFF_TP;
            //
            WRK_FSM_STATE_HOLDOFF_TP:
                wrk_fsm_state_next_two_pass = wrk_fsm_two_pass_pass_dly ? WRK_FSM_STATE_STOP : WRK_FSM_STATE_LATENCY_PRE1_TP;
            //
            WRK_FSM_STATE_STOP:
                wrk_fsm_state_next_two_pass = WRK_FSM_STATE_IDLE;
            //
        endcase
        //
    end
    
    
    //
    // Ready Logic
    //
    // Registered ready flag, high after initialisation and after reset.
    reg rdy_reg = 1'b1;

    assign rdy = rdy_reg;

    always @(posedge clk) begin
        //
        if (rst) begin
            rdy_reg <= 1'b1;
        end else if (wrk_fsm_state == WRK_FSM_STATE_IDLE) begin
            // drops in the cycle an operation is accepted (ena seen in IDLE)
            rdy_reg <= ~ena;
        end else if (wrk_fsm_state == WRK_FSM_STATE_STOP) begin
            // raised again when the FSM passes through STOP
            rdy_reg <= 1'b1;
        end
        // every other state: rdy_reg keeps its value
        //
    end


endmodule