aboutsummaryrefslogblamecommitdiff
path: root/rtl/modexpng_general_worker.v
blob: 269ef98b7932f047d92551e6b86cfe07b60faaa3 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16















                              
                         









































































                                                             
                                                                  




















































                                                                     
                                                            
    





















                                                                                                                                                        

 


                                                                                                                            


























































































                                                          







                                                


                                                  

                                                  

                                                  


















                                                     


                               




                                                                                                                             



                                                                                                                








                                                                                                                                                                                    
       







































                                                                                                          

                                               





















                                                                                    
                                                 
                                                   



                              

















                                                              

















































                                                                                                          
                                                   













                                                          



























                                                             

















                                               
                                                                                                                       







































































                                                                                                                              






































                                                                         








                          


                                                                                               



























                                                                                                                                   



































                                                                                                                                   



               

 


                         


                                                


                                                  




                                                                                         
    
                                                                                                    





















                                                                                                                               


                                                              













                                                                                                                                  


























































































                                                                                                                                   




                                                                                    










                                                                                                           
















                                                     


                                                                                                 






                                                                                

                                              


                               

                                              




                                          

                                                 









                                                                                                                                      













                                                                                                                                              
















                                                                                                                                                 
























                                                                                                                                                                                     





















                                                  
module modexpng_general_worker
(
    clk,
    rst,
    
    ena,
    rdy,
    
    sel_narrow_in,
    sel_narrow_out,
    sel_wide_in,
    sel_wide_out,
    
    opcode,
    
    word_index_last,
    word_index_last_half,
    
    wrk_rd_wide_xy_ena_x,
    wrk_rd_wide_xy_bank_x,
    wrk_rd_wide_xy_addr_x,
    wrk_rd_wide_x_din_x,
    wrk_rd_wide_y_din_x,

    wrk_rd_narrow_xy_ena_x,
    wrk_rd_narrow_xy_bank_x,
    wrk_rd_narrow_xy_addr_x,
    wrk_rd_narrow_x_din_x,
    wrk_rd_narrow_y_din_x,

    wrk_rd_wide_xy_ena_y,
    wrk_rd_wide_xy_bank_y,
    wrk_rd_wide_xy_addr_y,
    wrk_rd_wide_x_din_y,
    wrk_rd_wide_y_din_y,

    wrk_rd_narrow_xy_ena_y,
    wrk_rd_narrow_xy_bank_y,
    wrk_rd_narrow_xy_addr_y,
    wrk_rd_narrow_x_din_y,
    wrk_rd_narrow_y_din_y,
    
    wrk_wr_wide_xy_ena_x,
    wrk_wr_wide_xy_bank_x,
    wrk_wr_wide_xy_addr_x,
    wrk_wr_wide_x_dout_x,
    wrk_wr_wide_y_dout_x,

    wrk_wr_narrow_xy_ena_x,
    wrk_wr_narrow_xy_bank_x,
    wrk_wr_narrow_xy_addr_x,
    wrk_wr_narrow_x_dout_x,
    wrk_wr_narrow_y_dout_x,

    wrk_wr_wide_xy_ena_y,
    wrk_wr_wide_xy_bank_y,
    wrk_wr_wide_xy_addr_y,
    wrk_wr_wide_x_dout_y,
    wrk_wr_wide_y_dout_y,

    wrk_wr_narrow_xy_ena_y,
    wrk_wr_narrow_xy_bank_y,
    wrk_wr_narrow_xy_addr_y,
    wrk_wr_narrow_x_dout_y,
    wrk_wr_narrow_y_dout_y
);

    //
    // Headers
    //
    `include "modexpng_parameters.vh"
    `include "modexpng_microcode.vh"

    
    //
    // Ports
    //
    // Clock and reset. rst is tested inside the posedge-clk blocks of the
    // enable logic, i.e. it is sampled synchronously there.
    input                                    clk;
    input                                    rst;

    // NOTE(review): presumably a start/done handshake -- the logic that
    // consumes ena and drives rdy is not in this part of the file; confirm.
    input                                    ena;
    output                                   rdy;
    
    // Bank selectors. sel_wide_out / sel_narrow_out are used as the write bank
    // in the write address logic; the use of the *_in selectors is not visible
    // in this part of the file.
    input  [              BANK_ADDR_W  -1:0] sel_narrow_in; 
    input  [              BANK_ADDR_W  -1:0] sel_narrow_out; 
    input  [              BANK_ADDR_W  -1:0] sel_wide_in; 
    input  [              BANK_ADDR_W  -1:0] sel_wide_out; 
    
    // Micro-operation code; the enable, data and address logic all switch on it.
    input  [              UOP_OPCODE_W -1:0] opcode;
    
    // word_index_last_half is compared against the delayed narrow read address
    // in the write address logic. The use of word_index_last is not visible in
    // this part of the file.
    input  [              OP_ADDR_W    -1:0] word_index_last;
    input  [              OP_ADDR_W    -1:0] word_index_last_half;
    
    // Read ports: each has an enable, a bank number and a word address going
    // out, and two data words (x_din / y_din) coming back in.
    // NOTE(review): the trailing _x / _y presumably distinguishes two parallel
    // memory channels, and wide / narrow two kinds of storage -- confirm
    // against the storage block this module is connected to.
    output                                   wrk_rd_wide_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_rd_wide_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_rd_wide_xy_addr_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_x_din_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_y_din_x;

    output                                   wrk_rd_narrow_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_rd_narrow_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_rd_narrow_xy_addr_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_x_din_x;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_y_din_x;
    
    output                                   wrk_rd_wide_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_rd_wide_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_rd_wide_xy_addr_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_x_din_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_wide_y_din_y;

    output                                   wrk_rd_narrow_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_rd_narrow_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_rd_narrow_xy_addr_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_x_din_y;
    input  [              WORD_EXT_W   -1:0] wrk_rd_narrow_y_din_y;

    // Write ports: each has an enable, a bank number, a word address and two
    // data words (x_dout / y_dout), all driven by this module.
    output                                   wrk_wr_wide_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_wr_wide_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_wr_wide_xy_addr_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_x_dout_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_y_dout_x;

    output                                   wrk_wr_narrow_xy_ena_x;
    output [              BANK_ADDR_W  -1:0] wrk_wr_narrow_xy_bank_x;
    output [              OP_ADDR_W    -1:0] wrk_wr_narrow_xy_addr_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_x_dout_x;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_y_dout_x;
    
    output                                   wrk_wr_wide_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_wr_wide_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_wr_wide_xy_addr_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_x_dout_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_wide_y_dout_y;

    output                                   wrk_wr_narrow_xy_ena_y;
    output [              BANK_ADDR_W  -1:0] wrk_wr_narrow_xy_bank_y;
    output [              OP_ADDR_W    -1:0] wrk_wr_narrow_xy_addr_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_x_dout_y;
    output [              WORD_EXT_W   -1:0] wrk_wr_narrow_y_dout_y;


    //
    // FSM Declaration
    //
    // 5-bit state encodings. Two families share the same state register:
    //   * 5'h00..5'h07 - states checked by the "one pass" opcodes
    //   * 5'h10..5'h19 - "_M1"/"_M2" states checked by UOP_OPCODE_COPY_LADDERS_X2Y
    // The transition logic itself is not in this part of the file.
    localparam [4:0] WRK_FSM_STATE_IDLE             = 5'h00;
    
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1     = 5'h01;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2     = 5'h02;
    localparam [4:0] WRK_FSM_STATE_BUSY             = 5'h03;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1    = 5'h05;    // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug!
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2    = 5'h06;
    
    localparam [4:0] WRK_FSM_STATE_STOP             = 5'h07;
    
    // Meander variants: every phase exists twice (_M1 and _M2). In the write
    // enable and data logic only the _M2 states of BUSY/POST1/POST2 trigger a
    // write for COPY_LADDERS_X2Y.
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M1  = 5'h10;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M2  = 5'h11;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M1  = 5'h12;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M2  = 5'h13;
    localparam [4:0] WRK_FSM_STATE_BUSY_M1          = 5'h14;
    localparam [4:0] WRK_FSM_STATE_BUSY_M2          = 5'h15;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 5'h16;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 5'h17;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 5'h18;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 5'h19;
    
    // Current state (initialised to IDLE) and the two candidate next-state
    // values; the read enable logic looks at the next-state values, the write
    // enable and data logic look at the current state.
    reg [4:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;
    reg [4:0] wrk_fsm_state_next_one_pass;         // single address space sweep
    reg [4:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y)


    // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps)
    

    //
    // Control Signals
    //
    // Registers behind the wrk_rd_* / wrk_wr_* output ports (they are wired to
    // the ports one-to-one by the continuous assignments in the mapping
    // section). Only the enable flags carry an initial value (1'b0); bank,
    // address and data registers are declared without one.

    // Read side: enable, bank and address per port.
    reg                    rd_wide_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_x;
    reg [  OP_ADDR_W -1:0] rd_wide_xy_addr_x; 

    reg                    rd_narrow_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_x;
    reg [  OP_ADDR_W -1:0] rd_narrow_xy_addr_x; 

    reg                    rd_wide_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_y;
    reg [  OP_ADDR_W -1:0] rd_wide_xy_addr_y; 

    reg                    rd_narrow_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_y;
    reg [  OP_ADDR_W -1:0] rd_narrow_xy_addr_y; 
    
    // Write side: enable, bank, address and the two data words per port.
    reg                    wr_wide_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_wide_xy_bank_x;
    reg [  OP_ADDR_W -1:0] wr_wide_xy_addr_x;
    reg [ WORD_EXT_W -1:0] wr_wide_x_dout_x;
    reg [ WORD_EXT_W -1:0] wr_wide_y_dout_x;

    reg                    wr_narrow_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_narrow_xy_bank_x;
    reg [  OP_ADDR_W -1:0] wr_narrow_xy_addr_x;
    reg [ WORD_EXT_W -1:0] wr_narrow_x_dout_x;
    reg [ WORD_EXT_W -1:0] wr_narrow_y_dout_x;

    reg                    wr_wide_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_wide_xy_bank_y;
    reg [  OP_ADDR_W -1:0] wr_wide_xy_addr_y;
    reg [ WORD_EXT_W -1:0] wr_wide_x_dout_y;
    reg [ WORD_EXT_W -1:0] wr_wide_y_dout_y;

    reg                    wr_narrow_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0] wr_narrow_xy_bank_y;
    reg [  OP_ADDR_W -1:0] wr_narrow_xy_addr_y;
    reg [ WORD_EXT_W -1:0] wr_narrow_x_dout_y;
    reg [ WORD_EXT_W -1:0] wr_narrow_y_dout_y;


    //
    // Mapping
    //
    // Every wrk_* output port is driven directly by the internal register of
    // the same name (minus the "wrk_" prefix). Grouped by channel suffix.

    // ---- "_x" channel: read ports ----
    assign wrk_rd_wide_xy_ena_x = rd_wide_xy_ena_x;
    assign wrk_rd_wide_xy_bank_x = rd_wide_xy_bank_x;
    assign wrk_rd_wide_xy_addr_x = rd_wide_xy_addr_x;
    assign wrk_rd_narrow_xy_ena_x = rd_narrow_xy_ena_x;
    assign wrk_rd_narrow_xy_bank_x = rd_narrow_xy_bank_x;
    assign wrk_rd_narrow_xy_addr_x = rd_narrow_xy_addr_x;

    // ---- "_x" channel: write ports ----
    assign wrk_wr_wide_xy_ena_x = wr_wide_xy_ena_x;
    assign wrk_wr_wide_xy_bank_x = wr_wide_xy_bank_x;
    assign wrk_wr_wide_xy_addr_x = wr_wide_xy_addr_x;
    assign wrk_wr_wide_x_dout_x = wr_wide_x_dout_x;
    assign wrk_wr_wide_y_dout_x = wr_wide_y_dout_x;
    assign wrk_wr_narrow_xy_ena_x = wr_narrow_xy_ena_x;
    assign wrk_wr_narrow_xy_bank_x = wr_narrow_xy_bank_x;
    assign wrk_wr_narrow_xy_addr_x = wr_narrow_xy_addr_x;
    assign wrk_wr_narrow_x_dout_x = wr_narrow_x_dout_x;
    assign wrk_wr_narrow_y_dout_x = wr_narrow_y_dout_x;

    // ---- "_y" channel: read ports ----
    assign wrk_rd_wide_xy_ena_y = rd_wide_xy_ena_y;
    assign wrk_rd_wide_xy_bank_y = rd_wide_xy_bank_y;
    assign wrk_rd_wide_xy_addr_y = rd_wide_xy_addr_y;
    assign wrk_rd_narrow_xy_ena_y = rd_narrow_xy_ena_y;
    assign wrk_rd_narrow_xy_bank_y = rd_narrow_xy_bank_y;
    assign wrk_rd_narrow_xy_addr_y = rd_narrow_xy_addr_y;

    // ---- "_y" channel: write ports ----
    assign wrk_wr_wide_xy_ena_y = wr_wide_xy_ena_y;
    assign wrk_wr_wide_xy_bank_y = wr_wide_xy_bank_y;
    assign wrk_wr_wide_xy_addr_y = wr_wide_xy_addr_y;
    assign wrk_wr_wide_x_dout_y = wr_wide_x_dout_y;
    assign wrk_wr_wide_y_dout_y = wr_wide_y_dout_y;
    assign wrk_wr_narrow_xy_ena_y = wr_narrow_xy_ena_y;
    assign wrk_wr_narrow_xy_bank_y = wr_narrow_xy_bank_y;
    assign wrk_wr_narrow_xy_addr_y = wr_narrow_xy_addr_y;
    assign wrk_wr_narrow_x_dout_y = wr_narrow_x_dout_y;
    assign wrk_wr_narrow_y_dout_y = wr_narrow_y_dout_y;
   
   
    //
    // Delays
    //    
    // Delay lines, all shifted once per clock by the always block that follows.
    // *_dlyN holds the value its source signal had N clocks earlier.

    // Read addresses, wide ports: 4 stages each.
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly1;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly2;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly3;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly4;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly1;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly2;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly3;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly4;

    // Read addresses, narrow ports: 4 stages each.
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly1;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly2;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly3;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly4;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly1;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly2;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly3;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly4;
    
    // Read data, x_din word only: 3 stages each. The _dly2 and _dly3 taps are
    // consumed by the COPY_LADDERS_X2Y branch of the data logic.
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly3;
    
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly3;
    
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3;
    
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3;

    
    // Free-running shift registers (no reset, no enable): on every rising
    // clock edge each stage takes the previous stage's old value. All
    // assignments are non-blocking, so their textual order is irrelevant.
    always @(posedge clk) begin

        // wide read address, 4 stages
        rd_wide_xy_addr_x_dly1 <= rd_wide_xy_addr_x;
        rd_wide_xy_addr_x_dly2 <= rd_wide_xy_addr_x_dly1;
        rd_wide_xy_addr_x_dly3 <= rd_wide_xy_addr_x_dly2;
        rd_wide_xy_addr_x_dly4 <= rd_wide_xy_addr_x_dly3;

        rd_wide_xy_addr_y_dly1 <= rd_wide_xy_addr_y;
        rd_wide_xy_addr_y_dly2 <= rd_wide_xy_addr_y_dly1;
        rd_wide_xy_addr_y_dly3 <= rd_wide_xy_addr_y_dly2;
        rd_wide_xy_addr_y_dly4 <= rd_wide_xy_addr_y_dly3;

        // narrow read address, 4 stages
        rd_narrow_xy_addr_x_dly1 <= rd_narrow_xy_addr_x;
        rd_narrow_xy_addr_x_dly2 <= rd_narrow_xy_addr_x_dly1;
        rd_narrow_xy_addr_x_dly3 <= rd_narrow_xy_addr_x_dly2;
        rd_narrow_xy_addr_x_dly4 <= rd_narrow_xy_addr_x_dly3;

        rd_narrow_xy_addr_y_dly1 <= rd_narrow_xy_addr_y;
        rd_narrow_xy_addr_y_dly2 <= rd_narrow_xy_addr_y_dly1;
        rd_narrow_xy_addr_y_dly3 <= rd_narrow_xy_addr_y_dly2;
        rd_narrow_xy_addr_y_dly4 <= rd_narrow_xy_addr_y_dly3;

        // wide read data (x_din word), 3 stages
        wrk_rd_wide_x_din_x_dly1 <= wrk_rd_wide_x_din_x;
        wrk_rd_wide_x_din_x_dly2 <= wrk_rd_wide_x_din_x_dly1;
        wrk_rd_wide_x_din_x_dly3 <= wrk_rd_wide_x_din_x_dly2;

        wrk_rd_wide_x_din_y_dly1 <= wrk_rd_wide_x_din_y;
        wrk_rd_wide_x_din_y_dly2 <= wrk_rd_wide_x_din_y_dly1;
        wrk_rd_wide_x_din_y_dly3 <= wrk_rd_wide_x_din_y_dly2;

        // narrow read data (x_din word), 3 stages
        wrk_rd_narrow_x_din_x_dly1 <= wrk_rd_narrow_x_din_x;
        wrk_rd_narrow_x_din_x_dly2 <= wrk_rd_narrow_x_din_x_dly1;
        wrk_rd_narrow_x_din_x_dly3 <= wrk_rd_narrow_x_din_x_dly2;

        wrk_rd_narrow_x_din_y_dly1 <= wrk_rd_narrow_x_din_y;
        wrk_rd_narrow_x_din_y_dly2 <= wrk_rd_narrow_x_din_y_dly1;
        wrk_rd_narrow_x_din_y_dly3 <= wrk_rd_narrow_x_din_y_dly2;

    end
        

    //
    // Read Enable Logic
    //
    
    // Helper tasks for the read enable flags. The "_x" and "_y" flags of a
    // port pair are always written together with the same value.

    // Schedule both wide read enables to take the value _en.
    task _update_wide_xy_rd_en;
        input _en;
        begin
            rd_wide_xy_ena_x <= _en;
            rd_wide_xy_ena_y <= _en;
        end
    endtask

    // Schedule both narrow read enables to take the value _en.
    task _update_narrow_xy_rd_en;
        input _en;
        begin
            rd_narrow_xy_ena_x <= _en;
            rd_narrow_xy_ena_y <= _en;
        end
    endtask

    // Wide pair: switch on / off.
    task enable_wide_xy_rd_en;
        _update_wide_xy_rd_en(1'b1);
    endtask

    task disable_wide_xy_rd_en;
        _update_wide_xy_rd_en(1'b0);
    endtask

    // Narrow pair: switch on / off.
    task enable_narrow_xy_rd_en;
        _update_narrow_xy_rd_en(1'b1);
    endtask

    task disable_narrow_xy_rd_en;
        _update_narrow_xy_rd_en(1'b0);
    endtask
    
    // Registered read enables. They are looked up from the *next* FSM state,
    // so the flags change on the same clock edge as the state register.
    always @(posedge clk)
        if (rst) begin
            // synchronous reset: all read enables off
            disable_wide_xy_rd_en;
            disable_narrow_xy_rd_en;
        end else begin
            // default for this cycle is "off"; an enable_* call further down
            // is scheduled later and therefore overrides the default
            disable_wide_xy_rd_en;
            disable_narrow_xy_rd_en;

            case (opcode)

                // these opcodes read from the narrow ports only
                UOP_OPCODE_PROPAGATE_CARRIES,
                UOP_OPCODE_OUTPUT_FROM_NARROW,
                UOP_OPCODE_MODULAR_REDUCE_INIT:
                    if ((wrk_fsm_state_next_one_pass == WRK_FSM_STATE_LATENCY_PRE1) ||
                        (wrk_fsm_state_next_one_pass == WRK_FSM_STATE_LATENCY_PRE2) ||
                        (wrk_fsm_state_next_one_pass == WRK_FSM_STATE_BUSY))
                        enable_narrow_xy_rd_en;

                // reads from both wide and narrow ports
                UOP_OPCODE_COPY_CRT_Y2X:
                    if ((wrk_fsm_state_next_one_pass == WRK_FSM_STATE_LATENCY_PRE1) ||
                        (wrk_fsm_state_next_one_pass == WRK_FSM_STATE_LATENCY_PRE2) ||
                        (wrk_fsm_state_next_one_pass == WRK_FSM_STATE_BUSY)) begin
                        enable_wide_xy_rd_en;
                        enable_narrow_xy_rd_en;
                    end

                // reads from both wide and narrow ports, timed by the meander FSM
                UOP_OPCODE_COPY_LADDERS_X2Y:
                    if ((wrk_fsm_state_next_one_pass_meander == WRK_FSM_STATE_LATENCY_PRE1_M1) ||
                        (wrk_fsm_state_next_one_pass_meander == WRK_FSM_STATE_LATENCY_PRE1_M2) ||
                        (wrk_fsm_state_next_one_pass_meander == WRK_FSM_STATE_LATENCY_PRE2_M1) ||
                        (wrk_fsm_state_next_one_pass_meander == WRK_FSM_STATE_LATENCY_PRE2_M2) ||
                        (wrk_fsm_state_next_one_pass_meander == WRK_FSM_STATE_BUSY_M1) ||
                        (wrk_fsm_state_next_one_pass_meander == WRK_FSM_STATE_BUSY_M2)) begin
                        enable_wide_xy_rd_en;
                        enable_narrow_xy_rd_en;
                    end

            endcase
        end


    //
    // Write Enable Logic
    //
    
    // Helper tasks for the write enable flags. The "_x" and "_y" flags of a
    // port pair are always written together with the same value.

    // Schedule both wide write enables to take the value _en.
    task _update_wide_xy_wr_en;
        input _en;
        begin
            wr_wide_xy_ena_x <= _en;
            wr_wide_xy_ena_y <= _en;
        end
    endtask

    // Schedule both narrow write enables to take the value _en.
    task _update_narrow_xy_wr_en;
        input _en;
        begin
            wr_narrow_xy_ena_x <= _en;
            wr_narrow_xy_ena_y <= _en;
        end
    endtask

    // Wide pair: switch on / off.
    task enable_wide_xy_wr_en;
        _update_wide_xy_wr_en(1'b1);
    endtask

    task disable_wide_xy_wr_en;
        _update_wide_xy_wr_en(1'b0);
    endtask

    // Narrow pair: switch on / off.
    task enable_narrow_xy_wr_en;
        _update_narrow_xy_wr_en(1'b1);
    endtask

    task disable_narrow_xy_wr_en;
        _update_narrow_xy_wr_en(1'b0);
    endtask
    
    // Registered write enables. Unlike the read enables, these are looked up
    // from the *current* FSM state.
    always @(posedge clk)
        if (rst) begin
            // synchronous reset: all write enables off
            disable_wide_xy_wr_en;
            disable_narrow_xy_wr_en;
        end else begin
            // default for this cycle is "off"; an enable_* call further down
            // is scheduled later and therefore overrides the default
            disable_wide_xy_wr_en;
            disable_narrow_xy_wr_en;

            case (opcode)

                // writes to the narrow ports only
                UOP_OPCODE_PROPAGATE_CARRIES:
                    if ((wrk_fsm_state == WRK_FSM_STATE_BUSY) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST1) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST2))
                        enable_narrow_xy_wr_en;

                // writes to both wide and narrow ports
                UOP_OPCODE_COPY_CRT_Y2X:
                    if ((wrk_fsm_state == WRK_FSM_STATE_BUSY) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST1) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST2)) begin
                        enable_wide_xy_wr_en;
                        enable_narrow_xy_wr_en;
                    end

                // writes to the wide ports only
                UOP_OPCODE_MODULAR_REDUCE_INIT:
                    if ((wrk_fsm_state == WRK_FSM_STATE_BUSY) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST1) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST2))
                        enable_wide_xy_wr_en;

                // writes to both wide and narrow ports, but only in the _M2
                // half of each meander phase
                UOP_OPCODE_COPY_LADDERS_X2Y:
                    if ((wrk_fsm_state == WRK_FSM_STATE_BUSY_M2) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST1_M2) ||
                        (wrk_fsm_state == WRK_FSM_STATE_LATENCY_POST2_M2)) begin
                        enable_wide_xy_wr_en;
                        enable_narrow_xy_wr_en;
                    end

            endcase
        end


    //
    // Data Logic
    //
    // Carry registers, one per narrow read data input; they hold the carry
    // taken from the previous word (written by the PROPAGATE_CARRIES branch of
    // the data logic below).
    reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r;
    reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r;
    reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r;
    reg [CARRY_W -1:0] rd_narrow_y_din_y_cry_r;
    
    // Incoming narrow word plus the stored carry; the carry is zero-padded on
    // the left with WORD_W bits before the addition.
    // NOTE(review): this padding assumes WORD_EXT_W == WORD_W + CARRY_W -- confirm in modexpng_parameters.vh.
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry = wrk_rd_narrow_y_din_x + {{WORD_W{1'b0}}, rd_narrow_y_din_x_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r};
    
    // Same sums with the bits above WORD_W forced to zero: only the low WORD_W
    // bits are kept, the upper CARRY_W bits are replaced by zeroes.
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]};
    
    // Write data registers. Every cycle all eight data outputs are first
    // scheduled to WORD_EXT_DNC; the opcode/state specific branches below
    // override that default where a real value is to be written. There is no
    // reset branch in this block.
    always @(posedge clk) begin
        //
        // defaults (overridden by later non-blocking assignments in the same cycle)
        wr_wide_x_dout_x    <= WORD_EXT_DNC;
        wr_wide_y_dout_x    <= WORD_EXT_DNC;
        wr_wide_x_dout_y    <= WORD_EXT_DNC;
        wr_wide_y_dout_y    <= WORD_EXT_DNC;
        wr_narrow_x_dout_x  <= WORD_EXT_DNC;
        wr_narrow_y_dout_x  <= WORD_EXT_DNC;
        wr_narrow_x_dout_y  <= WORD_EXT_DNC;
        wr_narrow_y_dout_y  <= WORD_EXT_DNC;
        //
        case (opcode)
            //
            // PROPAGATE_CARRIES: add the stored carry to each incoming narrow
            // word, write back the low WORD_W bits and keep the upper bits as
            // the carry for the next word.
            UOP_OPCODE_PROPAGATE_CARRIES:
                //
                case (wrk_fsm_state)
                    //
                    // clear the carries before the first word is processed
                    WRK_FSM_STATE_LATENCY_PRE2: begin
                        rd_narrow_x_din_x_cry_r <= CARRY_ZERO;
                        rd_narrow_y_din_x_cry_r <= CARRY_ZERO;
                        rd_narrow_x_din_y_cry_r <= CARRY_ZERO;
                        rd_narrow_y_din_y_cry_r <= CARRY_ZERO;
                    end
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin // TODO: post2 doesn't need update of carry, since that's the last word
                        //
                        // new carry = bits [WORD_EXT_W-1:WORD_W] of (word + old carry)
                        rd_narrow_x_din_x_cry_r <= rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W];
                        rd_narrow_y_din_x_cry_r <= rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W];
                        rd_narrow_x_din_y_cry_r <= rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W];
                        rd_narrow_y_din_y_cry_r <= rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W];
                        //
                        // written word = low WORD_W bits of the sum, upper bits zeroed
                        wr_narrow_x_dout_x <= rd_narrow_x_din_x_w_cry_reduced;
                        wr_narrow_y_dout_x <= rd_narrow_y_din_x_w_cry_reduced;
                        wr_narrow_x_dout_y <= rd_narrow_x_din_y_w_cry_reduced;
                        wr_narrow_y_dout_y <= rd_narrow_y_din_y_w_cry_reduced;                       
                        //
                    end
                    //
                endcase
                //
            // COPY_CRT_Y2X: both the "_x" and the "_y" write data outputs are
            // fed from the "_y" read data inputs (wide and narrow alike).
            UOP_OPCODE_COPY_CRT_Y2X:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        //
                        wr_wide_x_dout_x   <= wrk_rd_wide_x_din_y;
                        wr_wide_y_dout_x   <= wrk_rd_wide_y_din_y;
                        wr_wide_x_dout_y   <= wrk_rd_wide_x_din_y;
                        wr_wide_y_dout_y   <= wrk_rd_wide_y_din_y;
                        //
                        wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_y;
                        wr_narrow_y_dout_x <= wrk_rd_narrow_y_din_y;
                        wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y;
                        wr_narrow_y_dout_y <= wrk_rd_narrow_y_din_y;                       
                        //
                    end
                    //
                endcase
                //
            // COPY_LADDERS_X2Y: only the x_din read word is used. In the _M2
            // states the x_dout word takes x_din delayed by three clocks and
            // the y_dout word takes x_din delayed by two clocks, i.e. two
            // consecutively read x_din words are written side by side.
            UOP_OPCODE_COPY_LADDERS_X2Y:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY_M2,
                    WRK_FSM_STATE_LATENCY_POST1_M2,
                    WRK_FSM_STATE_LATENCY_POST2_M2: begin
                        //
                        wr_wide_x_dout_x <= wrk_rd_wide_x_din_x_dly3;
                        wr_wide_y_dout_x <= wrk_rd_wide_x_din_x_dly2;
                        wr_wide_x_dout_y <= wrk_rd_wide_x_din_y_dly3;
                        wr_wide_y_dout_y <= wrk_rd_wide_x_din_y_dly2;
                        //
                        wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_x_dly3;
                        wr_narrow_y_dout_x <= wrk_rd_narrow_x_din_x_dly2;
                        wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y_dly3;
                        wr_narrow_y_dout_y <= wrk_rd_narrow_x_din_y_dly2;
                        //
                    end
                    //
                endcase
                //
            // MODULAR_REDUCE_INIT: narrow read data is passed unchanged to the
            // wide write data outputs; narrow write data stays at the default.
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        //
                        wr_wide_x_dout_x   <= wrk_rd_narrow_x_din_x;
                        wr_wide_y_dout_x   <= wrk_rd_narrow_y_din_x;
                        wr_wide_x_dout_y   <= wrk_rd_narrow_x_din_y;
                        wr_wide_y_dout_y   <= wrk_rd_narrow_y_din_y;
                        //
                    end
                    //
                endcase
            //
        endcase
        //
    end


    //
    // Write Address Logic
    //
    // High while the narrow read address from two clocks ago is not greater
    // than word_index_last_half. In the MODULAR_REDUCE_INIT branch of the
    // always block that follows, this flag picks BANK_WIDE_L (flag high) or
    // BANK_WIDE_H (flag low) as the wide write bank.
    wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half;
    wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half;
    
    always @(posedge clk) begin
        //
        {wr_wide_xy_bank_x,   wr_wide_xy_addr_x }  <= {BANK_DNC, OP_ADDR_DNC};
        {wr_wide_xy_bank_y,   wr_wide_xy_addr_y }  <= {BANK_DNC, OP_ADDR_DNC};
        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC};
        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC};
        //
        case (opcode)
            //
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_COPY_CRT_Y2X:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        //
                        {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly2};
                        {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly2};                        
                        //
                        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly2};
                        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly2};
                        //
                    end
                    //
                endcase
                //
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        //
                        wr_wide_xy_bank_x <= uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H;
                        wr_wide_xy_bank_y <= uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H;
                        //
                        wr_wide_xy_addr_x <= rd_wide_xy_addr_x_dly2;
                        wr_wide_xy_addr_y <= rd_wide_xy_addr_y_dly2;
                        //
                    end
                    //
                endcase
                //
            UOP_OPCODE_COPY_LADDERS_X2Y:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY_M2,
                    WRK_FSM_STATE_LATENCY_POST1_M2,
                    WRK_FSM_STATE_LATENCY_POST2_M2: begin
                        //
                        {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly4};
                        {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly4};                        
                        //
                        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly4};
                        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly4};
                        //
                    end
                    //
                endcase
                //
            //
        endcase
        //
    end


    //
    // Read Address Logic
    //
    // Running "next word" counters, one per read port. They are written only in
    // the always block below and get no per-cycle default there, so they keep
    // their value in any cycle where no branch assigns them.
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_next;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_next;

    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_next;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_next;

    // Wrap point of the wide counters: once a wide counter equals
    // word_index_last_half, its next update loads OP_ADDR_ZERO instead of +1.
    wire rd_wide_xy_addr_x_next_is_last = rd_wide_xy_addr_x_next == word_index_last_half;
    wire rd_wide_xy_addr_y_next_is_last = rd_wide_xy_addr_y_next == word_index_last_half;

    // End markers for the narrow counters (compare against word_index_last).
    // They are not consumed inside this always block; the narrow counters
    // below simply increment without wrapping.
    wire rd_narrow_xy_addr_x_next_is_last = rd_narrow_xy_addr_x_next == word_index_last;
    wire rd_narrow_xy_addr_y_next_is_last = rd_narrow_xy_addr_y_next == word_index_last;
    
    // Registered read bank/address generation for the wide and narrow X/Y ports.
    // Note that the inner case statements decode the *next* FSM state
    // (wrk_fsm_state_next_one_pass / wrk_fsm_state_next_one_pass_meander), not
    // the current wrk_fsm_state, so the registered outputs take effect in the
    // same cycle in which wrk_fsm_state enters the decoded state.
    always @(posedge clk) begin // TODO: Maybe split into two blocks (read address / next address)??
        //
        // Per-cycle defaults: don't-care bank and address on all four read ports.
        {rd_wide_xy_bank_x,   rd_wide_xy_addr_x  } <= {BANK_DNC, OP_ADDR_DNC}; // TODO: Add same default path for io_manager ??
        {rd_wide_xy_bank_y,   rd_wide_xy_addr_y  } <= {BANK_DNC, OP_ADDR_DNC};
        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC};
        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC};
        //
        case (opcode)
            //
            // Single-pass opcodes in which the wide ports follow the narrow counter.
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_OUTPUT_FROM_NARROW,
            UOP_OPCODE_COPY_CRT_Y2X:
                //
                case (wrk_fsm_state_next_one_pass)
                    //
                    // First read: address zero on every port, counters preset to one.
                    WRK_FSM_STATE_LATENCY_PRE1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO};
                        //
                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                    end
                    //
                    // Streaming reads: one word per cycle.
                    WRK_FSM_STATE_LATENCY_PRE2,
                    WRK_FSM_STATE_BUSY: begin
                        //
                        // The wide ports are addressed with the *narrow* counter here;
                        // the wide counter is still advanced below but does not drive
                        // the wide address for these opcodes.
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_narrow_xy_addr_x_next};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_narrow_xy_addr_y_next};                        
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next};
                        //
                        // Wide counter wraps to zero after word_index_last_half.
                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; 
                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
                        //
                        // Narrow counter increments without an explicit wrap.
                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
                        //
                    end
                    //
                endcase
                //
            // Same sequencing as above, except that the wide ports are addressed
            // with the wrapping wide counter instead of the narrow one.
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                //
                case (wrk_fsm_state_next_one_pass)
                    //
                    // First read: address zero on every port, counters preset to one.
                    WRK_FSM_STATE_LATENCY_PRE1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO};
                        //
                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                    end
                    //
                    WRK_FSM_STATE_LATENCY_PRE2,
                    WRK_FSM_STATE_BUSY: begin
                        //
                        // Wide ports use the wide counter, narrow ports the narrow counter.
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x_next};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y_next};                        
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next};
                        //
                        // Wide counter wraps to zero after word_index_last_half.
                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; 
                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
                        //
                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
                        //
                    end
                    //
                endcase
                //
            // Two-phase ("meander") sequencing: every address is presented twice,
            // first in an M1 phase with the sel_*_out banks, then in the following
            // M2 phase with the sel_*_in banks. Counters advance only in M1 phases.
            UOP_OPCODE_COPY_LADDERS_X2Y:
                //
                case (wrk_fsm_state_next_one_pass_meander)
                    //
                    // First M1 read: address zero in the *_out banks, counters preset to one.
                    WRK_FSM_STATE_LATENCY_PRE1_M1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, OP_ADDR_ZERO};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, OP_ADDR_ZERO};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, OP_ADDR_ZERO};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, OP_ADDR_ZERO};
                        //
                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                    end
                    //
                    // M2 phase: feed the currently registered address back unchanged,
                    // switching only the bank to the *_in selection.
                    WRK_FSM_STATE_LATENCY_PRE1_M2: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y};
                        //
                    end
                    //
                    // M1 phase: present the next address in the *_out banks and advance
                    // the counters. As in the single-pass copy opcodes, the wide ports
                    // are addressed with the narrow counter.
                    WRK_FSM_STATE_LATENCY_PRE2_M1,
                    WRK_FSM_STATE_BUSY_M1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_next};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_next};                        
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_next};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_next};
                        //
                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; 
                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
                        //
                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
                        //
                    end
                    //
                    // M2 phase: same address as the preceding M1 phase, *_in banks;
                    // the counters are left untouched.
                    WRK_FSM_STATE_LATENCY_PRE2_M2,
                    WRK_FSM_STATE_BUSY_M2: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y};
                        //
                    end
                    //
                endcase
                //
            //
        endcase
        //
    end
    

    //
    // FSM Process
    //
    // State register of the worker FSM. The active opcode chooses which of the
    // two combinational next-state functions is loaded; synchronous reset and
    // any opcode not listed here park the FSM in IDLE.
    always @(posedge clk) begin
        if (rst) begin
            wrk_fsm_state <= WRK_FSM_STATE_IDLE;
        end else begin
            // fallback for opcodes this worker does not sequence
            wrk_fsm_state <= WRK_FSM_STATE_IDLE;
            case (opcode)
                // single-pass sequencing
                UOP_OPCODE_PROPAGATE_CARRIES,
                UOP_OPCODE_OUTPUT_FROM_NARROW,
                UOP_OPCODE_COPY_CRT_Y2X,
                UOP_OPCODE_MODULAR_REDUCE_INIT:
                    wrk_fsm_state <= wrk_fsm_state_next_one_pass;
                // two-phase (M1/M2) sequencing
                UOP_OPCODE_COPY_LADDERS_X2Y:
                    wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander;
            endcase
        end
    end
    
  
    //
    // Busy Exit Logic
    //
    // "Done" flags consumed by the next-state logic to leave the BUSY states.
    reg wrk_fsm_done_one_pass         = 1'b0;
    reg wrk_fsm_done_one_pass_meander = 1'b0;

    always @(posedge clk) begin

        // Both flags drop back to zero every clock unless a branch below
        // sets or holds them.
        wrk_fsm_done_one_pass_meander <= 1'b0;
        wrk_fsm_done_one_pass         <= 1'b0;

        case (opcode)

            // Single-pass opcodes: raise the flag while in BUSY as soon as either
            // narrow "next" counter sits on the last word index.
            // TODO: Check, whether both (X and Y) comparisons are necessary...
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_OUTPUT_FROM_NARROW,
            UOP_OPCODE_COPY_CRT_Y2X,
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                if ((wrk_fsm_state == WRK_FSM_STATE_BUSY) &&
                    (rd_narrow_xy_addr_x_next_is_last || rd_narrow_xy_addr_y_next_is_last))
                    wrk_fsm_done_one_pass <= 1'b1;

            // Ladder copy: the flag is evaluated in BUSY_M2 and kept unchanged
            // through BUSY_M1, so it is still available in the following BUSY_M2.
            // TODO: Check, whether both (X and Y) comparisons are necessary...
            UOP_OPCODE_COPY_LADDERS_X2Y:
                case (wrk_fsm_state)
                    WRK_FSM_STATE_BUSY_M1:
                        wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander;
                    WRK_FSM_STATE_BUSY_M2:
                        if (rd_narrow_xy_addr_x_next_is_last || rd_narrow_xy_addr_y_next_is_last)
                            wrk_fsm_done_one_pass_meander <= 1'b1;
                endcase

        endcase

    end
    
        
    //
    // FSM Transition Logic
    //
    // Next-state function for the single-pass opcodes:
    //   IDLE -(ena)-> PRE1 -> PRE2 -> BUSY -(done)-> POST1 -> POST2 -> STOP -> IDLE
    // STOP and every state encoding not listed below fall through to IDLE.
    always @* begin
        wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE;
        case (wrk_fsm_state)
            WRK_FSM_STATE_IDLE:
                wrk_fsm_state_next_one_pass = ena ? WRK_FSM_STATE_LATENCY_PRE1 : WRK_FSM_STATE_IDLE;
            WRK_FSM_STATE_LATENCY_PRE1:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_PRE2;
            WRK_FSM_STATE_LATENCY_PRE2:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_BUSY;
            WRK_FSM_STATE_BUSY:
                wrk_fsm_state_next_one_pass = wrk_fsm_done_one_pass ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY;
            WRK_FSM_STATE_LATENCY_POST1:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_POST2;
            WRK_FSM_STATE_LATENCY_POST2:
                wrk_fsm_state_next_one_pass = WRK_FSM_STATE_STOP;
        endcase
    end
    
    // Next-state function for the two-phase ladder copy. Each latency/busy step
    // is visited as an M1 state followed by its M2 state:
    //   IDLE -(ena)-> PRE1_M1 -> PRE1_M2 -> PRE2_M1 -> PRE2_M2 -> BUSY_M1 -> BUSY_M2
    //   BUSY_M2 -(done)-> POST1_M1 -> POST1_M2 -> POST2_M1 -> POST2_M2 -> STOP -> IDLE
    //   BUSY_M2 -(not done)-> BUSY_M1
    // STOP and every state encoding not listed below fall through to IDLE.
    always @* begin
        wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE;
        case (wrk_fsm_state)
            WRK_FSM_STATE_IDLE:
                wrk_fsm_state_next_one_pass_meander = ena ? WRK_FSM_STATE_LATENCY_PRE1_M1 : WRK_FSM_STATE_IDLE;

            // pre-latency steps
            WRK_FSM_STATE_LATENCY_PRE1_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE1_M2;
            WRK_FSM_STATE_LATENCY_PRE1_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M1;
            WRK_FSM_STATE_LATENCY_PRE2_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M2;
            WRK_FSM_STATE_LATENCY_PRE2_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M1;

            // busy loop: exit is only decided in the M2 phase
            WRK_FSM_STATE_BUSY_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M2;
            WRK_FSM_STATE_BUSY_M2:
                wrk_fsm_state_next_one_pass_meander = wrk_fsm_done_one_pass_meander ? WRK_FSM_STATE_LATENCY_POST1_M1 : WRK_FSM_STATE_BUSY_M1;

            // post-latency steps
            WRK_FSM_STATE_LATENCY_POST1_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST1_M2;
            WRK_FSM_STATE_LATENCY_POST1_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M1;
            WRK_FSM_STATE_LATENCY_POST2_M1:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M2;
            WRK_FSM_STATE_LATENCY_POST2_M2:
                wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_STOP;
        endcase
    end
    

    //
    // Ready Logic
    //
    // Registered ready flag behind the rdy output. It is high after reset,
    // follows the inverse of ena while the FSM is in IDLE, is set again in
    // STOP, and keeps its value in every other state.
    reg rdy_reg = 1'b1;

    assign rdy = rdy_reg;

    always @(posedge clk) begin
        if (rst) begin
            rdy_reg <= 1'b1;
        end else if (wrk_fsm_state == WRK_FSM_STATE_IDLE) begin
            rdy_reg <= ~ena;
        end else if (wrk_fsm_state == WRK_FSM_STATE_STOP) begin
            rdy_reg <= 1'b1;
        end
    end


endmodule