summaryrefslogblamecommitdiff
path: root/rtl/modular_multiplier/ed25519_modular_multiplier.v
blob: 012091a2a56a19d18abe0b83211546859d659166 (plain) (tree)





































                                                                                
 









                                   





                                 


























































                                                                                
                                    






                               



































































                                                                                                       






                                                                                          






















































                                                                                                                                                                                                      



                
          







                                          

                                                                         


           





                                                         
                                                                                     

 





                                                                           
                                                                                                           
 




                           
 
    
 









                                                           
 

                         
                                                                 

                                               
 






                            
                                                                                             
 
 



                          
 

                         
                                                         




                                                                         
 
 



                                
 








                                               
                                             
 
                                       
 







                                                   
 
 




                                                
 







                                                          
 




                                                                                                   
 

                         
                                


                                                               

                                                          
           
 
 


                   




                                                                                                  
 
    




                            
 

                         
                                                        
                                                
 







                                                        
                                                                                  















                                           
    



                        










                                                 
    
        
 
        

                         





























                                                                                                              
     





                                                                              

                      











































                                                                              
 









                                                         
      
    
 

                                                    
 





                                                                                                                                     
 







                                                                                                                  
 
 















                                                                                               
 






                                         
 










                                                                          





                                                                                
//------------------------------------------------------------------------------
//
// ed25519_modular_multiplier.v
// -----------------------------------------------------------------------------
// Curve25519 Modular Multiplier.
//
// Authors: Pavel Shatov
//
// Copyright (c) 2015-2016, 2018 NORDUnet A/S
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------


module ed25519_modular_multiplier
(
    clk, rst_n,
    ena, rdy,
    a_addr, b_addr, p_addr, p_wren,
    a_din, b_din, p_dout
);


    //
    // Settings
    //
`include "../ed25519_settings.vh"


    //
    // Constants
    //
    // number of words per operand; the data ports (a_din, b_din, p_dout) are
    // 32 bits wide, so 8 words make up one 256-bit operand
    localparam integer OPERAND_NUM_WORDS    = 8;
    // width of a word index; 3 bits are enough to address words 0..7
    localparam integer WORD_COUNTER_WIDTH   = 3;


    //
    // Handy Numbers
    //
    // index of the first (0) and the last (OPERAND_NUM_WORDS-1) operand word,
    // sized to match the word index registers and address ports
    localparam [WORD_COUNTER_WIDTH-1:0] WORD_INDEX_ZERO = 0;
    localparam [WORD_COUNTER_WIDTH-1:0] WORD_INDEX_LAST = OPERAND_NUM_WORDS - 1;


    //
    // Handy Functions
    //
    // Returns the index of the word that follows WORD_INDEX_CURRENT; once the
    // last word has been reached the index wraps around to WORD_INDEX_ZERO.
    function [WORD_COUNTER_WIDTH-1:0] WORD_INDEX_NEXT_OR_ZERO;
    input    [WORD_COUNTER_WIDTH-1:0] WORD_INDEX_CURRENT;
    begin
        WORD_INDEX_NEXT_OR_ZERO =
            (WORD_INDEX_CURRENT >= WORD_INDEX_LAST) ? WORD_INDEX_ZERO               // wrap
                                                    : WORD_INDEX_CURRENT + 1'b1;    // step up
    end
    endfunction

    // Returns the index of the word that precedes WORD_INDEX_CURRENT; once the
    // first word has been reached the index wraps around to WORD_INDEX_LAST.
    function [WORD_COUNTER_WIDTH-1:0] WORD_INDEX_PREVIOUS_OR_LAST;
    input    [WORD_COUNTER_WIDTH-1:0] WORD_INDEX_CURRENT;
    begin
        WORD_INDEX_PREVIOUS_OR_LAST =
            (WORD_INDEX_CURRENT <= WORD_INDEX_ZERO) ? WORD_INDEX_LAST               // wrap
                                                    : WORD_INDEX_CURRENT - 1'b1;    // step down
    end
    endfunction


    //
    // Ports
    //
    input   clk;    // system clock, all registers update on its rising edge
    input   rst_n;  // active-low async reset (only the FSM shift register is reset by it)

    input   ena;    // enable input, sampled only while rdy is high; starts a new multiplication
    output  rdy;    // ready output, bit 0 of the FSM shift register (high when idle)

    output  [WORD_COUNTER_WIDTH-1:0]    a_addr;    // index of current A word (driven by index_a)
    output  [WORD_COUNTER_WIDTH-1:0]    b_addr;    // index of current B word (driven by index_b)
    output  [WORD_COUNTER_WIDTH-1:0]    p_addr;    // index of current P word (driven by p_addr_reg)
    
    output  p_wren;    // store current P word now (registered, follows flag_update_p_dout by one clock)

    input   [31:0]  a_din;      // current word of A
    input   [31:0]  b_din;      // current word of B (consumed as two 16-bit halves)
    output  [31:0]  p_dout;     // current word of P (registered; driven to X outside the output phase)


    //
    // Word Indices
    //
    // index_a is loaded with WORD_INDEX_ZERO and index_b with WORD_INDEX_LAST
    // whenever rdy is high (see the index logic further down); neither register
    // has a reset or an initial value of its own
    reg [WORD_COUNTER_WIDTH-1:0] index_a;
    reg [WORD_COUNTER_WIDTH-1:0] index_b;

    // map registers to output ports
    assign a_addr    = index_a;
    assign b_addr    = index_b;


    //
    // FSM
    //

    
    //
    // Every phase of the computation is described by an OFFSET/DURATION pair.
    // The FSM is a one-hot shift register that is loaded at its MSB and shifted
    // towards the LSB by one position per clock, so a phase is active while the
    // hot bit is inside fsm_shreg[FSM_SHREG_WIDTH - OFFSET - 1 -: DURATION],
    // i.e. it starts OFFSET shift positions after the start and lasts DURATION
    // clocks. The numbers in the comments below are the resulting positions for
    // OPERAND_NUM_WORDS = 8 (position 0 is the MSB).
    //

    // positions 0..7
    localparam integer PHASE_INCREMENT_INDEX_A_OFFSET           =   0;
    localparam integer PHASE_INCREMENT_INDEX_A_DURATION         =   OPERAND_NUM_WORDS;
    
    // positions 8..23 (two clocks per B word, one per 16-bit half)
    localparam integer PHASE_DECREMENT_INDEX_B_OFFSET           =   PHASE_INCREMENT_INDEX_A_DURATION;
    localparam integer PHASE_DECREMENT_INDEX_B_DURATION         =   OPERAND_NUM_WORDS * 2;

    // positions 10..24
    localparam integer PHASE_STORE_MSB_SI_OFFSET                =   PHASE_DECREMENT_INDEX_B_OFFSET + 2;
    localparam integer PHASE_STORE_MSB_SI_DURATION              =   OPERAND_NUM_WORDS * 2 - 1;

    // position 25
    localparam integer PHASE_STORE_LSB_SI_OFFSET                =   PHASE_STORE_MSB_SI_OFFSET +
                                                                    PHASE_STORE_MSB_SI_DURATION;
    localparam integer PHASE_STORE_LSB_SI_DURATION              =   1;
    
    // positions 26..40
    localparam integer PHASE_SHIFT_SI_OFFSET                    =   PHASE_STORE_LSB_SI_OFFSET + 1;
    localparam integer PHASE_SHIFT_SI_DURATION                  =   OPERAND_NUM_WORDS * 2 - 1;

    // position 27
    localparam integer PHASE_MASK_SUM_CW1_OFFSET                =   PHASE_SHIFT_SI_OFFSET + 1;
    localparam integer PHASE_MASK_SUM_CW1_DURATION              =   1;    
    
    // positions 28..35
    localparam integer PHASE_STORE_LSB_C_OFFSET                 =   PHASE_MASK_SUM_CW1_OFFSET + 1;
    localparam integer PHASE_STORE_LSB_C_DURATION               =   OPERAND_NUM_WORDS;

    // positions 36..43
    localparam integer PHASE_STORE_MSB_C_OFFSET                 =   PHASE_STORE_LSB_C_OFFSET +
                                                                    PHASE_STORE_LSB_C_DURATION;
    localparam integer PHASE_STORE_MSB_C_DURATION               =   OPERAND_NUM_WORDS;

    // position 39
    localparam integer PHASE_MASK_B_R3_OFFSET                   =   PHASE_STORE_MSB_C_OFFSET + 3;
    localparam integer PHASE_MASK_B_R3_DURATION                 =   1;

    // position 48
    localparam integer PHASE_CALCULATE_CARRY_MSB_S1_OFFSET      =   PHASE_STORE_MSB_C_OFFSET +
                                                                    PHASE_STORE_MSB_C_DURATION + 4;
    localparam integer PHASE_CALCULATE_CARRY_MSB_S1_DURATION    =   1;
    
    // positions 40..47
    localparam integer PHASE_STORE_LSB_S1_OFFSET                =   PHASE_STORE_MSB_C_OFFSET + 4;
    localparam integer PHASE_STORE_LSB_S1_DURATION              =   OPERAND_NUM_WORDS;
    
    // positions 49..56
    localparam integer PHASE_SHIFT_S1_OFFSET                    =   PHASE_STORE_LSB_S1_OFFSET +
                                                                    PHASE_STORE_LSB_S1_DURATION + 1;
    localparam integer PHASE_SHIFT_S1_DURATION                  =   OPERAND_NUM_WORDS;

    // position 49 (first clock of the S1 shift phase)
    localparam integer PHASE_CHANGE_LSB_B_P_OFFSET              =   PHASE_SHIFT_S1_OFFSET;
    localparam integer PHASE_CHANGE_LSB_B_P_DURATION            =   1;

    // position 58
    localparam integer PHASE_SELECT_S2_OR_PN_OFFSET             =   PHASE_SHIFT_S1_OFFSET +
                                                                    PHASE_SHIFT_S1_DURATION + 1;
    localparam integer PHASE_SELECT_S2_OR_PN_DURATION           =   1;

    // positions 59..66
    localparam integer PHASE_UPDATE_P_DOUT_OFFSET               =   PHASE_SHIFT_S1_OFFSET +
                                                                    PHASE_SHIFT_S1_DURATION + 2;
    localparam integer PHASE_UPDATE_P_DOUT_DURATION             =   OPERAND_NUM_WORDS;

    
    
    // Total length of the FSM shift register. The literal +1/-1/+2 terms are
    // hand-written adjustments for gaps and overlaps between the phases; with
    // OPERAND_NUM_WORDS = 8 the sum evaluates to 69 bits.
    localparam integer FSM_SHREG_WIDTH =    PHASE_INCREMENT_INDEX_A_DURATION +
                                            PHASE_DECREMENT_INDEX_B_DURATION +
                                            1 +
                                            PHASE_STORE_LSB_SI_DURATION +
                                            PHASE_SHIFT_SI_DURATION +
                                            -1 +
                                            PHASE_STORE_LSB_S1_DURATION +
                                            PHASE_CALCULATE_CARRY_MSB_S1_DURATION +
                                            PHASE_SHIFT_S1_DURATION +
                                            1 +
                                            PHASE_SELECT_S2_OR_PN_DURATION +
                                            PHASE_UPDATE_P_DOUT_DURATION +
                                            2;

    // idle pattern: only bit 0 (the rdy bit) is set
    localparam [FSM_SHREG_WIDTH-1:0] FSM_SHREG_INIT = {{(FSM_SHREG_WIDTH-1){1'b0}}, 1'b1};

    // one-hot state register, starts in the idle pattern
    reg [FSM_SHREG_WIDTH-1:0] fsm_shreg = FSM_SHREG_INIT;

    // the module is ready whenever the hot bit sits in the least significant position
    assign rdy = fsm_shreg[0];

    
    
    
    
    
    
    // Per-phase windows into the FSM shift register: each slice starts OFFSET
    // bits below the MSB and is DURATION bits wide (indexed part-select "-:").
    wire [PHASE_INCREMENT_INDEX_A_DURATION     -1:0] fsm_shreg_increment_index_a      = fsm_shreg[FSM_SHREG_WIDTH - PHASE_INCREMENT_INDEX_A_OFFSET      - 1 -: PHASE_INCREMENT_INDEX_A_DURATION];
    wire [PHASE_DECREMENT_INDEX_B_DURATION     -1:0] fsm_shreg_decrement_index_b      = fsm_shreg[FSM_SHREG_WIDTH - PHASE_DECREMENT_INDEX_B_OFFSET      - 1 -: PHASE_DECREMENT_INDEX_B_DURATION];
    wire [PHASE_STORE_MSB_SI_DURATION          -1:0] fsm_shreg_store_msb_si           = fsm_shreg[FSM_SHREG_WIDTH - PHASE_STORE_MSB_SI_OFFSET           - 1 -: PHASE_STORE_MSB_SI_DURATION];
    wire [PHASE_STORE_LSB_SI_DURATION          -1:0] fsm_shreg_store_lsb_si           = fsm_shreg[FSM_SHREG_WIDTH - PHASE_STORE_LSB_SI_OFFSET           - 1 -: PHASE_STORE_LSB_SI_DURATION];
    wire [PHASE_SHIFT_SI_DURATION              -1:0] fsm_shreg_shift_si               = fsm_shreg[FSM_SHREG_WIDTH - PHASE_SHIFT_SI_OFFSET               - 1 -: PHASE_SHIFT_SI_DURATION];
    wire [PHASE_MASK_SUM_CW1_DURATION          -1:0] fsm_shreg_mask_sum_cw1           = fsm_shreg[FSM_SHREG_WIDTH - PHASE_MASK_SUM_CW1_OFFSET           - 1 -: PHASE_MASK_SUM_CW1_DURATION];
    wire [PHASE_STORE_LSB_C_DURATION           -1:0] fsm_shreg_store_lsb_c            = fsm_shreg[FSM_SHREG_WIDTH - PHASE_STORE_LSB_C_OFFSET            - 1 -: PHASE_STORE_LSB_C_DURATION];
    wire [PHASE_STORE_MSB_C_DURATION           -1:0] fsm_shreg_store_msb_c            = fsm_shreg[FSM_SHREG_WIDTH - PHASE_STORE_MSB_C_OFFSET            - 1 -: PHASE_STORE_MSB_C_DURATION];
    wire [PHASE_MASK_B_R3_DURATION             -1:0] fsm_shreg_mask_b_r3              = fsm_shreg[FSM_SHREG_WIDTH - PHASE_MASK_B_R3_OFFSET              - 1 -: PHASE_MASK_B_R3_DURATION];
    wire [PHASE_CALCULATE_CARRY_MSB_S1_DURATION-1:0] fsm_shreg_calculate_carry_msb_s1 = fsm_shreg[FSM_SHREG_WIDTH - PHASE_CALCULATE_CARRY_MSB_S1_OFFSET - 1 -: PHASE_CALCULATE_CARRY_MSB_S1_DURATION];
    wire [PHASE_STORE_LSB_S1_DURATION          -1:0] fsm_shreg_store_lsb_s1           = fsm_shreg[FSM_SHREG_WIDTH - PHASE_STORE_LSB_S1_OFFSET           - 1 -: PHASE_STORE_LSB_S1_DURATION];
    wire [PHASE_SHIFT_S1_DURATION              -1:0] fsm_shreg_shift_s1               = fsm_shreg[FSM_SHREG_WIDTH - PHASE_SHIFT_S1_OFFSET               - 1 -: PHASE_SHIFT_S1_DURATION];
    wire [PHASE_CHANGE_LSB_B_P_DURATION        -1:0] fsm_shreg_change_lsb_b_p         = fsm_shreg[FSM_SHREG_WIDTH - PHASE_CHANGE_LSB_B_P_OFFSET         - 1 -: PHASE_CHANGE_LSB_B_P_DURATION];
    wire [PHASE_SELECT_S2_OR_PN_DURATION       -1:0] fsm_shreg_select_s2_or_pn        = fsm_shreg[FSM_SHREG_WIDTH - PHASE_SELECT_S2_OR_PN_OFFSET        - 1 -: PHASE_SELECT_S2_OR_PN_DURATION];
    wire [PHASE_UPDATE_P_DOUT_DURATION         -1:0] fsm_shreg_update_p_dout          = fsm_shreg[FSM_SHREG_WIDTH - PHASE_UPDATE_P_DOUT_OFFSET          - 1 -: PHASE_UPDATE_P_DOUT_DURATION];

    // A phase flag is the OR-reduction of its window: it is high for as long
    // as the hot bit of the shift register is somewhere inside that window.
    wire flag_increment_index_a      = |fsm_shreg_increment_index_a;
    wire flag_decrement_index_b      = |fsm_shreg_decrement_index_b;
    wire flag_store_msb_si           = |fsm_shreg_store_msb_si;
    wire flag_store_lsb_si           = |fsm_shreg_store_lsb_si;
    wire flag_shift_si               = |fsm_shreg_shift_si;
    wire flag_mask_sum_cw1           = |fsm_shreg_mask_sum_cw1;
    wire flag_store_lsb_c            = |fsm_shreg_store_lsb_c;
    wire flag_store_msb_c            = |fsm_shreg_store_msb_c;
    wire flag_mask_b_r3              = |fsm_shreg_mask_b_r3;
    wire flag_calculate_carry_msb_s1 = |fsm_shreg_calculate_carry_msb_s1;
    wire flag_store_lsb_s1           = |fsm_shreg_store_lsb_s1;
    wire flag_shift_s1               = |fsm_shreg_shift_s1;
    wire flag_change_lsb_b_p         = |fsm_shreg_change_lsb_b_p;
    wire flag_select_s2_or_pn        = |fsm_shreg_select_s2_or_pn;
    wire flag_update_p_dout          = |fsm_shreg_update_p_dout;
    
    //
    // Delayed Flags
    //
    // Registered copies of phase flags, each one clock behind its source.
    // flag_store_pn is fed from flag_store_s2 and thus trails flag_shift_s1
    // by two clocks.
    //
    reg flag_store_word_a   = 1'b0;     // flag_increment_index_a, one clock later
    reg flag_enable_mac_ab  = 1'b0;     // flag_decrement_index_b, one clock later
    reg flag_delay_msb_c    = 1'b0;     // flag_store_msb_c, one clock later
    reg flag_mask_a_s2      = 1'b0;     // flag_calculate_carry_msb_s1, one clock later
    reg flag_mask_b_out_p   = 1'b0;     // flag_change_lsb_b_p, one clock later
    reg flag_store_s2       = 1'b0;     // flag_shift_s1, one clock later
    reg flag_store_pn       = 1'b0;     // flag_store_s2, one clock later

    always @(posedge clk) begin
        // operand loading and multiplication
        flag_store_word_a   <= flag_increment_index_a;
        flag_enable_mac_ab  <= flag_decrement_index_b;

        // product and reduction
        flag_delay_msb_c    <= flag_store_msb_c;
        flag_mask_a_s2      <= flag_calculate_carry_msb_s1;
        flag_mask_b_out_p   <= flag_change_lsb_b_p;

        // result storage (two-stage chain, non-blocking so the old value is read)
        flag_store_pn       <= flag_store_s2;
        flag_store_s2       <= flag_shift_s1;
    end


    //
    // FSM Logic
    //    
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            // asynchronous reset: back to the idle pattern
            fsm_shreg <= FSM_SHREG_INIT;
        end else if (rdy) begin
            // idle: stay idle (hot bit in the LSB) unless ena is set, in which
            // case the hot bit is moved to the MSB to start a new run
            fsm_shreg <= {ena, {(FSM_SHREG_WIDTH-2){1'b0}}, ~ena};
        end else begin
            // busy: advance the hot bit one position towards the LSB
            fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
        end
    end


    //
    // A Word Index Increment Logic
    //
    always @(posedge clk) begin
        if (rdy) begin
            // idle: park the index on the first word of A
            index_a <= WORD_INDEX_ZERO;
        end else if (flag_increment_index_a) begin
            // loading phase: walk through the words of A in ascending order
            index_a <= WORD_INDEX_NEXT_OR_ZERO(index_a);
        end
    end


    //
    // B Word Splitter
    //

    /*
     * Selects which half of the current B word gets consumed:
     *
     * 0: store the upper 16-bit part of the current B word
     * 1: store the lower 16-bit part of the current B word
     *
     * The flip-flop is declared ahead of the index decrement logic, because
     * that logic reads it and Verilog identifiers must be declared before
     * they are referenced (strict tools reject forward references).
     */

    reg index_b_ff = 1'b0;

    always @(posedge clk)
        // toggles on every clock of the B phase, held at zero otherwise
        if (flag_decrement_index_b)    index_b_ff <= ~index_b_ff;
        else                           index_b_ff <= 1'b0;


    //
    // B Word Index Decrement Logic
    //
    always @(posedge clk)
        // parked on the last word while idle; during the B phase the index
        // moves down once per word, namely on the clocks where the splitter
        // selects the upper half (index_b_ff == 0)
        if (rdy)                                           index_b <= WORD_INDEX_LAST;
        else if (flag_decrement_index_b && !index_b_ff)    index_b <= WORD_INDEX_PREVIOUS_OR_LAST(index_b);


    //
    // Wide Operand Buffer
    //
    // 256-bit register; it is written by the buffer update logic further down
    // and feeds the MAC array in 16-bit slices
    reg [255:0] buf_a_wide;
        

    //
    // Narrow Operand Buffer
    //
    reg [15:0] buf_b_narrow;

    // half of the current B word picked by the splitter flip-flop:
    // upper half while index_b_ff is 0, lower half while it is 1
    wire [15:0] b_din_half = index_b_ff ? b_din[15:0] : b_din[31:16];

    always @(posedge clk) begin
        // only updated during the B phase, holds its value otherwise
        if (flag_decrement_index_b) begin
            buf_b_narrow <= b_din_half;
        end
    end


    //
    // MAC Clear Logic
    //
    // one clear bit per MAC instance (bit i drives the .clr input of MAC i)
    reg  [15:0] mac_clear;

    // Sequence while the MACs are enabled: all-ones -> 0x0002 -> 0x0004 ->
    // ... -> 0x8000 -> all-ones -> 0x0002 -> ..., i.e. a single set bit walks
    // from bit 1 up to bit 15 and every 16th clock all bits are set at once.
    always @(posedge clk)
        //
        // MACs disabled: keep every clear bit asserted
        if (!flag_enable_mac_ab) mac_clear <= {16{1'b1}};
        else begin
            // bit 0 is only ever set in the all-ones pattern: start walking at bit 1
            if (mac_clear[0])       mac_clear <= 16'b0000000000000010;
            // walking bit has reached the top: assert all bits for one clock
            else if (mac_clear[15]) mac_clear <= 16'b1111111111111111;
            // otherwise move the walking bit one position up
            else                    mac_clear <= {mac_clear[14:0], 1'b0};
        end


    //
    // MAC Array
    //
    // 47-bit output of each of the 16 MAC instances
    wire [46:0] mac_accum[0:15];

    genvar i;
    
    // Sixteen instances of the primitive selected by the ED25519_MAC16_PRIMITIVE
    // macro (defined outside of this file). Instance i receives the i-th 16-bit
    // slice of buf_a_wide, all instances share the same 16-bit buf_b_narrow.
    // NOTE(review): the primitive presumably computes s <= (clr ? 0 : s) + a * b
    // when ce is high -- confirm against the primitive's source.
    generate for (i=0; i<16; i=i+1)
        //
        begin : gen_mac16_array
            //
            `ED25519_MAC16_PRIMITIVE mac16_inst
            (
                .clk    (clk),
                .ce     (flag_enable_mac_ab),

                .clr    (mac_clear[i]),

                .a      (buf_a_wide[16 * i +: 16]),
                .b      (buf_b_narrow),
                .s      (mac_accum[i])
            );
            //
        end
        //
    endgenerate


    //
    // Intermediate Words
    //
    // si_msb holds 15 and si_lsb holds 16 accumulator words of 47 bits each
    reg [47*(2*OPERAND_NUM_WORDS-1)-1:0] si_msb;
    reg [47*(2*OPERAND_NUM_WORDS-0)-1:0] si_lsb;

    wire [47*(2*OPERAND_NUM_WORDS-1)-1:0] si_msb_new;
    wire [47*(2*OPERAND_NUM_WORDS-0)-1:0] si_lsb_new;
    
    // si_lsb_new is a snapshot of all 16 accumulators in reversed order:
    // word i is taken from accumulator 15-i
    generate for (i=0; i<16; i=i+1)
        begin : gen_si_lsb_new
            assign si_lsb_new[47*i+:47] = mac_accum[15-i];
        end
    endgenerate

    // si_msb_new word 15-i captures accumulator i only while that accumulator's
    // clear bit is set, otherwise the word keeps its current si_msb value
    // (accumulator 0 has no slot in si_msb)
    generate for (i=1; i<16; i=i+1)
        begin : gen_si_msb_new
            assign si_msb_new[47*(15-i)+:47] = mac_clear[i] ? mac_accum[i] : si_msb[47*(15-i)+:47];
        end
    endgenerate

    always @(posedge clk)
        //
        // shift phase takes priority: both registers move down by two 47-bit
        // words per clock, the two lowest words of si_msb enter si_lsb from the
        // top and si_msb is refilled with zeroes
        if (flag_shift_si) begin
            si_msb <= {{2*47{1'b0}}, si_msb[15*47-1:2*47]};
            si_lsb <= {si_msb[2*47-1:0], si_lsb[16*47-1:2*47]};
        end else begin
            // store phases: the two registers are loaded independently
            if (flag_store_msb_si)   si_msb <= si_msb_new;
            if (flag_store_lsb_si)   si_lsb <= si_lsb_new;
        end


    //
    // Accumulators
    //
    wire [46:0] add47_cw0_s;
    wire [46:0] add47_cw1_s;

    // upper 15 bits of the cw1 sum, fed back into the cw1 adder as carry;
    // forced to zero during the mask phase so that nothing is carried in
    wire [14:0] add47_cw1_s_masked = flag_mask_sum_cw1 ? {15{1'b0}} : add47_cw1_s[32+:15];


    //
    // cw0, cw1
    //
    reg [30: 0] si_prev_dly;    // bits [46:16] of the second lowest si_lsb word
    reg [15: 0] si_next_dly;    // bits [15:0] of the second lowest si_lsb word

    always @(posedge clk)
        // only valid while the intermediate words are being shifted out
        if (flag_shift_si) si_prev_dly <= si_lsb[93:63];
        else               si_prev_dly <= {31{1'b0}};

    always @(posedge clk)
        //
        si_next_dly <= si_lsb[47+:16];

    // cw0: lowest si_lsb word plus the delayed upper part of the previous pair
    wire [46:0] add47_cw0_a = si_lsb[46:0];
    wire [46:0] add47_cw0_b = {{16{1'b0}}, si_prev_dly};

    // cw1: cw0 sum plus the delayed lower part (placed at bits [31:16]) plus
    // the carry fed back from the previous cw1 sum (bits [14:0])
    wire [46:0] add47_cw1_a = add47_cw0_s;
    wire [46:0] add47_cw1_b = {{15{1'b0}}, si_next_dly, 1'b0, add47_cw1_s_masked};

    `ED25519_ADD47_PRIMITIVE add47_cw0_inst
    (
        .clk    (clk),
        .a      (add47_cw0_a),
        .b      (add47_cw0_b),
        .s      (add47_cw0_s)
    );

    `ED25519_ADD47_PRIMITIVE add47_cw1_inst
    (
        .clk    (clk),
        .a      (add47_cw1_a),
        .b      (add47_cw1_b),
        .s      (add47_cw1_s)
    );


    //
    // Full-Size Product
    //
    // lower 32 bits of the cw1 sum form one word of the full-size product
    wire [31:0] c_word_lower = add47_cw1_s[31:0];


    // sums of the four reduction adders
    wire [46:0] add47_r0_s;
    wire [46:0] add47_r1_s;
    wire [46:0] add47_r2_s;
    wire [46:0] add47_r3_s;

    // upper 15 bits of the r3 sum, fed back into the r3 adder as carry; forced
    // to zero during the mask phase. This net is defined here, after
    // add47_r3_s, because Verilog requires an identifier to be declared before
    // it is referenced (strict tools reject forward references).
    wire [46:0] add47_r3_b_masked = {{32{1'b0}}, flag_mask_b_r3 ? {15{1'b0}} : add47_r3_s[46:32]};

    reg [255:0] c_lsb_s1_shreg;
    reg [ 31:0] c_msb_latch;
            
    
        

        
    reg [ 4:0] c_msb_latch_upper_dly;   // bits [31:27] of the previous c_msb_latch value
    reg [31:0] c_lsb_shreg_lower_dly;   // lowest word of c_lsb_s1_shreg, one clock later

    always @(posedge clk) begin
        // while the upper product words arrive, latch the current product word
        // together with the matching lower word from the shift register;
        // outside of that phase both registers are cleared
        if (flag_store_msb_c) begin
            c_msb_latch           <= c_word_lower;
            c_lsb_shreg_lower_dly <= c_lsb_s1_shreg[31:0];
        end else begin
            c_msb_latch           <= 32'd0;
            c_lsb_shreg_lower_dly <= 32'd0;
        end

        // top five bits of the previously latched word (the non-blocking
        // assignment above means the old c_msb_latch value is read here)
        if (flag_delay_msb_c) begin
            c_msb_latch_upper_dly <= c_msb_latch[31:27];
        end else begin
            c_msb_latch_upper_dly <= 5'd0;
        end
    end
    
    
    
    // 12-bit product of 38 and the low six bits of the r3 sum, captured once
    // during the carry calculation phase
    reg [11:0] carry_msb_s1;

    always @(posedge clk)
        //
        if (flag_calculate_carry_msb_s1) carry_msb_s1 <= {{6{1'b0}}, 6'd38} * {{6{1'b0}}, add47_r3_s[5:0]};


    // Sum of the s2 adder. Declared ahead of add47_s2_a_masked, which reads it:
    // Verilog requires an identifier to be declared before it is referenced
    // (strict tools reject forward references).
    wire [46:0] add47_s2_s;

    // first operand of the s2 adder: the captured carry product during the
    // mask phase, otherwise the upper 15 bits of the previous s2 sum
    wire [46:0] add47_s2_a_masked = {{32{1'b0}}, flag_mask_a_s2 ? {3'b000, carry_msb_s1} : add47_s2_s[46:32]};

    // r0 = (latched word << 1) + (latched word << 2); the bits shifted out of
    // the previous latched word are brought in from c_msb_latch_upper_dly
    `ED25519_ADD47_PRIMITIVE add47_r0
    (
        .clk    (clk),
        .a      ({{15{1'b0}}, c_msb_latch[30:0], c_msb_latch_upper_dly[4]}),
        .b      ({{15{1'b0}}, c_msb_latch[29:0], c_msb_latch_upper_dly[4:3]}),
        .s      (add47_r0_s)
    );
    // r1 = (latched word << 5) + delayed lower product word
    `ED25519_ADD47_PRIMITIVE add47_r1
    (
        .clk    (clk),
        .a      ({{15{1'b0}}, c_msb_latch[26:0], c_msb_latch_upper_dly[4:0]}),
        .b      ({{15{1'b0}}, c_lsb_shreg_lower_dly}),
        .s      (add47_r1_s)
    );
    // r2 = r0 + r1, i.e. upper word times (2 + 4 + 32 = 38) plus lower word
    `ED25519_ADD47_PRIMITIVE add47_r2
    (
        .clk    (clk),
        .a      (add47_r0_s),
        .b      (add47_r1_s),
        .s      (add47_r2_s)
    );
    // r3 = r2 + carry fed back from the previous r3 sum
    `ED25519_ADD47_PRIMITIVE add47_r3
    (
        .clk    (clk),
        .a      (add47_r2_s),
        .b      (add47_r3_b_masked),
        .s      (add47_r3_s)
    );


    // s2 = carry (see add47_s2_a_masked) + lowest word of the shift register
    `ED25519_ADD47_PRIMITIVE add47_s2
    (
        .clk    (clk),
        .a      (add47_s2_a_masked),
        .b      ({{15{1'b0}}, c_lsb_s1_shreg[31:0]}),
        .s      (add47_s2_s)
    );
    
    
    // controls bits 5, 2 and 0 of the subtrahend word below
    reg sub32_b_bit;
    
    // subtrahend word: 0xFFFFFFFF while sub32_b_bit is 1, 0xFFFFFFDA while it is 0
    // NOTE(review): 0xFFFFFFDA matches the lowest word of 2^256 - 38, i.e. of
    // 2 * (2^255 - 19) -- presumably twice the modulus is subtracted; confirm.
    wire [31:0] sub32_b = {{26{1'b1}},  // ...*11*1*
        sub32_b_bit, {2{1'b1}}, sub32_b_bit, 1'b1, sub32_b_bit};
    
    always @(posedge clk)
        //
        // the bit is cleared only on the clock that follows the single-cycle
        // "change LSB" phase and is set at all other times
        if (!fsm_shreg_change_lsb_b_p) sub32_b_bit <= 1'b1;
        else                           sub32_b_bit <= 1'b0;
    
    wire [31:0] sub32_pn_d;     // difference word
    wire        sub32_b_in;     // borrow into the subtractor
    wire        sub32_b_out;    // borrow out of the subtractor

    // the borrow output is looped back into the borrow input to chain the
    // word-wise subtractions; the loop is broken while flag_mask_b_out_p is set
    assign sub32_b_in = sub32_b_out & !flag_mask_b_out_p;
    
    // primitive selected by the ED25519_SUB32_PRIMITIVE macro (defined outside
    // of this file); it subtracts sub32_b from the lower 32 bits of the s2 sum
    `ED25519_SUB32_PRIMITIVE sub32_pn
    (
        .clk    (clk),
        .a      (add47_s2_s[31:0]),
        .b      (sub32_b),
        .d      (sub32_pn_d),
        .b_in   (sub32_b_in),
        .b_out  (sub32_b_out)
    );
    

    // lower 32 bits of the r3 sum form one word of S1
    wire [31:0] add47_r3_s_lower = add47_r3_s[31:0];
    

    // buf_a_wide is reused in several phases (listed in priority order):
    always @(posedge clk)
        //
        // 1) loading A: the new value is {old[223:16], a_din[15:0], a_din[31:16],
        //    old[239:224]}, i.e. the incoming word enters with its 16-bit halves swapped
        if (flag_store_word_a)       buf_a_wide <= {buf_a_wide[16+:256-3*16], {a_din[15:0], a_din[31:16]}, buf_a_wide[256-2*16+:16]};
        // 2) multiplying: rotate left by 16 bits, so that every MAC sees the
        //    next 16-bit slice of A on each clock
        else if (flag_enable_mac_ab) buf_a_wide <= {buf_a_wide[256-(16+1):0], buf_a_wide[256-16+:16]};
        // 3) storing S2: shift right by one word, the new sum word enters at the top
        else if (flag_store_s2)      buf_a_wide <= {add47_s2_s[31:0], buf_a_wide[255:32]};
        // 4) output: shift right by one word, the vacated top word is don't-care
        else if (flag_update_p_dout)  buf_a_wide <= {{32{1'bX}}, buf_a_wide[255:32]};

        
    // c_lsb_s1_shreg is a 256-bit register that always shifts right by one
    // 32-bit word; the phases only differ in the word that enters at the top
    // (listed in priority order)
    always @(posedge clk) begin
        if (flag_store_lsb_c) begin
            // lower half of the full-size product
            c_lsb_s1_shreg <= {c_word_lower, c_lsb_s1_shreg[32 +: 224]};
        end else if (flag_store_lsb_s1) begin
            // words of S1
            c_lsb_s1_shreg <= {add47_r3_s_lower, c_lsb_s1_shreg[32 +: 224]};
        end else if (flag_store_pn) begin
            // words of PN
            c_lsb_s1_shreg <= {sub32_pn_d, c_lsb_s1_shreg[32 +: 224]};
        end else if (flag_store_msb_c || flag_shift_s1 || flag_update_p_dout) begin
            // contents are only being consumed: refill with zeroes
            c_lsb_s1_shreg <= {32'd0, c_lsb_s1_shreg[32 +: 224]};
        end
    end


    // Result selector, consumed by the p_dout multiplexer: 1 picks the words
    // held in c_lsb_s1_shreg (where the sub32_pn_d words are stored), 0 picks
    // the words held in buf_a_wide (where the add47_s2_s words are stored).
    reg sel_pn; // 0: output in S2, 1: output in PN
    
    always @(posedge clk)
        //
        // latched once per multiplication, during the single-cycle select phase
        // NOTE(review): the meaning of this expression depends on the polarity
        // of b_out of the SUB32 primitive, which is not visible here -- confirm.
        if (flag_select_s2_or_pn) sel_pn <= sub32_b_out & add47_s2_s[0];
        
        
    reg [31:0] p_dout_reg;

    assign p_dout = p_dout_reg;

    // candidate output word: lowest word of whichever buffer holds the result
    wire [31:0] p_dout_sel = sel_pn ? c_lsb_s1_shreg[31:0] : buf_a_wide[31:0];

    always @(posedge clk) begin
        if (flag_update_p_dout) begin
            // output phase: present the next word of the selected result
            p_dout_reg <= p_dout_sel;
        end else begin
            // outside of the output phase the port value is don't-care
            p_dout_reg <= 32'bx;
        end
    end
        

    reg                          p_wren_reg = 1'b0;
    reg [WORD_COUNTER_WIDTH-1:0] p_addr_reg;

    // map registers to output ports
    assign p_wren = p_wren_reg;
    assign p_addr = p_addr_reg;

    always @(posedge clk) begin
        // write enable trails the output phase flag by one clock, which lines
        // it up with the registered p_dout
        p_wren_reg <= flag_update_p_dout;

        // the address stays at zero until the first word is being written and
        // then advances once per written word
        if (p_wren_reg) begin
            p_addr_reg <= WORD_INDEX_NEXT_OR_ZERO(p_addr_reg);
        end else begin
            p_addr_reg <= WORD_INDEX_ZERO;
        end
    end
    
        
endmodule


//------------------------------------------------------------------------------
// End-of-File
//------------------------------------------------------------------------------