path: root/ecdsa_fpga_modular.cpp



//------------------------------------------------------------------------------
//
// ecdsa_fpga_modular.cpp
// -------------------------------------
// Modular arithmetic routines for ECDSA
//
// Authors: Pavel Shatov
//
// Copyright 2015-2016, 2018 NORDUnet A/S
// Copyright 2021 The Commons Conservancy Cryptech Project
// SPDX-License-Identifier: BSD-3-Clause
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// - Neither the name of the copyright holder nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------


//------------------------------------------------------------------------------
// Headers
//------------------------------------------------------------------------------
#include "ecdsa_fpga_model.h"


//------------------------------------------------------------------------------
// Globals
//------------------------------------------------------------------------------
FPGA_BUFFER ECDSA_Q;
FPGA_BUFFER ECDSA_DELTA;


//------------------------------------------------------------------------------
// Settings
//------------------------------------------------------------------------------
bool _DUMP_MODULAR_RESULTS = false;


//------------------------------------------------------------------------------
void fpga_modular_init()
//------------------------------------------------------------------------------
{
    int w_src, w_dst;   // word counters

        // temporary things
    FPGA_WORD TMP_Q    [FPGA_OPERAND_NUM_WORDS] = ECDSA_Q_INIT;
    FPGA_WORD TMP_DELTA[FPGA_OPERAND_NUM_WORDS] = ECDSA_DELTA_INIT;

        /* fill buffers for large multi-word integers, we need to fill them in
         * reverse order because of the way C arrays are initialized
         */
    for (   w_src = 0, w_dst = FPGA_OPERAND_NUM_WORDS - 1;
            w_src < FPGA_OPERAND_NUM_WORDS;
            w_src++, w_dst--)
    {
        ECDSA_Q.words[w_dst] = TMP_Q[w_src];
        ECDSA_DELTA.words[w_dst] = TMP_DELTA[w_src];
    }
}


//------------------------------------------------------------------------------
//
// Modular addition.
//
// This routine implements algorithm 3. from "Ultra High Performance ECC over
// NIST Primes on Commercial FPGAs".
//
// s = (a + b) mod q
//
// The naive approach is like the following:
//
// 1. s = a + b
// 2. if (s >= q) s -= q
//
// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q)
// and then select the right variant.
//
//------------------------------------------------------------------------------
void fpga_modular_add(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *s)
//------------------------------------------------------------------------------
{
    int w;                  // word counter
    FPGA_BUFFER ab, ab_n;   // intermediate buffers
    bool c_in, c_out;       // carries
    bool b_in, b_out;       // borrows

    c_in = false;           // first word has no carry
    b_in = false;           // first word has no borrow
    
        // run parallel addition and subtraction
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
    {
        fpga_lowlevel_add32(a->words[w], b->words[w],      c_in, &ab.words[w],   &c_out);
        fpga_lowlevel_sub32(ab.words[w], ECDSA_Q.words[w], b_in, &ab_n.words[w], &b_out);

        c_in = c_out;   // propagate carry
        b_in = b_out;   // propagate borrow
    }

        // now select the right buffer

        /*
         * We select the right variant based on borrow and carry flags after
         * addition and subtraction of the very last pair of words. Note, that
         * we only need to select the first variant (a + b) when (a + b) < q.
         * This way if we get negative number after subtraction, we discard it
         * and use the output of the adder instead. The subtractor output is
         * negative when borrow flag is set *and* carry flag is not set. When
         * both borrow and carry are set, the number is non-negative, because
         * borrow and carry cancel each other out.
         */
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
        s->words[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w];

    if (_DUMP_MODULAR_RESULTS)
        dump_uop_output("ADD", s);
}


//------------------------------------------------------------------------------
//
// Modular subtraction.
//
// This routine implements algorithm 3. from "Ultra High Performance ECC over
// NIST Primes on Commercial FPGAs".
//
// d = (a - b) mod q
//
// The naive approach is like the following:
//
// 1. d = a - b
// 2. if (a < b) d += q
//
// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q)
// and then select the right variant.
//
//------------------------------------------------------------------------------
void fpga_modular_sub(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *d)
//------------------------------------------------------------------------------
{
    int w;                  // word counter
    FPGA_BUFFER ab, ab_n;   // intermediate buffers
    bool c_in, c_out;       // carries
    bool b_in, b_out;       // borrows

    c_in = false;           // first word has no carry
    b_in = false;           // first word has no borrow
    
        // run parallel subtraction and addition
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
    {
        fpga_lowlevel_sub32(a->words[w], b->words[w],      b_in, &ab.words[w],   &b_out);
        fpga_lowlevel_add32(ab.words[w], ECDSA_Q.words[w], c_in, &ab_n.words[w], &c_out);

        b_in = b_out;   // propagate borrow
        c_in = c_out;   // propagate carry
    }

        // now select the right buffer

        /*
         * We select the right variant based on borrow flag after subtraction
         * and addition of the very last pair of words. Note, that we only
         * need to select the second variant (a - b + q) when a < b. This way
         * if we get negative number after subtraction, we discard it
         * and use the output of the adder instead. The Subtractor output is
         * negative when borrow flag is set.
         */
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
        d->words[w] = b_out ? ab_n.words[w] : ab.words[w];

    if (_DUMP_MODULAR_RESULTS)
        dump_uop_output("SUB", d);
}


//------------------------------------------------------------------------------
//
// Modular multiplication.
//
// This routine implements modular multiplication algorithm from "Ultra High
// Performance ECC over NIST Primes on Commercial FPGAs".
//
// p = (a * b) mod q
//
// The complex algorithm is split into three parts:
//
// 1. Calculation of partial words
// 2. Acccumulation of partial words into full-size product
// 3. Modular reduction of the full-size product
//
// See comments for corresponding helper routines for more information.
//
//------------------------------------------------------------------------------
void fpga_modular_mul(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *p)
//------------------------------------------------------------------------------
{
    FPGA_WORD_EXTENDED si[4*FPGA_OPERAND_NUM_WORDS-1];  // parts of intermediate product
    FPGA_WORD c[2*FPGA_OPERAND_NUM_WORDS];              // full-size intermediate product

        /* save debug flag */
    bool _save_dump_modular_results = _DUMP_MODULAR_RESULTS;

        /* mask debug flag to not garble output */
    _DUMP_MODULAR_RESULTS = false;

        /* multiply to get partial words */
    fpga_modular_mul_helper_multiply(a, b, si);

        /* accumulate partial words into full-size product */
    fpga_modular_mul_helper_accumulate(si, c);

        /* reduce full-size product using special routine */
    fpga_modular_mul_helper_reduce(c, p);

        /* restore debug flag */
    _DUMP_MODULAR_RESULTS = _save_dump_modular_results;

        /* now dump result if needed */
    if (_DUMP_MODULAR_RESULTS)
        dump_uop_output("MUL", p);
}


//------------------------------------------------------------------------------
//
// Parallelized multiplication.
//
// This routine implements the algorithm in Fig. 3. from "Ultra High
// Performance ECC over NIST Primes on Commercial FPGAs".
//
// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2
// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide
// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS
// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial
// output words si[].
//
// The order of loading a and b into the multipliers is a bit complicated,
// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is
// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the
// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously.
//
//------------------------------------------------------------------------------
void fpga_modular_mul_helper_multiply(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si)
//------------------------------------------------------------------------------
{
    int w;          // counter
    int t, x;       // more counters
    int j, i;       // word indices
    FPGA_WORD p;    // product

        // buffers for smaller words that multipliers can handle
    FPGA_WORD_REDUCED ai[2*FPGA_OPERAND_NUM_WORDS];
    FPGA_WORD_REDUCED bj[2*FPGA_OPERAND_NUM_WORDS];
    
        // split a and b into smaller words
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
        ai[2*w] = (FPGA_WORD_REDUCED)a->words[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)),
        bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2));

        // accumulators
    FPGA_WORD_EXTENDED mac[2*FPGA_OPERAND_NUM_WORDS];
    
        // clear accumulators
    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) mac[w] = 0;

        // run the crazy algorithm :)
    for (t=0; t<(2*FPGA_OPERAND_NUM_WORDS); t++)
    {
            // save upper half of si[] (one word per cycle)
        if (t > 0)
        {   si[4*FPGA_OPERAND_NUM_WORDS - (t+1)] = mac[t];
            mac[t] = 0;
        }

            // update index
        j = 2*FPGA_OPERAND_NUM_WORDS - (t+1);

            // parallel multiplication
        for (x=0; x<(2*FPGA_OPERAND_NUM_WORDS); x++)
        {
                // update index
            i = t - x;
            if (i < 0) i += 2*FPGA_OPERAND_NUM_WORDS;

                // multiply...
            fpga_lowlevel_mul16(ai[i], bj[j], &p);

                // ...accumulate
            mac[x] += p;
        }

    }

        // now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once)
    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++)
        si[w] = mac[2*FPGA_OPERAND_NUM_WORDS - (w+1)];
}


//------------------------------------------------------------------------------
//
// Accumulation of partial words into full-size product.
//
// This routine implements the Algorithm 4. from "Ultra High Performance ECC
// over NIST Primes on Commercial FPGAs".
//
// Input words si[] are accumulated into full-size product c[].
//
// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in
// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are
// summed in full, odd words are split into two parts. During every cycle the
// upper part of the previous word and the lower part of the following word are
// summed too.
//
//------------------------------------------------------------------------------
void fpga_modular_mul_helper_accumulate(const FPGA_WORD_EXTENDED *si, FPGA_WORD *c)
//------------------------------------------------------------------------------
{
    int w;                          // word counter
    FPGA_WORD_EXTENDED cw0, cw1;    // intermediate sums
    FPGA_WORD_REDUCED  cw_carry;    // wide carry

        // clear carry
    cw_carry = 0;

        // execute the algorithm
    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++)
    {
            // handy flags
        bool w_is_first = (w == 0);
        bool w_is_last  = (w == (2*FPGA_OPERAND_NUM_WORDS-1));

            // accumulate full current even word...
            // ...and also the upper part of the previous odd word (if not the first word)
        fpga_lowlevel_add47(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0);

            // generate another word from "carry" part of the previous even word...
            // ...and also the lower part of the following odd word (if not the last word)
        cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2));
        cw1 |= (FPGA_WORD_EXTENDED)cw_carry;

            // accumulate once again
        fpga_lowlevel_add47(cw0, cw1, &cw1);

            // store current word...
        c[w] = (FPGA_WORD)cw1;

            // ...and carry
        cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH);
    }
}


//------------------------------------------------------------------------------
//
// Fast modular reduction for NIST prime P-256.
//
// p = c mod p256
//
// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve
// Cryptography".
//
// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
// on the other hand is the output of the parallelized Comba multiplier, so it
// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
//
// To save FPGA resources, the calculation is done using only two adders and
// one subtractor. The algorithm is split into five steps.
//
//------------------------------------------------------------------------------
#if USE_CURVE == 1
void fpga_modular_mul_helper_reduce_p256(const FPGA_WORD *c, FPGA_BUFFER *p)
{
        // "funny" words
    FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9;

        // compose "funny" words out of input words
    s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0];
    s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0,     s2.words[1] = 0,     s2.words[0] = 0;
    s3.words[7] = 0,     s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0,     s3.words[1] = 0,     s3.words[0] = 0;
    s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0,     s4.words[4] = 0,     s4.words[3] = 0,     s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8];
    s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9];
    s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0,     s6.words[4] = 0,     s6.words[3] = 0,     s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11];
    s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0,     s7.words[4] = 0,     s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12];
    s8.words[7] = c[12], s8.words[6] = 0,     s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13];
    s9.words[7] = c[13], s9.words[6] = 0,     s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0,     s9.words[1] = c[15], s9.words[0] = c[14];
    
        // intermediate results
    FPGA_BUFFER sum0, sum1, difference;

        /* Step 1. */
    fpga_modular_add(&s2,         &s2,         &sum0);          // sum0 = 2*s2
    fpga_modular_add(&s3,         &s3,         &sum1);          // sum1 = 2*s3
    fpga_modular_sub(&ECDSA_ZERO, &s6,         &difference);    // difference = -s6

        /* Step 2. */
    fpga_modular_add(&sum0,       &s1,         &sum0);          // sum0 = s1 + 2*s2
    fpga_modular_add(&sum1,       &s4,         &sum1);          // sum1 = s4 + 2*s3
    fpga_modular_sub(&difference, &s7,         &difference);    // difference = -(s6 + s7)

        /* Step 3. */
    fpga_modular_add(&sum0,       &s5,         &sum0);          // sum0 = s1 + 2*s2 + s5
    fpga_modular_add(&sum1,       &ECDSA_ZERO, &sum1);          // compulsory cycle to keep sum1 constant for next stage
    fpga_modular_sub(&difference, &s8,         &difference);    // difference = -(s6 + s7 + s8)

        /* Step 4. */
    fpga_modular_add(&sum0,       &sum1,       &sum0);          // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
//  fpga_modular_add(<dummy>,     <dummy>,     &sum1);          // dummy cycle, result ignored
    fpga_modular_sub(&difference, &s9,         &difference);    // difference = -(s6 + s7 + s8 + s9)

        /* Step 5. */
    fpga_modular_add(&sum0,       &difference, p);              // p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
//  fpga_modular_add(<dummy>,     <dummy>,     &sum1);          // dummy cycle, result ignored
//  fpga_modular_add(<dummy>,     <dummy>,     &difference);    // dummy cycle, result ignored
}
#endif


//------------------------------------------------------------------------------
//
// Fast modular reduction for NIST prime P-384.
//
// p = c mod p384
//
// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
// Cryptography".
//
// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
// on the other hand is the output of the parallelized Comba multiplier, so it
// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
//
// To save FPGA resources, the calculation is done using only two adders and
// one subtractor. The algorithm is split into five steps.
//
//------------------------------------------------------------------------------
#if USE_CURVE == 2
void fpga_modular_mul_helper_reduce_p384(const FPGA_WORD *c, FPGA_BUFFER *p)
{
		// "funny" words
	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

		// compose "funny" words
	 s1.words[11] = c[11],   s1.words[10] = c[10],   s1.words[ 9] = c[ 9],   s1.words[ 8] = c[ 8],   s1.words[ 7] = c[ 7],   s1.words[ 6] = c[ 6],   s1.words[ 5] = c[ 5],   s1.words[ 4] = c[ 4],   s1.words[ 3] = c[ 3],   s1.words[ 2] = c[ 2],   s1.words[ 1] = c[ 1],   s1.words[ 0] = c[ 0];
	 s2.words[11] = 0,       s2.words[10] = 0,       s2.words[ 9] = 0,       s2.words[ 8] = 0,       s2.words[ 7] = 0,       s2.words[ 6] = c[23],   s2.words[ 5] = c[22],   s2.words[ 4] = c[21],   s2.words[ 3] = 0,       s2.words[ 2] = 0,       s2.words[ 1] = 0,       s2.words[ 0] = 0;
	 s3.words[11] = c[23],   s3.words[10] = c[22],   s3.words[ 9] = c[21],   s3.words[ 8] = c[20],   s3.words[ 7] = c[19],   s3.words[ 6] = c[18],   s3.words[ 5] = c[17],   s3.words[ 4] = c[16],   s3.words[ 3] = c[15],   s3.words[ 2] = c[14],   s3.words[ 1] = c[13],   s3.words[ 0] = c[12];
	 s4.words[11] = c[20],   s4.words[10] = c[19],   s4.words[ 9] = c[18],   s4.words[ 8] = c[17],   s4.words[ 7] = c[16],   s4.words[ 6] = c[15],   s4.words[ 5] = c[14],   s4.words[ 4] = c[13],   s4.words[ 3] = c[12],   s4.words[ 2] = c[23],   s4.words[ 1] = c[22],   s4.words[ 0] = c[21];
	 s5.words[11] = c[19],   s5.words[10] = c[18],   s5.words[ 9] = c[17],   s5.words[ 8] = c[16],   s5.words[ 7] = c[15],   s5.words[ 6] = c[14],   s5.words[ 5] = c[13],   s5.words[ 4] = c[12],   s5.words[ 3] = c[20],   s5.words[ 2] = 0,       s5.words[ 1] = c[23],   s5.words[ 0] = 0;
	 s6.words[11] = 0,       s6.words[10] = 0,       s6.words[ 9] = 0,       s6.words[ 8] = 0,       s6.words[ 7] = c[23],   s6.words[ 6] = c[22],   s6.words[ 5] = c[21],   s6.words[ 4] = c[20],   s6.words[ 3] = 0,       s6.words[ 2] = 0,       s6.words[ 1] = 0,       s6.words[ 0] = 0;
	 s7.words[11] = 0,       s7.words[10] = 0,       s7.words[ 9] = 0,       s7.words[ 8] = 0,       s7.words[ 7] = 0,       s7.words[ 6] = 0,       s7.words[ 5] = c[23],   s7.words[ 4] = c[22],   s7.words[ 3] = c[21],   s7.words[ 2] = 0,       s7.words[ 1] = 0,       s7.words[ 0] = c[20];
	 s8.words[11] = c[22],   s8.words[10] = c[21],   s8.words[ 9] = c[20],   s8.words[ 8] = c[19],   s8.words[ 7] = c[18],   s8.words[ 6] = c[17],   s8.words[ 5] = c[16],   s8.words[ 4] = c[15],   s8.words[ 3] = c[14],   s8.words[ 2] = c[13],   s8.words[ 1] = c[12],   s8.words[ 0] = c[23];
	 s9.words[11] = 0,       s9.words[10] = 0,       s9.words[ 9] = 0,       s9.words[ 8] = 0,       s9.words[ 7] = 0,       s9.words[ 6] = 0,       s9.words[ 5] = 0,       s9.words[ 4] = c[23],   s9.words[ 3] = c[22],   s9.words[ 2] = c[21],   s9.words[ 1] = c[20],   s9.words[ 0] = 0;
	s10.words[11] = 0,      s10.words[10] = 0,      s10.words[ 9] = 0,      s10.words[ 8] = 0,      s10.words[ 7] = 0,      s10.words[ 6] = 0,      s10.words[ 5] = 0,      s10.words[ 4] = c[23],  s10.words[ 3] = c[23],  s10.words[ 2] = 0,      s10.words[ 1] = 0,      s10.words[ 0] = 0;

		// intermediate results
	FPGA_BUFFER sum0, sum1, difference;

		/* Step 1. */
	fpga_modular_add(&s1,         &s3,         &sum0);			// sum0 = s1 + s3
	fpga_modular_add(&s2,         &s2,         &sum1);			// sum1 = 2*s2
	fpga_modular_sub(&ECDSA_ZERO, &s8,         &difference);	// difference = -s8

		/* Step 2. */
	fpga_modular_add(&sum0,       &s4,         &sum0);			// sum0 = s1 + s3 + s4
	fpga_modular_add(&sum1,       &s5,         &sum1);			// sum1 = 2*s2 + s5
	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s8 + s9)

		/* Step 3. */
	fpga_modular_add(&sum0,       &s6,         &sum0);			// sum0 = s1 + s3 + s4 + s6
	fpga_modular_add(&sum1,       &s7,         &sum1);			// sum1 = 2*s2 + s5 + s7
	fpga_modular_sub(&difference, &s10,        &difference);	// difference = -(s8 + s9 + s10)

		/* Step 4. */
	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
	fpga_modular_sub(&difference, &ECDSA_ZERO, &difference);	// compulsory cycle to keep difference constant for next stage

		/* Step 5. */
	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
}
#endif


#if USE_CURVE == 1
//------------------------------------------------------------------------------
void fpga_modular_inv23_p256(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
//------------------------------------------------------------------------------
//
// This uses the addition chain from
//
// < https://briansmith.org/ecc-inversion-addition-chains-01 >
//
// to calculate A2 = A^-2 and A3 = A^-3.
//
//------------------------------------------------------------------------------
{
    // counter
	int cyc_cnt;

    // working variables
	FPGA_BUFFER R1, R2, X1, X2, X3, X6, X12, X15, X30, X32;

    // first obtain intermediate helper quantities (X1..X32)

    // X1
	fpga_multiword_copy(A, &X1);

    // X2
	fpga_modular_mul(&X1, &X1, &R1);
	fpga_modular_mul(&R1, &X1, &X2);

    // X3
	fpga_modular_mul(&X2, &X2, &R1);
	fpga_modular_mul(&R1, &X1, &X3);

    // X6
	fpga_multiword_copy(&X3, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X6);

    // X12
	fpga_multiword_copy(&X6, &R1);
	for (cyc_cnt=0; cyc_cnt<6; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X6, &X12);

    // X15
	fpga_multiword_copy(&X12, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X15);

    // X30
	fpga_multiword_copy(&X15, &R1);
	for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X15, &X30);

    // X32
	fpga_multiword_copy(&X30, &R1);
	for (cyc_cnt=0; cyc_cnt<2; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X2, &X32);

	// now compute the final results

	fpga_multiword_copy(&X32, &R1);
	for (cyc_cnt=0; cyc_cnt<32; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X1, &R2);

	for (cyc_cnt=0; cyc_cnt<128; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
	fpga_modular_mul(&R2, &X32, &R1);

	for (cyc_cnt=0; cyc_cnt<32; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X32, &R2);

	for (cyc_cnt=0; cyc_cnt<30; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
	fpga_modular_mul(&R2, &X30, &R1);

	fpga_modular_mul(&R1, &R1, &R2);
	fpga_modular_mul(&R2, &R2, &R1);

    // A2 obtained
	fpga_multiword_copy(&R1, A2);

    // now calculate compute inverse cubed from inverse squared
	fpga_modular_mul(&R1, &R1, &R2);
	fpga_modular_mul(&R2, A,   &R1);

    // A3 obtained
	fpga_multiword_copy(&R1, A3);
}
#endif


#if USE_CURVE == 2
//------------------------------------------------------------------------------
void fpga_modular_inv23_p384(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
//------------------------------------------------------------------------------
//
// This uses the addition chain from
//
// < https://briansmith.org/ecc-inversion-addition-chains-01 >
//
// to calculate A2 = A^-2 and A3 = A^-3.
//
//------------------------------------------------------------------------------
{
    // counter
	int cyc_cnt;

    // working variables
	FPGA_BUFFER R1, R2, X1, X2, X3, X6, X12, X15, X30, X60, X120;

    // first obtain intermediate helper quantities (X1..X120)

    // X1
	fpga_multiword_copy(A, &X1);

    // X2
	fpga_modular_mul(&X1, &X1, &R1);
	fpga_modular_mul(&R1, &X1, &X2);

    // X3
	fpga_modular_mul(&X2, &X2, &R1);
	fpga_modular_mul(&R1, &X1, &X3);

    // X6
	fpga_multiword_copy(&X3, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X6);

    // X12
	fpga_multiword_copy(&X6, &R1);
	for (cyc_cnt=0; cyc_cnt<6; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X6, &X12);

    // X15
	fpga_multiword_copy(&X12, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X15);

    // X30
	fpga_multiword_copy(&X15, &R1);
	for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X15, &X30);

    // X60
	fpga_multiword_copy(&X30, &R1);
	for (cyc_cnt=0; cyc_cnt<30; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X30, &X60);

    // X120
	fpga_multiword_copy(&X60, &R1);
	for (cyc_cnt=0; cyc_cnt<60; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X60, &X120);

	// now compute the final results

	fpga_multiword_copy(&X120, &R1);
	for (cyc_cnt=0; cyc_cnt<120; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X120, &R2);

    for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
    fpga_modular_mul(&R1, &X15, &R2);

    for (cyc_cnt=0; cyc_cnt<31; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
    fpga_modular_mul(&R1, &X30, &R2);
    fpga_modular_mul(&R2, &R2, &R1);
    fpga_modular_mul(&R1, &R1, &R2);
    fpga_modular_mul(&R2, &X2, &R1);

    for (cyc_cnt=0; cyc_cnt<94; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
    fpga_modular_mul(&R1, &X30, &R2);
    fpga_modular_mul(&R2, &R2, &R1);
    fpga_modular_mul(&R1, &R1, &R2);

    // A2 obtained
	fpga_multiword_copy(&R2, A2);

    // now calculate compute inverse cubed from inverse squared
	fpga_modular_mul(&R2, &R2, &R1);
	fpga_modular_mul(&R1, A,   &R2);

    // A3 obtained
	fpga_multiword_copy(&R2, A3);
}
#endif

//------------------------------------------------------------------------------
// End-of-File
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// ecdsa_fpga_modular.cpp
// -------------------------------------
// Modular arithmetic routines for ECDSA
//
// Authors: Pavel Shatov
//
// Copyright 2015-2016, 2018 NORDUnet A/S
// Copyright 2021 The Commons Conservancy Cryptech Project
// SPDX-License-Identifier: BSD-3-Clause
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// - Neither the name of the copyright holder nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------


//------------------------------------------------------------------------------
// Headers
//------------------------------------------------------------------------------
#include "ecdsa_fpga_model.h"


//------------------------------------------------------------------------------
// Globals
//------------------------------------------------------------------------------
FPGA_BUFFER ECDSA_Q;
FPGA_BUFFER ECDSA_DELTA;


//------------------------------------------------------------------------------
// Settings
//------------------------------------------------------------------------------
bool _DUMP_MODULAR_RESULTS = false;


//------------------------------------------------------------------------------
void fpga_modular_init()
//------------------------------------------------------------------------------
{
    int w_src, w_dst;   // word counters

        // temporary things
    FPGA_WORD TMP_Q    [FPGA_OPERAND_NUM_WORDS] = ECDSA_Q_INIT;
    FPGA_WORD TMP_DELTA[FPGA_OPERAND_NUM_WORDS] = ECDSA_DELTA_INIT;

        /* fill buffers for large multi-word integers, we need to fill them in
         * reverse order because of the way C arrays are initialized
         */
    for (   w_src = 0, w_dst = FPGA_OPERAND_NUM_WORDS - 1;
            w_src < FPGA_OPERAND_NUM_WORDS;
            w_src++, w_dst--)
    {
        ECDSA_Q.words[w_dst] = TMP_Q[w_src];
        ECDSA_DELTA.words[w_dst] = TMP_DELTA[w_src];
    }
}


//------------------------------------------------------------------------------
//
// Modular addition.
//
// This routine implements algorithm 3. from "Ultra High Performance ECC over
// NIST Primes on Commercial FPGAs".
//
// s = (a + b) mod q
//
// The naive approach is like the following:
//
// 1. s = a + b
// 2. if (s >= q) s -= q
//
// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q)
// and then select the right variant.
//
//------------------------------------------------------------------------------
void fpga_modular_add(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *s)
//------------------------------------------------------------------------------
{
    int w;                  // word counter
    FPGA_BUFFER ab, ab_n;   // intermediate buffers
    bool c_in, c_out;       // carries
    bool b_in, b_out;       // borrows

    c_in = false;           // first word has no carry
    b_in = false;           // first word has no borrow
    
        // run parallel addition and subtraction
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
    {
        fpga_lowlevel_add32(a->words[w], b->words[w],      c_in, &ab.words[w],   &c_out);
        fpga_lowlevel_sub32(ab.words[w], ECDSA_Q.words[w], b_in, &ab_n.words[w], &b_out);

        c_in = c_out;   // propagate carry
        b_in = b_out;   // propagate borrow
    }

        // now select the right buffer

        /*
         * We select the right variant based on borrow and carry flags after
         * addition and subtraction of the very last pair of words. Note, that
         * we only need to select the first variant (a + b) when (a + b) < q.
         * This way if we get negative number after subtraction, we discard it
         * and use the output of the adder instead. The subtractor output is
         * negative when borrow flag is set *and* carry flag is not set. When
         * both borrow and carry are set, the number is non-negative, because
         * borrow and carry cancel each other out.
         */
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
        s->words[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w];

    if (_DUMP_MODULAR_RESULTS)
        dump_uop_output("ADD", s);
}


//------------------------------------------------------------------------------
//
// Modular subtraction.
//
// This routine implements algorithm 3. from "Ultra High Performance ECC over
// NIST Primes on Commercial FPGAs".
//
// d = (a - b) mod q
//
// The naive approach is like the following:
//
// 1. d = a - b
// 2. if (a < b) d += q
//
// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q)
// and then select the right variant.
//
//------------------------------------------------------------------------------
void fpga_modular_sub(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *d)
//------------------------------------------------------------------------------
{
    int w;                  // word counter
    FPGA_BUFFER ab, ab_n;   // intermediate buffers
    bool c_in, c_out;       // carries
    bool b_in, b_out;       // borrows

    c_in = false;           // first word has no carry
    b_in = false;           // first word has no borrow
    
        // run parallel subtraction and addition
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
    {
        fpga_lowlevel_sub32(a->words[w], b->words[w],      b_in, &ab.words[w],   &b_out);
        fpga_lowlevel_add32(ab.words[w], ECDSA_Q.words[w], c_in, &ab_n.words[w], &c_out);

        b_in = b_out;   // propagate borrow
        c_in = c_out;   // propagate carry
    }

        // now select the right buffer

        /*
         * We select the right variant based on borrow flag after subtraction
         * and addition of the very last pair of words. Note, that we only
         * need to select the second variant (a - b + q) when a < b. This way
         * if we get negative number after subtraction, we discard it
         * and use the output of the adder instead. The Subtractor output is
         * negative when borrow flag is set.
         */
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
        d->words[w] = b_out ? ab_n.words[w] : ab.words[w];

    if (_DUMP_MODULAR_RESULTS)
        dump_uop_output("SUB", d);
}


//------------------------------------------------------------------------------
//
// Modular multiplication.
//
// This routine implements modular multiplication algorithm from "Ultra High
// Performance ECC over NIST Primes on Commercial FPGAs".
//
// p = (a * b) mod q
//
// The complex algorithm is split into three parts:
//
// 1. Calculation of partial words
// 2. Acccumulation of partial words into full-size product
// 3. Modular reduction of the full-size product
//
// See comments for corresponding helper routines for more information.
//
//------------------------------------------------------------------------------
void fpga_modular_mul(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *p)
//------------------------------------------------------------------------------
{
    FPGA_WORD_EXTENDED si[4*FPGA_OPERAND_NUM_WORDS-1];  // parts of intermediate product
    FPGA_WORD c[2*FPGA_OPERAND_NUM_WORDS];              // full-size intermediate product

        /* save debug flag */
    bool _save_dump_modular_results = _DUMP_MODULAR_RESULTS;

        /* mask debug flag to not garble output */
    _DUMP_MODULAR_RESULTS = false;

        /* multiply to get partial words */
    fpga_modular_mul_helper_multiply(a, b, si);

        /* accumulate partial words into full-size product */
    fpga_modular_mul_helper_accumulate(si, c);

        /* reduce full-size product using special routine */
    fpga_modular_mul_helper_reduce(c, p);

        /* restore debug flag */
    _DUMP_MODULAR_RESULTS = _save_dump_modular_results;

        /* now dump result if needed */
    if (_DUMP_MODULAR_RESULTS)
        dump_uop_output("MUL", p);
}


//------------------------------------------------------------------------------
//
// Parallelized multiplication.
//
// This routine implements the algorithm in Fig. 3. from "Ultra High
// Performance ECC over NIST Primes on Commercial FPGAs".
//
// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2
// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide
// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS
// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial
// output words si[].
//
// The order of loading a and b into the multipliers is a bit complicated,
// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is
// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the
// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously.
//
//------------------------------------------------------------------------------
void fpga_modular_mul_helper_multiply(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si)
//------------------------------------------------------------------------------
{
    int w;          // counter
    int t, x;       // more counters
    int j, i;       // word indices
    FPGA_WORD p;    // product

        // buffers for smaller words that multipliers can handle
    FPGA_WORD_REDUCED ai[2*FPGA_OPERAND_NUM_WORDS];
    FPGA_WORD_REDUCED bj[2*FPGA_OPERAND_NUM_WORDS];
    
        // split a and b into smaller words
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
        ai[2*w] = (FPGA_WORD_REDUCED)a->words[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)),
        bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2));

        // accumulators
    FPGA_WORD_EXTENDED mac[2*FPGA_OPERAND_NUM_WORDS];
    
        // clear accumulators
    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) mac[w] = 0;

        // run the crazy algorithm :)
    for (t=0; t<(2*FPGA_OPERAND_NUM_WORDS); t++)
    {
            // save upper half of si[] (one word per cycle)
        if (t > 0)
        {   si[4*FPGA_OPERAND_NUM_WORDS - (t+1)] = mac[t];
            mac[t] = 0;
        }

            // update index
        j = 2*FPGA_OPERAND_NUM_WORDS - (t+1);

            // parallel multiplication
        for (x=0; x<(2*FPGA_OPERAND_NUM_WORDS); x++)
        {
                // update index
            i = t - x;
            if (i < 0) i += 2*FPGA_OPERAND_NUM_WORDS;

                // multiply...
            fpga_lowlevel_mul16(ai[i], bj[j], &p);

                // ...accumulate
            mac[x] += p;
        }

    }

        // now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once)
    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++)
        si[w] = mac[2*FPGA_OPERAND_NUM_WORDS - (w+1)];
}


//------------------------------------------------------------------------------
//
// Accumulation of partial words into full-size product.
//
// This routine implements the Algorithm 4. from "Ultra High Performance ECC
// over NIST Primes on Commercial FPGAs".
//
// Input words si[] are accumulated into full-size product c[].
//
// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in
// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are
// summed in full, odd words are split into two parts. During every cycle the
// upper part of the previous word and the lower part of the following word are
// summed too.
//
//------------------------------------------------------------------------------
void fpga_modular_mul_helper_accumulate(const FPGA_WORD_EXTENDED *si, FPGA_WORD *c)
//------------------------------------------------------------------------------
{
    int w;                          // word counter
    FPGA_WORD_EXTENDED cw0, cw1;    // intermediate sums
    FPGA_WORD_REDUCED  cw_carry;    // wide carry

        // clear carry
    cw_carry = 0;

        // execute the algorithm
    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++)
    {
            // handy flags
        bool w_is_first = (w == 0);
        bool w_is_last  = (w == (2*FPGA_OPERAND_NUM_WORDS-1));

            // accumulate full current even word...
            // ...and also the upper part of the previous odd word (if not the first word)
        fpga_lowlevel_add47(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0);

            // generate another word from "carry" part of the previous even word...
            // ...and also the lower part of the following odd word (if not the last word)
        cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2));
        cw1 |= (FPGA_WORD_EXTENDED)cw_carry;

            // accumulate once again
        fpga_lowlevel_add47(cw0, cw1, &cw1);

            // store current word...
        c[w] = (FPGA_WORD)cw1;

            // ...and carry
        cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH);
    }
}


//------------------------------------------------------------------------------
//
// Fast modular reduction for NIST prime P-256.
//
// p = c mod p256
//
// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve
// Cryptography".
//
// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
// on the other hand is the output of the parallelized Comba multiplier, so it
// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
//
// To save FPGA resources, the calculation is done using only two adders and
// one subtractor. The algorithm is split into five steps.
//
//------------------------------------------------------------------------------
#if USE_CURVE == 1
void fpga_modular_mul_helper_reduce_p256(const FPGA_WORD *c, FPGA_BUFFER *p)
{
        // "funny" words
    FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9;

        // compose "funny" words out of input words
    s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0];
    s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0,     s2.words[1] = 0,     s2.words[0] = 0;
    s3.words[7] = 0,     s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0,     s3.words[1] = 0,     s3.words[0] = 0;
    s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0,     s4.words[4] = 0,     s4.words[3] = 0,     s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8];
    s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9];
    s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0,     s6.words[4] = 0,     s6.words[3] = 0,     s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11];
    s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0,     s7.words[4] = 0,     s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12];
    s8.words[7] = c[12], s8.words[6] = 0,     s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13];
    s9.words[7] = c[13], s9.words[6] = 0,     s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0,     s9.words[1] = c[15], s9.words[0] = c[14];
    
        // intermediate results
    FPGA_BUFFER sum0, sum1, difference;

        /* Step 1. */
    fpga_modular_add(&s2,         &s2,         &sum0);          // sum0 = 2*s2
    fpga_modular_add(&s3,         &s3,         &sum1);          // sum1 = 2*s3
    fpga_modular_sub(&ECDSA_ZERO, &s6,         &difference);    // difference = -s6

        /* Step 2. */
    fpga_modular_add(&sum0,       &s1,         &sum0);          // sum0 = s1 + 2*s2
    fpga_modular_add(&sum1,       &s4,         &sum1);          // sum1 = s4 + 2*s3
    fpga_modular_sub(&difference, &s7,         &difference);    // difference = -(s6 + s7)

        /* Step 3. */
    fpga_modular_add(&sum0,       &s5,         &sum0);          // sum0 = s1 + 2*s2 + s5
    fpga_modular_add(&sum1,       &ECDSA_ZERO, &sum1);          // compulsory cycle to keep sum1 constant for next stage
    fpga_modular_sub(&difference, &s8,         &difference);    // difference = -(s6 + s7 + s8)

        /* Step 4. */
    fpga_modular_add(&sum0,       &sum1,       &sum0);          // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
//  fpga_modular_add(<dummy>,     <dummy>,     &sum1);          // dummy cycle, result ignored
    fpga_modular_sub(&difference, &s9,         &difference);    // difference = -(s6 + s7 + s8 + s9)

        /* Step 5. */
    fpga_modular_add(&sum0,       &difference, p);              // p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
//  fpga_modular_add(<dummy>,     <dummy>,     &sum1);          // dummy cycle, result ignored
//  fpga_modular_add(<dummy>,     <dummy>,     &difference);    // dummy cycle, result ignored
}
#endif


//------------------------------------------------------------------------------
//
// Fast modular reduction for NIST prime P-384.
//
// p = c mod p384
//
// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
// Cryptography".
//
// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
// on the other hand is the output of the parallelized Comba multiplier, so it
// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
//
// To save FPGA resources, the calculation is done using only two adders and
// one subtractor. The algorithm is split into five steps.
//
//------------------------------------------------------------------------------
#if USE_CURVE == 2
void fpga_modular_mul_helper_reduce_p384(const FPGA_WORD *c, FPGA_BUFFER *p)
{
		// "funny" words
	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

		// compose "funny" words
	 s1.words[11] = c[11],   s1.words[10] = c[10],   s1.words[ 9] = c[ 9],   s1.words[ 8] = c[ 8],   s1.words[ 7] = c[ 7],   s1.words[ 6] = c[ 6],   s1.words[ 5] = c[ 5],   s1.words[ 4] = c[ 4],   s1.words[ 3] = c[ 3],   s1.words[ 2] = c[ 2],   s1.words[ 1] = c[ 1],   s1.words[ 0] = c[ 0];
	 s2.words[11] = 0,       s2.words[10] = 0,       s2.words[ 9] = 0,       s2.words[ 8] = 0,       s2.words[ 7] = 0,       s2.words[ 6] = c[23],   s2.words[ 5] = c[22],   s2.words[ 4] = c[21],   s2.words[ 3] = 0,       s2.words[ 2] = 0,       s2.words[ 1] = 0,       s2.words[ 0] = 0;
	 s3.words[11] = c[23],   s3.words[10] = c[22],   s3.words[ 9] = c[21],   s3.words[ 8] = c[20],   s3.words[ 7] = c[19],   s3.words[ 6] = c[18],   s3.words[ 5] = c[17],   s3.words[ 4] = c[16],   s3.words[ 3] = c[15],   s3.words[ 2] = c[14],   s3.words[ 1] = c[13],   s3.words[ 0] = c[12];
	 s4.words[11] = c[20],   s4.words[10] = c[19],   s4.words[ 9] = c[18],   s4.words[ 8] = c[17],   s4.words[ 7] = c[16],   s4.words[ 6] = c[15],   s4.words[ 5] = c[14],   s4.words[ 4] = c[13],   s4.words[ 3] = c[12],   s4.words[ 2] = c[23],   s4.words[ 1] = c[22],   s4.words[ 0] = c[21];
	 s5.words[11] = c[19],   s5.words[10] = c[18],   s5.words[ 9] = c[17],   s5.words[ 8] = c[16],   s5.words[ 7] = c[15],   s5.words[ 6] = c[14],   s5.words[ 5] = c[13],   s5.words[ 4] = c[12],   s5.words[ 3] = c[20],   s5.words[ 2] = 0,       s5.words[ 1] = c[23],   s5.words[ 0] = 0;
	 s6.words[11] = 0,       s6.words[10] = 0,       s6.words[ 9] = 0,       s6.words[ 8] = 0,       s6.words[ 7] = c[23],   s6.words[ 6] = c[22],   s6.words[ 5] = c[21],   s6.words[ 4] = c[20],   s6.words[ 3] = 0,       s6.words[ 2] = 0,       s6.words[ 1] = 0,       s6.words[ 0] = 0;
	 s7.words[11] = 0,       s7.words[10] = 0,       s7.words[ 9] = 0,       s7.words[ 8] = 0,       s7.words[ 7] = 0,       s7.words[ 6] = 0,       s7.words[ 5] = c[23],   s7.words[ 4] = c[22],   s7.words[ 3] = c[21],   s7.words[ 2] = 0,       s7.words[ 1] = 0,       s7.words[ 0] = c[20];
	 s8.words[11] = c[22],   s8.words[10] = c[21],   s8.words[ 9] = c[20],   s8.words[ 8] = c[19],   s8.words[ 7] = c[18],   s8.words[ 6] = c[17],   s8.words[ 5] = c[16],   s8.words[ 4] = c[15],   s8.words[ 3] = c[14],   s8.words[ 2] = c[13],   s8.words[ 1] = c[12],   s8.words[ 0] = c[23];
	 s9.words[11] = 0,       s9.words[10] = 0,       s9.words[ 9] = 0,       s9.words[ 8] = 0,       s9.words[ 7] = 0,       s9.words[ 6] = 0,       s9.words[ 5] = 0,       s9.words[ 4] = c[23],   s9.words[ 3] = c[22],   s9.words[ 2] = c[21],   s9.words[ 1] = c[20],   s9.words[ 0] = 0;
	s10.words[11] = 0,      s10.words[10] = 0,      s10.words[ 9] = 0,      s10.words[ 8] = 0,      s10.words[ 7] = 0,      s10.words[ 6] = 0,      s10.words[ 5] = 0,      s10.words[ 4] = c[23],  s10.words[ 3] = c[23],  s10.words[ 2] = 0,      s10.words[ 1] = 0,      s10.words[ 0] = 0;

		// intermediate results
	FPGA_BUFFER sum0, sum1, difference;

		/* Step 1. */
	fpga_modular_add(&s1,         &s3,         &sum0);			// sum0 = s1 + s3
	fpga_modular_add(&s2,         &s2,         &sum1);			// sum1 = 2*s2
	fpga_modular_sub(&ECDSA_ZERO, &s8,         &difference);	// difference = -s8

		/* Step 2. */
	fpga_modular_add(&sum0,       &s4,         &sum0);			// sum0 = s1 + s3 + s4
	fpga_modular_add(&sum1,       &s5,         &sum1);			// sum1 = 2*s2 + s5
	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s8 + s9)

		/* Step 3. */
	fpga_modular_add(&sum0,       &s6,         &sum0);			// sum0 = s1 + s3 + s4 + s6
	fpga_modular_add(&sum1,       &s7,         &sum1);			// sum1 = 2*s2 + s5 + s7
	fpga_modular_sub(&difference, &s10,        &difference);	// difference = -(s8 + s9 + s10)

		/* Step 4. */
	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
	fpga_modular_sub(&difference, &ECDSA_ZERO, &difference);	// compulsory cycle to keep difference constant for next stage

		/* Step 5. */
	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
}
#endif


#if USE_CURVE == 1
//------------------------------------------------------------------------------
void fpga_modular_inv23_p256(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
//------------------------------------------------------------------------------
//
// This uses the addition chain from
//
// < https://briansmith.org/ecc-inversion-addition-chains-01 >
//
// to calculate A2 = A^-2 and A3 = A^-3.
//
//------------------------------------------------------------------------------
{
    // counter
	int cyc_cnt;

    // working variables
	FPGA_BUFFER R1, R2, X1, X2, X3, X6, X12, X15, X30, X32;

    // first obtain intermediate helper quantities (X1..X32)

    // X1
	fpga_multiword_copy(A, &X1);

    // X2
	fpga_modular_mul(&X1, &X1, &R1);
	fpga_modular_mul(&R1, &X1, &X2);

    // X3
	fpga_modular_mul(&X2, &X2, &R1);
	fpga_modular_mul(&R1, &X1, &X3);

    // X6
	fpga_multiword_copy(&X3, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X6);

    // X12
	fpga_multiword_copy(&X6, &R1);
	for (cyc_cnt=0; cyc_cnt<6; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X6, &X12);

    // X15
	fpga_multiword_copy(&X12, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X15);

    // X30
	fpga_multiword_copy(&X15, &R1);
	for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X15, &X30);

    // X32
	fpga_multiword_copy(&X30, &R1);
	for (cyc_cnt=0; cyc_cnt<2; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X2, &X32);

	// now compute the final results

	fpga_multiword_copy(&X32, &R1);
	for (cyc_cnt=0; cyc_cnt<32; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X1, &R2);

	for (cyc_cnt=0; cyc_cnt<128; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
	fpga_modular_mul(&R2, &X32, &R1);

	for (cyc_cnt=0; cyc_cnt<32; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X32, &R2);

	for (cyc_cnt=0; cyc_cnt<30; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
	fpga_modular_mul(&R2, &X30, &R1);

	fpga_modular_mul(&R1, &R1, &R2);
	fpga_modular_mul(&R2, &R2, &R1);

    // A2 obtained
	fpga_multiword_copy(&R1, A2);

    // now calculate compute inverse cubed from inverse squared
	fpga_modular_mul(&R1, &R1, &R2);
	fpga_modular_mul(&R2, A,   &R1);

    // A3 obtained
	fpga_multiword_copy(&R1, A3);
}
#endif


#if USE_CURVE == 2
//------------------------------------------------------------------------------
void fpga_modular_inv23_p384(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
//------------------------------------------------------------------------------
//
// This uses the addition chain from
//
// < https://briansmith.org/ecc-inversion-addition-chains-01 >
//
// to calculate A2 = A^-2 and A3 = A^-3.
//
//------------------------------------------------------------------------------
{
    // counter
	int cyc_cnt;

    // working variables
	FPGA_BUFFER R1, R2, X1, X2, X3, X6, X12, X15, X30, X60, X120;

    // first obtain intermediate helper quantities (X1..X120)

    // X1
	fpga_multiword_copy(A, &X1);

    // X2
	fpga_modular_mul(&X1, &X1, &R1);
	fpga_modular_mul(&R1, &X1, &X2);

    // X3
	fpga_modular_mul(&X2, &X2, &R1);
	fpga_modular_mul(&R1, &X1, &X3);

    // X6
	fpga_multiword_copy(&X3, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X6);

    // X12
	fpga_multiword_copy(&X6, &R1);
	for (cyc_cnt=0; cyc_cnt<6; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X6, &X12);

    // X15
	fpga_multiword_copy(&X12, &R1);
	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X3, &X15);

    // X30
	fpga_multiword_copy(&X15, &R1);
	for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R2, &X15, &X30);

    // X60
	fpga_multiword_copy(&X30, &R1);
	for (cyc_cnt=0; cyc_cnt<30; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X30, &X60);

    // X120
	fpga_multiword_copy(&X60, &R1);
	for (cyc_cnt=0; cyc_cnt<60; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X60, &X120);

	// now compute the final results

	fpga_multiword_copy(&X120, &R1);
	for (cyc_cnt=0; cyc_cnt<120; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
	fpga_modular_mul(&R1, &X120, &R2);

    for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
    fpga_modular_mul(&R1, &X15, &R2);

    for (cyc_cnt=0; cyc_cnt<31; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
		else			    fpga_modular_mul(&R1, &R1, &R2);
	}
    fpga_modular_mul(&R1, &X30, &R2);
    fpga_modular_mul(&R2, &R2, &R1);
    fpga_modular_mul(&R1, &R1, &R2);
    fpga_modular_mul(&R2, &X2, &R1);

    for (cyc_cnt=0; cyc_cnt<94; cyc_cnt++)
	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
		else			    fpga_modular_mul(&R2, &R2, &R1);
	}
    fpga_modular_mul(&R1, &X30, &R2);
    fpga_modular_mul(&R2, &R2, &R1);
    fpga_modular_mul(&R1, &R1, &R2);

    // A2 obtained
	fpga_multiword_copy(&R2, A2);

    // now calculate compute inverse cubed from inverse squared
	fpga_modular_mul(&R2, &R2, &R1);
	fpga_modular_mul(&R1, A,   &R2);

    // A3 obtained
	fpga_multiword_copy(&R2, A3);
}
#endif

//------------------------------------------------------------------------------
// End-of-File
//------------------------------------------------------------------------------