path: root/ecdsa_fpga_curve_abstract.cpp



//------------------------------------------------------------------------------
//
// ecdsa_fpga_curve_abstract.cpp
// ----------------------------------------------
// Elliptic curve arithmetic procedures for ECDSA
//
// Authors: Pavel Shatov
//
// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------


//------------------------------------------------------------------------------
// Headers
//------------------------------------------------------------------------------
#include "ecdsa_fpga_model.h"


//------------------------------------------------------------------------------
// Globals
//------------------------------------------------------------------------------
FPGA_BUFFER ECDSA_GX, ECDSA_GY;
FPGA_BUFFER ECDSA_HX, ECDSA_HY;
FPGA_BUFFER ECDSA_N;


//------------------------------------------------------------------------------
void fpga_curve_init()
//------------------------------------------------------------------------------
{
    int w;  // word counter

    FPGA_BUFFER tmp_gx = ECDSA_GX_INIT, tmp_gy = ECDSA_GY_INIT;
    FPGA_BUFFER tmp_hx = ECDSA_HX_INIT, tmp_hy = ECDSA_HY_INIT;
    FPGA_BUFFER tmp_n = ECDSA_N_INIT;

        /* fill buffers for large multi-word integers */
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
    {   ECDSA_GX.words[w] = tmp_gx.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_GY.words[w] = tmp_gy.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_HX.words[w] = tmp_hx.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_HY.words[w] = tmp_hy.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_N.words[w] = tmp_n.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
    }
}


//------------------------------------------------------------------------------
//
// Elliptic curve base point scalar multiplication routine.
//
// Q(qx,qy) = k * G(px,py)
//
// Note, that Q is supposed to be in affine coordinates. Multiplication is done
// using the Montgomery ladder method.
//
//------------------------------------------------------------------------------
void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
//------------------------------------------------------------------------------
{
    int word_count, bit_count;  // counters

    FPGA_BUFFER r0x, r0y, r0z;  // intermediate result
    FPGA_BUFFER r1x, r1y, r1z;  // intermediate result
    FPGA_BUFFER sx, sy, sz;     // temporary variable
	FPGA_BUFFER tx, ty, tz;     // temporary variable

        /* set initial value of R0 to point at infinity, R1 to the base point */
    fpga_multiword_copy(&ECDSA_ONE,  &r0x);
    fpga_multiword_copy(&ECDSA_ONE,  &r0y);
    fpga_multiword_copy(&ECDSA_ZERO, &r0z);

    fpga_multiword_copy(&ECDSA_GX,  &r1x);
    fpga_multiword_copy(&ECDSA_GY,  &r1y);
    fpga_multiword_copy(&ECDSA_ONE, &r1z);

        /* handy vars */
    FPGA_WORD k_word_shifted;
    bool k_bit;

        /* process bits of k left-to-right */
    for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
        for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
        {
            k_word_shifted = k->words[word_count-1] >> (bit_count-1);
			k_bit = (k_word_shifted & 1) == 1;

#ifdef DUMP_CYCLE_STATES
            dump_cycle_header(word_count, bit_count, k_bit);
#endif

                /* calculate S = R0 + R */
            fpga_curve_add_jacobian_abstract_2(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, &sx, &sy, &sz);

                /* calculate T = 2 * (R0 | R1) */
			if (!k_bit)
				fpga_curve_double_jacobian_abstract(&r0x, &r0y, &r0z, &tx, &ty, &tz);
			else
				fpga_curve_double_jacobian_abstract(&r1x, &r1y, &r1z, &tx, &ty, &tz);

                //
                // dump cycle state
                //
#ifdef DUMP_CYCLE_STATES
            dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z,
                             &sx,  &sy,  &sz,  &tx,  &ty,  &tz);
#endif

				/* now update working variables */
			if (!k_bit)
			{	fpga_multiword_copy(&tx, &r0x);
				fpga_multiword_copy(&ty, &r0y);
				fpga_multiword_copy(&tz, &r0z);

				fpga_multiword_copy(&sx, &r1x);
				fpga_multiword_copy(&sy, &r1y);
				fpga_multiword_copy(&sz, &r1z);
			}
			else
			{	fpga_multiword_copy(&tx, &r1x);
				fpga_multiword_copy(&ty, &r1y);
				fpga_multiword_copy(&tz, &r1z);

				fpga_multiword_copy(&sx, &r0x);
				fpga_multiword_copy(&sy, &r0y);
				fpga_multiword_copy(&sz, &r0z);
			}
        }

		//
		// we now need to convert the point to affine coordinates
		//
    FPGA_BUFFER a2, a3; 

#ifdef DUMP_UOP_OUTPUTS
    _DUMP_MODULAR_RESULTS = true;
#endif

	fpga_modular_inv23(&r0z, &a2, &a3);

    fpga_modular_mul(&r0x, &a2, qx);      // qx = px * (pz^-1)^2 (mod q)
    fpga_modular_mul(&r0y, &a3, qy);      // qy = py * (pz^-1)^3 (mod q)

    _DUMP_MODULAR_RESULTS = false;

        // check, that rz is non-zero (not point at infinity)
    bool rz_is_zero = fpga_multiword_is_zero(&r0z);

        // handle special case (result is point at infinity)
    if (rz_is_zero)
    {   fpga_multiword_copy(&ECDSA_ZERO, qx);
        fpga_multiword_copy(&ECDSA_ZERO, qy);
    }
}


//------------------------------------------------------------------------------
//
// Elliptic curve point doubling routine.
//
// R(rx,ry,rz) = 2 * P(px,py,pz)
//
// Note, that P(px,py,pz) is supposed to be in projective Jacobian coordinates,
// R will be in projective Jacobian coordinates.
//
// This routine implements algorithm 3.21 from "Guide to Elliptic Curve
// Cryptography", the only difference is that step 6. does T1 = T2 + T2 and
// then T2 = T2 + T1 instead of T2 = 3 * T2, because our addition is much
// faster, than multiplication.
//
// Note, that this routine also handles one special case, namely when P is at
// infinity. No actual extra "handling" is necessary, since when pz is zero,
// rz will also be zero (and that's what the "at infinity" check takes into
// account).
//
// Instead of actual modular division, multiplication by pre-computed constant
// (2^-1 mod q) is done.
//
//------------------------------------------------------------------------------
void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
                                         const FPGA_BUFFER *py,
                                         const FPGA_BUFFER *pz,
                                               FPGA_BUFFER *rx,
                                               FPGA_BUFFER *ry,
                                               FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
    FPGA_BUFFER t1, t2, t3, t4, t5; // temporary variables

#ifdef DUMP_UOP_OUTPUTS
    _DUMP_MODULAR_RESULTS = true;
#endif

    fpga_modular_mul(pz,  pz,           &t1);
    fpga_modular_sub(px,  &t1,          &t2);
    fpga_modular_add(px,  &t1,          &t3);
    fpga_modular_mul(&t3, &t2,          &t4);
    fpga_modular_add(&t4, &t4,          &t1);
    fpga_modular_add(&t1, &t4,          &t2);
    fpga_modular_add(py,  py,           ry);
    fpga_modular_mul(pz,  ry,           rz);
	fpga_modular_mul(ry,  ry,           &t1);
    fpga_modular_mul(px,  &t1,          &t3);
	fpga_modular_mul(&t1, &t1,          &t4);
	fpga_modular_mul(&t4, &ECDSA_DELTA, &t5);
	fpga_modular_mul(&t2, &t2,          &t4);
    fpga_modular_add(&t3, &t3,          &t1);
    fpga_modular_sub(&t4, &t1,          rx); 
    fpga_modular_sub(&t3, rx,           &t1);
    fpga_modular_mul(&t1, &t2,          &t3);
    fpga_modular_sub(&t3, &t5,          ry);

    _DUMP_MODULAR_RESULTS = false;
}


//------------------------------------------------------------------------------
//
// Elliptic curve point addition routine.
//
// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy,qz)
//
// Note, that P(px, py, pz) and Q(qx, qy, qz) are supposed to be in projective
// Jacobian coordinates, R(rx, ry, rz) will be in projective Jacobian
// coordinates too.
//
// This routine implements the Point Addition algorithm from
// https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates
// 
// Since the routine is means to be used with Montgomery ladder, the invariant
// R1 - R0 = G means, that the two special cases P == Q and P == -Q can never
// happen and the checks are redundant. The checks for P === O and Q == O are
// necessary, however. Note, that P and Q can't be at infinity at the same time
// though.
//
// WARNING: This procedure does not take any active measures to keep run-time
// constant. The main purpose of this model is to help debug Verilog code for
// FPGA, so *DO NOT* use is anywhere near production!
//
//------------------------------------------------------------------------------
void fpga_curve_add_jacobian_abstract_2(const FPGA_BUFFER *px,
                                        const FPGA_BUFFER *py,
                                        const FPGA_BUFFER *pz,
                                        const FPGA_BUFFER *qx,
                                        const FPGA_BUFFER *qy,
                                        const FPGA_BUFFER *qz,
                                              FPGA_BUFFER *rx,
                                              FPGA_BUFFER *ry,
                                              FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
	bool pz_is_zero = fpga_multiword_is_zero(pz);
	bool qz_is_zero = fpga_multiword_is_zero(qz);

	FPGA_BUFFER t1, t2, t3, t4, t5, t6, t7, t8;

#ifdef DUMP_UOP_OUTPUTS
    _DUMP_MODULAR_RESULTS = true;
#endif

	fpga_modular_mul(pz, pz, &t1);		// pz2 = pz * pz (pz squared)
	fpga_modular_mul(qz, qz, &t2);		// qz2 = qz * qz (qz squared)

	fpga_modular_mul(pz, &t1, &t3);		// pz3 = pz * pz2 (pz cubed)
	fpga_modular_mul(qz, &t2, &t4);     // qz3 = qz * qz2 (qz cubed)
    
	fpga_modular_mul(px, &t2, &t5);	    // pxz = px * qz2 (px z-adjusted)
	fpga_modular_mul(qx, &t1, &t2);     // qxz = qx * pz2 (qx z-adjusted)

	fpga_modular_mul(py, &t4, &t6);		// pyz = py * qz3 (py z-adjusted)
	fpga_modular_mul(qy, &t3, &t4);		// qyz = qy * pz3 (qy z-adjusted)

	fpga_modular_sub(&t2, &t5, &t7);	// dqpx = qxz - pxz (x-coordinate delta)
	fpga_modular_sub(&t4, &t6, &t8);	// dqpy = qyz - pyz (y-coordinate delta) 
 
	fpga_modular_mul(pz,  qz,  &t1);	// pqz = pz * qz
	fpga_modular_mul(&t7, &t1, rz);		// rz = pqz * qdpx
    
	fpga_modular_mul(&t8, &t8, &t2);	// dqpy2 = dqpy * dqpy
	fpga_modular_mul(&t7, &t7, &t3);	// dqpx2 = dqpx * dqpx
	fpga_modular_mul(&t7, &t3, &t4);	// dqpx3 = dqpx * dqpx2
    
	fpga_modular_sub(&t2, &t4, &t1);	// t1 = dqpy2 - dqpx3
	fpga_modular_mul(&t5, &t3, &t2);	// t2 = pxz * dqpx2
	fpga_modular_add(&t2, &t2, &t3);	// t3 = 2 * t2 (= t2 + t2, which is faster)
	fpga_modular_sub(&t1, &t3, rx);		// rx = t1 - t3
    
	fpga_modular_sub(&t2, rx,  &t1);	// t1 = t2 - rx
    fpga_modular_mul(&t1, &t8, &t2);	// t2 = t1 * dqpy
	fpga_modular_mul(&t6, &t4, &t3);	// t3 = pyz * dqpx3
	fpga_modular_sub(&t2, &t3, ry);		// ry = t2 - t3
    
    _DUMP_MODULAR_RESULTS = false;

	// P == O
    if (pz_is_zero)
    {   fpga_multiword_copy(qx, rx);
        fpga_multiword_copy(qy, ry);
        fpga_multiword_copy(qz, rz);
		return;
    }

	// Q == O
    if (qz_is_zero)
    {   fpga_multiword_copy(px, rx);
        fpga_multiword_copy(py, ry);
        fpga_multiword_copy(pz, rz);
		return;
    }
}


//------------------------------------------------------------------------------
// End-of-File
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// ecdsa_fpga_curve_abstract.cpp
// ----------------------------------------------
// Elliptic curve arithmetic procedures for ECDSA
//
// Authors: Pavel Shatov
//
// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// - Neither the name of the NORDUnet nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------


//------------------------------------------------------------------------------
// Headers
//------------------------------------------------------------------------------
#include "ecdsa_fpga_model.h"


//------------------------------------------------------------------------------
// Globals
//------------------------------------------------------------------------------
FPGA_BUFFER ECDSA_GX, ECDSA_GY;
FPGA_BUFFER ECDSA_HX, ECDSA_HY;
FPGA_BUFFER ECDSA_N;


//------------------------------------------------------------------------------
void fpga_curve_init()
//------------------------------------------------------------------------------
{
    int w;  // word counter

    FPGA_BUFFER tmp_gx = ECDSA_GX_INIT, tmp_gy = ECDSA_GY_INIT;
    FPGA_BUFFER tmp_hx = ECDSA_HX_INIT, tmp_hy = ECDSA_HY_INIT;
    FPGA_BUFFER tmp_n = ECDSA_N_INIT;

        /* fill buffers for large multi-word integers */
    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
    {   ECDSA_GX.words[w] = tmp_gx.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_GY.words[w] = tmp_gy.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_HX.words[w] = tmp_hx.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_HY.words[w] = tmp_hy.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
        ECDSA_N.words[w] = tmp_n.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
    }
}


//------------------------------------------------------------------------------
//
// Elliptic curve base point scalar multiplication routine.
//
// Q(qx,qy) = k * G(px,py)
//
// Note, that Q is supposed to be in affine coordinates. Multiplication is done
// using the Montgomery ladder method.
//
//------------------------------------------------------------------------------
void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
//------------------------------------------------------------------------------
{
    int word_count, bit_count;  // counters

    FPGA_BUFFER r0x, r0y, r0z;  // intermediate result
    FPGA_BUFFER r1x, r1y, r1z;  // intermediate result
    FPGA_BUFFER sx, sy, sz;     // temporary variable
	FPGA_BUFFER tx, ty, tz;     // temporary variable

        /* set initial value of R0 to point at infinity, R1 to the base point */
    fpga_multiword_copy(&ECDSA_ONE,  &r0x);
    fpga_multiword_copy(&ECDSA_ONE,  &r0y);
    fpga_multiword_copy(&ECDSA_ZERO, &r0z);

    fpga_multiword_copy(&ECDSA_GX,  &r1x);
    fpga_multiword_copy(&ECDSA_GY,  &r1y);
    fpga_multiword_copy(&ECDSA_ONE, &r1z);

        /* handy vars */
    FPGA_WORD k_word_shifted;
    bool k_bit;

        /* process bits of k left-to-right */
    for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
        for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
        {
            k_word_shifted = k->words[word_count-1] >> (bit_count-1);
			k_bit = (k_word_shifted & 1) == 1;

#ifdef DUMP_CYCLE_STATES
            dump_cycle_header(word_count, bit_count, k_bit);
#endif

                /* calculate S = R0 + R */
            fpga_curve_add_jacobian_abstract_2(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, &sx, &sy, &sz);

                /* calculate T = 2 * (R0 | R1) */
			if (!k_bit)
				fpga_curve_double_jacobian_abstract(&r0x, &r0y, &r0z, &tx, &ty, &tz);
			else
				fpga_curve_double_jacobian_abstract(&r1x, &r1y, &r1z, &tx, &ty, &tz);

                //
                // dump cycle state
                //
#ifdef DUMP_CYCLE_STATES
            dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z,
                             &sx,  &sy,  &sz,  &tx,  &ty,  &tz);
#endif

				/* now update working variables */
			if (!k_bit)
			{	fpga_multiword_copy(&tx, &r0x);
				fpga_multiword_copy(&ty, &r0y);
				fpga_multiword_copy(&tz, &r0z);

				fpga_multiword_copy(&sx, &r1x);
				fpga_multiword_copy(&sy, &r1y);
				fpga_multiword_copy(&sz, &r1z);
			}
			else
			{	fpga_multiword_copy(&tx, &r1x);
				fpga_multiword_copy(&ty, &r1y);
				fpga_multiword_copy(&tz, &r1z);

				fpga_multiword_copy(&sx, &r0x);
				fpga_multiword_copy(&sy, &r0y);
				fpga_multiword_copy(&sz, &r0z);
			}
        }

		//
		// we now need to convert the point to affine coordinates
		//
    FPGA_BUFFER a2, a3; 

#ifdef DUMP_UOP_OUTPUTS
    _DUMP_MODULAR_RESULTS = true;
#endif

	fpga_modular_inv23(&r0z, &a2, &a3);

    fpga_modular_mul(&r0x, &a2, qx);      // qx = px * (pz^-1)^2 (mod q)
    fpga_modular_mul(&r0y, &a3, qy);      // qy = py * (pz^-1)^3 (mod q)

    _DUMP_MODULAR_RESULTS = false;

        // check, that rz is non-zero (not point at infinity)
    bool rz_is_zero = fpga_multiword_is_zero(&r0z);

        // handle special case (result is point at infinity)
    if (rz_is_zero)
    {   fpga_multiword_copy(&ECDSA_ZERO, qx);
        fpga_multiword_copy(&ECDSA_ZERO, qy);
    }
}


//------------------------------------------------------------------------------
//
// Elliptic curve point doubling routine.
//
// R(rx,ry,rz) = 2 * P(px,py,pz)
//
// Note, that P(px,py,pz) is supposed to be in projective Jacobian coordinates,
// R will be in projective Jacobian coordinates.
//
// This routine implements algorithm 3.21 from "Guide to Elliptic Curve
// Cryptography", the only difference is that step 6. does T1 = T2 + T2 and
// then T2 = T2 + T1 instead of T2 = 3 * T2, because our addition is much
// faster, than multiplication.
//
// Note, that this routine also handles one special case, namely when P is at
// infinity. No actual extra "handling" is necessary, since when pz is zero,
// rz will also be zero (and that's what the "at infinity" check takes into
// account).
//
// Instead of actual modular division, multiplication by pre-computed constant
// (2^-1 mod q) is done.
//
//------------------------------------------------------------------------------
void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
                                         const FPGA_BUFFER *py,
                                         const FPGA_BUFFER *pz,
                                               FPGA_BUFFER *rx,
                                               FPGA_BUFFER *ry,
                                               FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
    FPGA_BUFFER t1, t2, t3, t4, t5; // temporary variables

#ifdef DUMP_UOP_OUTPUTS
    _DUMP_MODULAR_RESULTS = true;
#endif

    fpga_modular_mul(pz,  pz,           &t1);
    fpga_modular_sub(px,  &t1,          &t2);
    fpga_modular_add(px,  &t1,          &t3);
    fpga_modular_mul(&t3, &t2,          &t4);
    fpga_modular_add(&t4, &t4,          &t1);
    fpga_modular_add(&t1, &t4,          &t2);
    fpga_modular_add(py,  py,           ry);
    fpga_modular_mul(pz,  ry,           rz);
	fpga_modular_mul(ry,  ry,           &t1);
    fpga_modular_mul(px,  &t1,          &t3);
	fpga_modular_mul(&t1, &t1,          &t4);
	fpga_modular_mul(&t4, &ECDSA_DELTA, &t5);
	fpga_modular_mul(&t2, &t2,          &t4);
    fpga_modular_add(&t3, &t3,          &t1);
    fpga_modular_sub(&t4, &t1,          rx); 
    fpga_modular_sub(&t3, rx,           &t1);
    fpga_modular_mul(&t1, &t2,          &t3);
    fpga_modular_sub(&t3, &t5,          ry);

    _DUMP_MODULAR_RESULTS = false;
}


//------------------------------------------------------------------------------
//
// Elliptic curve point addition routine.
//
// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy,qz)
//
// Note, that P(px, py, pz) and Q(qx, qy, qz) are supposed to be in projective
// Jacobian coordinates, R(rx, ry, rz) will be in projective Jacobian
// coordinates too.
//
// This routine implements the Point Addition algorithm from
// https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates
// 
// Since the routine is means to be used with Montgomery ladder, the invariant
// R1 - R0 = G means, that the two special cases P == Q and P == -Q can never
// happen and the checks are redundant. The checks for P === O and Q == O are
// necessary, however. Note, that P and Q can't be at infinity at the same time
// though.
//
// WARNING: This procedure does not take any active measures to keep run-time
// constant. The main purpose of this model is to help debug Verilog code for
// FPGA, so *DO NOT* use is anywhere near production!
//
//------------------------------------------------------------------------------
void fpga_curve_add_jacobian_abstract_2(const FPGA_BUFFER *px,
                                        const FPGA_BUFFER *py,
                                        const FPGA_BUFFER *pz,
                                        const FPGA_BUFFER *qx,
                                        const FPGA_BUFFER *qy,
                                        const FPGA_BUFFER *qz,
                                              FPGA_BUFFER *rx,
                                              FPGA_BUFFER *ry,
                                              FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
	bool pz_is_zero = fpga_multiword_is_zero(pz);
	bool qz_is_zero = fpga_multiword_is_zero(qz);

	FPGA_BUFFER t1, t2, t3, t4, t5, t6, t7, t8;

#ifdef DUMP_UOP_OUTPUTS
    _DUMP_MODULAR_RESULTS = true;
#endif

	fpga_modular_mul(pz, pz, &t1);		// pz2 = pz * pz (pz squared)
	fpga_modular_mul(qz, qz, &t2);		// qz2 = qz * qz (qz squared)

	fpga_modular_mul(pz, &t1, &t3);		// pz3 = pz * pz2 (pz cubed)
	fpga_modular_mul(qz, &t2, &t4);     // qz3 = qz * qz2 (qz cubed)
    
	fpga_modular_mul(px, &t2, &t5);	    // pxz = px * qz2 (px z-adjusted)
	fpga_modular_mul(qx, &t1, &t2);     // qxz = qx * pz2 (qx z-adjusted)

	fpga_modular_mul(py, &t4, &t6);		// pyz = py * qz3 (py z-adjusted)
	fpga_modular_mul(qy, &t3, &t4);		// qyz = qy * pz3 (qy z-adjusted)

	fpga_modular_sub(&t2, &t5, &t7);	// dqpx = qxz - pxz (x-coordinate delta)
	fpga_modular_sub(&t4, &t6, &t8);	// dqpy = qyz - pyz (y-coordinate delta) 
 
	fpga_modular_mul(pz,  qz,  &t1);	// pqz = pz * qz
	fpga_modular_mul(&t7, &t1, rz);		// rz = pqz * qdpx
    
	fpga_modular_mul(&t8, &t8, &t2);	// dqpy2 = dqpy * dqpy
	fpga_modular_mul(&t7, &t7, &t3);	// dqpx2 = dqpx * dqpx
	fpga_modular_mul(&t7, &t3, &t4);	// dqpx3 = dqpx * dqpx2
    
	fpga_modular_sub(&t2, &t4, &t1);	// t1 = dqpy2 - dqpx3
	fpga_modular_mul(&t5, &t3, &t2);	// t2 = pxz * dqpx2
	fpga_modular_add(&t2, &t2, &t3);	// t3 = 2 * t2 (= t2 + t2, which is faster)
	fpga_modular_sub(&t1, &t3, rx);		// rx = t1 - t3
    
	fpga_modular_sub(&t2, rx,  &t1);	// t1 = t2 - rx
    fpga_modular_mul(&t1, &t8, &t2);	// t2 = t1 * dqpy
	fpga_modular_mul(&t6, &t4, &t3);	// t3 = pyz * dqpx3
	fpga_modular_sub(&t2, &t3, ry);		// ry = t2 - t3
    
    _DUMP_MODULAR_RESULTS = false;

	// P == O
    if (pz_is_zero)
    {   fpga_multiword_copy(qx, rx);
        fpga_multiword_copy(qy, ry);
        fpga_multiword_copy(qz, rz);
		return;
    }

	// Q == O
    if (qz_is_zero)
    {   fpga_multiword_copy(px, rx);
        fpga_multiword_copy(py, ry);
        fpga_multiword_copy(pz, rz);
		return;
    }
}


//------------------------------------------------------------------------------
// End-of-File
//------------------------------------------------------------------------------