aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ecdsa_fpga_curve.h70
-rw-r--r--ecdsa_fpga_curve_abstract.cpp326
-rw-r--r--ecdsa_fpga_curve_microcode.cpp547
3 files changed, 488 insertions, 455 deletions
diff --git a/ecdsa_fpga_curve.h b/ecdsa_fpga_curve.h
index 00448eb..e9f2fe6 100644
--- a/ecdsa_fpga_curve.h
+++ b/ecdsa_fpga_curve.h
@@ -6,7 +6,7 @@
//
// Authors: Pavel Shatov
//
-// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
@@ -139,14 +139,33 @@ extern FPGA_BUFFER ECDSA_N;
#ifdef USE_MICROCODE
#define fpga_curve_base_scalar_multiply fpga_curve_base_scalar_multiply_microcode
-#define fpga_curve_add_jacobian fpga_curve_add_jacobian_microcode_wrapper
-#define fpga_curve_double_jacobian fpga_curve_double_jacobian_microcode_wrapper
+
+#define fpga_curve_add_jacobian_2_shim fpga_curve_add_jacobian_microcode_2_wrapper
+#define fpga_curve_double_jacobian_shim fpga_curve_double_jacobian_microcode_wrapper
+
+void fpga_curve_double_jacobian_microcode_wrapper (const FPGA_BUFFER *px,
+ const FPGA_BUFFER *py,
+ const FPGA_BUFFER *pz,
+ FPGA_BUFFER *rx,
+ FPGA_BUFFER *ry,
+ FPGA_BUFFER *rz);
+
+void fpga_curve_add_jacobian_microcode_2_wrapper(const FPGA_BUFFER *px,
+ const FPGA_BUFFER *py,
+ const FPGA_BUFFER *pz,
+ const FPGA_BUFFER *qx,
+ const FPGA_BUFFER *qy,
+ const FPGA_BUFFER *qz,
+ FPGA_BUFFER *rx,
+ FPGA_BUFFER *ry,
+ FPGA_BUFFER *rz);
#else
#define fpga_curve_base_scalar_multiply fpga_curve_base_scalar_multiply_abstract
-#define fpga_curve_add_jacobian fpga_curve_add_jacobian_abstract
-#define fpga_curve_double_jacobian fpga_curve_double_jacobian_abstract
+
+#define fpga_curve_add_jacobian_2_shim fpga_curve_add_jacobian_abstract_2
+#define fpga_curve_double_jacobian_shim fpga_curve_double_jacobian_abstract
#endif
@@ -156,20 +175,23 @@ extern FPGA_BUFFER ECDSA_N;
//------------------------------------------------------------------------------
void fpga_curve_init ();
-void fpga_curve_base_scalar_multiply_abstract (const FPGA_BUFFER *k,
- FPGA_BUFFER *qx,
- FPGA_BUFFER *qy);
+void fpga_curve_base_scalar_multiply_abstract (const FPGA_BUFFER *k,
+ FPGA_BUFFER *qx,
+ FPGA_BUFFER *qy);
void fpga_curve_base_scalar_multiply_microcode (const FPGA_BUFFER *k,
FPGA_BUFFER *qx,
FPGA_BUFFER *qy);
-void fpga_curve_add_jacobian_abstract (const FPGA_BUFFER *px,
- const FPGA_BUFFER *py,
- const FPGA_BUFFER *pz,
- FPGA_BUFFER *rx,
- FPGA_BUFFER *ry,
- FPGA_BUFFER *rz);
+void fpga_curve_add_jacobian_abstract_2 (const FPGA_BUFFER *px,
+ const FPGA_BUFFER *py,
+ const FPGA_BUFFER *pz,
+ const FPGA_BUFFER *qx,
+ const FPGA_BUFFER *qy,
+ const FPGA_BUFFER *qz,
+ FPGA_BUFFER *rx,
+ FPGA_BUFFER *ry,
+ FPGA_BUFFER *rz);
void fpga_curve_double_jacobian_abstract (const FPGA_BUFFER *px,
const FPGA_BUFFER *py,
@@ -178,24 +200,10 @@ void fpga_curve_double_jacobian_abstract (const FPGA_BUFFER *px,
FPGA_BUFFER *ry,
FPGA_BUFFER *rz);
-void fpga_curve_add_jacobian_microcode ();
-
-void fpga_curve_double_jacobian_microcode ();
-
-void fpga_curve_add_jacobian_microcode_wrapper (const FPGA_BUFFER *px,
- const FPGA_BUFFER *py,
- const FPGA_BUFFER *pz,
- FPGA_BUFFER *rx,
- FPGA_BUFFER *ry,
- FPGA_BUFFER *rz);
+void fpga_curve_add_jacobian_microcode_2 ();
-
-void fpga_curve_double_jacobian_microcode_wrapper (const FPGA_BUFFER *px,
- const FPGA_BUFFER *py,
- const FPGA_BUFFER *pz,
- FPGA_BUFFER *rx,
- FPGA_BUFFER *ry,
- FPGA_BUFFER *rz);
+void fpga_curve_double_jacobian_microcode_r0 ();
+void fpga_curve_double_jacobian_microcode_r1 ();
//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_curve_abstract.cpp b/ecdsa_fpga_curve_abstract.cpp
index 5510ac1..2d25cfc 100644
--- a/ecdsa_fpga_curve_abstract.cpp
+++ b/ecdsa_fpga_curve_abstract.cpp
@@ -6,7 +6,7 @@
//
// Authors: Pavel Shatov
//
-// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
@@ -79,14 +79,7 @@ void fpga_curve_init()
// Q(qx,qy) = k * G(px,py)
//
// Note, that Q is supposed to be in affine coordinates. Multiplication is done
-// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve
-// Cryptography".
-//
-// WARNING: Though this procedure always does the addition step, it only
-// updates the result when current bit of k is set. It does not take any
-// active measures to keep run-time constant. The main purpose of this model
-// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near
-// production!
+// using the Montgomery ladder method.
//
//------------------------------------------------------------------------------
void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
@@ -94,42 +87,91 @@ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER
{
int word_count, bit_count; // counters
- FPGA_BUFFER rx, ry, rz; // intermediate result
- FPGA_BUFFER tx, ty, tz; // temporary variable
+ FPGA_BUFFER r0x, r0y, r0z; // intermediate result
+ FPGA_BUFFER r1x, r1y, r1z; // intermediate result
+ FPGA_BUFFER sx, sy, sz; // temporary variable
+ FPGA_BUFFER tx, ty, tz; // temporary variable
+
+ /* set initial value of R0 to point at infinity, R1 to the base point */
+ fpga_multiword_copy(&ECDSA_ONE, &r0x);
+ fpga_multiword_copy(&ECDSA_ONE, &r0y);
+ fpga_multiword_copy(&ECDSA_ZERO, &r0z);
- /* set initial value of R to point at infinity */
- fpga_multiword_copy(&ECDSA_ONE, &rx);
- fpga_multiword_copy(&ECDSA_ONE, &ry);
- fpga_multiword_copy(&ECDSA_ZERO, &rz);
+ fpga_multiword_copy(&ECDSA_GX, &r1x);
+ fpga_multiword_copy(&ECDSA_GY, &r1y);
+ fpga_multiword_copy(&ECDSA_ONE, &r1z);
+
+ /* handy vars */
+ FPGA_WORD k_word_shifted;
+ bool k_bit;
/* process bits of k left-to-right */
for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
{
- /* calculate T = 2 * R */
- fpga_curve_double_jacobian_abstract(&rx, &ry, &rz, &tx, &ty, &tz);
-
- /* always calculate R = T + P for constant-time */
- fpga_curve_add_jacobian_abstract(&tx, &ty, &tz, &rx, &ry, &rz);
-
- /* revert to the value of T before addition if the current bit of k is not set */
- if (!((k->words[word_count-1] >> (bit_count-1)) & 1))
- { fpga_multiword_copy(&tx, &rx);
- fpga_multiword_copy(&ty, &ry);
- fpga_multiword_copy(&tz, &rz);
- }
-
+ k_word_shifted = k->words[word_count-1] >> (bit_count-1);
+ k_bit = (k_word_shifted & 1) == 1;
+
+#ifdef DUMP_CYCLE_STATES
+ dump_cycle_header(word_count, bit_count, k_bit);
+#endif
+
+ /* calculate S = R0 + R1 */
+ fpga_curve_add_jacobian_abstract_2(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, &sx, &sy, &sz);
+
+ /* calculate T = 2 * (R0 | R1) */
+ if (!k_bit)
+ fpga_curve_double_jacobian_abstract(&r0x, &r0y, &r0z, &tx, &ty, &tz);
+ else
+ fpga_curve_double_jacobian_abstract(&r1x, &r1y, &r1z, &tx, &ty, &tz);
+
+ //
+ // dump cycle state
+ //
+#ifdef DUMP_CYCLE_STATES
+ dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z,
+ &sx, &sy, &sz, &tx, &ty, &tz);
+#endif
+
+ /* now update working variables */
+ if (!k_bit)
+ { fpga_multiword_copy(&tx, &r0x);
+ fpga_multiword_copy(&ty, &r0y);
+ fpga_multiword_copy(&tz, &r0z);
+
+ fpga_multiword_copy(&sx, &r1x);
+ fpga_multiword_copy(&sy, &r1y);
+ fpga_multiword_copy(&sz, &r1z);
+ }
+ else
+ { fpga_multiword_copy(&tx, &r1x);
+ fpga_multiword_copy(&ty, &r1y);
+ fpga_multiword_copy(&tz, &r1z);
+
+ fpga_multiword_copy(&sx, &r0x);
+ fpga_multiword_copy(&sy, &r0y);
+ fpga_multiword_copy(&sz, &r0z);
+ }
}
+ //
+ // we now need to convert the point to affine coordinates
+ //
FPGA_BUFFER a2, a3;
- fpga_modular_inv23(&rz, &a2, &a3);
+#ifdef DUMP_UOP_OUTPUTS
+ _DUMP_MODULAR_RESULTS = true;
+#endif
+
+ fpga_modular_inv23(&r0z, &a2, &a3);
+
+ fpga_modular_mul(&r0x, &a2, qx); // qx = px * (pz^-1)^2 (mod q)
+ fpga_modular_mul(&r0y, &a3, qy); // qy = py * (pz^-1)^3 (mod q)
- fpga_modular_mul(&rx, &a2, qx); // qx = px * (pz^-1)^2 (mod q)
- fpga_modular_mul(&ry, &a3, qy); // qy = py * (pz^-1)^3 (mod q)
+ _DUMP_MODULAR_RESULTS = false;
// check, that rz is non-zero (not point at infinity)
- bool rz_is_zero = fpga_multiword_is_zero(&rz);
+ bool rz_is_zero = fpga_multiword_is_zero(&r0z);
// handle special case (result is point at infinity)
if (rz_is_zero)
@@ -154,21 +196,13 @@ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER
// faster, than multiplication.
//
// Note, that this routine also handles one special case, namely when P is at
-// infinity.
+// infinity. No actual extra "handling" is necessary, since when pz is zero,
+// rz will also be zero (and that's what the "at infinity" check takes into
+// account).
//
// Instead of actual modular division, multiplication by pre-computed constant
// (2^-1 mod q) is done.
//
-// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
-// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
-// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
-// then fpga_modular_mul(pz, &t1, &t1) instead.
-//
-// WARNING: Though this procedure always does doubling steps, it does not take
-// any active measures to keep run-time constant. The main purpose of this
-// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere
-// near production!
-//
//------------------------------------------------------------------------------
void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
const FPGA_BUFFER *py,
@@ -178,41 +212,32 @@ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
- FPGA_BUFFER t1, t2, t3; // temporary variables
-
- // check, whether P is at infinity
- bool pz_is_zero = fpga_multiword_is_zero(pz);
-
- /* 2. */ fpga_multiword_copy(pz, &t1);
- fpga_modular_mul(pz, &t1, &t1);
- /* 3. */ fpga_modular_sub(px, &t1, &t2);
- /* 4. */ fpga_modular_add(px, &t1, &t1);
- /* 5. */ fpga_modular_mul(&t1, &t2, &t2);
- /* 6. */ fpga_modular_add(&t2, &t2, &t1);
- /* */ fpga_modular_add(&t1, &t2, &t2);
- /* 7. */ fpga_modular_add(py, py, ry);
- /* 8. */ fpga_modular_mul(pz, ry, rz);
- /* 9. */ fpga_multiword_copy(ry, &t1);
- fpga_multiword_copy(ry, &t3);
- fpga_modular_mul(&t1, &t3, ry);
- /* 10. */ fpga_modular_mul(px, ry, &t3);
- /* 11. */ fpga_multiword_copy(ry, &t1);
- fpga_modular_mul(ry, &t1, &t1);
- /* 12. */ fpga_modular_mul(&t1, &ECDSA_DELTA, ry);
- /* 13. */ fpga_multiword_copy(&t2, &t1);
- fpga_modular_mul(&t1, &t2, rx);
- /* 14. */ fpga_modular_add(&t3, &t3, &t1);
- /* 15. */ fpga_modular_sub(rx, &t1, rx);
- /* 16. */ fpga_modular_sub(&t3, rx, &t1);
- /* 17. */ fpga_modular_mul(&t1, &t2, &t1);
- /* 18. */ fpga_modular_sub(&t1, ry, ry);
-
- // handle special case (input point is at infinity)
- if (pz_is_zero)
- { fpga_multiword_copy(&ECDSA_ONE, rx);
- fpga_multiword_copy(&ECDSA_ONE, ry);
- fpga_multiword_copy(&ECDSA_ZERO, rz);
- }
+ FPGA_BUFFER t1, t2, t3, t4, t5; // temporary variables
+
+#ifdef DUMP_UOP_OUTPUTS
+ _DUMP_MODULAR_RESULTS = true;
+#endif
+
+ fpga_modular_mul(pz, pz, &t1);
+ fpga_modular_sub(px, &t1, &t2);
+ fpga_modular_add(px, &t1, &t3);
+ fpga_modular_mul(&t3, &t2, &t4);
+ fpga_modular_add(&t4, &t4, &t1);
+ fpga_modular_add(&t1, &t4, &t2);
+ fpga_modular_add(py, py, ry);
+ fpga_modular_mul(pz, ry, rz);
+ fpga_modular_mul(ry, ry, &t1);
+ fpga_modular_mul(px, &t1, &t3);
+ fpga_modular_mul(&t1, &t1, &t4);
+ fpga_modular_mul(&t4, &ECDSA_DELTA, &t5);
+ fpga_modular_mul(&t2, &t2, &t4);
+ fpga_modular_add(&t3, &t3, &t1);
+ fpga_modular_sub(&t4, &t1, rx);
+ fpga_modular_sub(&t3, rx, &t1);
+ fpga_modular_mul(&t1, &t2, &t3);
+ fpga_modular_sub(&t3, &t5, ry);
+
+ _DUMP_MODULAR_RESULTS = false;
}
@@ -220,89 +245,94 @@ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
//
// Elliptic curve point addition routine.
//
-// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy)
+// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy,qz)
//
-// Note, that P(px, py, pz) is supposed to be in projective Jacobian
-// coordinates, while Q(qx,qy) is supposed to be in affine coordinates,
-// R(rx, ry, rz) will be in projective Jacobian coordinates. Moreover, in this
-// particular implementation Q is always the base point G.
+// Note, that P(px, py, pz) and Q(qx, qy, qz) are supposed to be in projective
+// Jacobian coordinates, R(rx, ry, rz) will be in projective Jacobian
+// coordinates too.
//
-// This routine implements algorithm 3.22 from "Guide to Elliptic Curve
-// Cryptography". Differences from the original algorithm:
+// This routine implements the Point Addition algorithm from
+// https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates
//
-// 1) Step 1. is omitted, because point Q is always the base point, which is
-// not at infinity by definition.
-//
-// 2) Step 9.1 just returns the pre-computed double of the base point instead
-// of actually doubling it.
-//
-// Note, that this routine also handles three special cases:
-//
-// 1) P is at infinity
-// 2) P == Q
-// 3) P == -Q
-//
-// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
-// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
-// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
-// then fpga_modular_mul(pz, &t1, &t1) instead.
+// Since the routine is meant to be used with the Montgomery ladder, the
+// invariant R1 - R0 = G means that the two special cases P == Q and P == -Q
+// can never happen, so the checks for them are redundant. The checks for
+// P == O and Q == O are necessary, however. Note that P and Q can't be at
+// infinity at the same time, though.
//
// WARNING: This procedure does not take any active measures to keep run-time
// constant. The main purpose of this model is to help debug Verilog code for
// FPGA, so *DO NOT* use is anywhere near production!
//
//------------------------------------------------------------------------------
-void fpga_curve_add_jacobian_abstract(const FPGA_BUFFER *px,
- const FPGA_BUFFER *py,
- const FPGA_BUFFER *pz,
- FPGA_BUFFER *rx,
- FPGA_BUFFER *ry,
- FPGA_BUFFER *rz)
+void fpga_curve_add_jacobian_abstract_2(const FPGA_BUFFER *px,
+ const FPGA_BUFFER *py,
+ const FPGA_BUFFER *pz,
+ const FPGA_BUFFER *qx,
+ const FPGA_BUFFER *qy,
+ const FPGA_BUFFER *qz,
+ FPGA_BUFFER *rx,
+ FPGA_BUFFER *ry,
+ FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
- FPGA_BUFFER t1, t2, t3, t4; // temporary variables
+ bool pz_is_zero = fpga_multiword_is_zero(pz);
+ bool qz_is_zero = fpga_multiword_is_zero(qz);
+
+ FPGA_BUFFER t1, t2, t3, t4, t5, t6, t7, t8;
+
+#ifdef DUMP_UOP_OUTPUTS
+ _DUMP_MODULAR_RESULTS = true;
+#endif
+
+ fpga_modular_mul(pz, pz, &t1); // pz2 = pz * pz (pz squared)
+ fpga_modular_mul(qz, qz, &t2); // qz2 = qz * qz (qz squared)
+
+ fpga_modular_mul(pz, &t1, &t3); // pz3 = pz * pz2 (pz cubed)
+ fpga_modular_mul(qz, &t2, &t4); // qz3 = qz * qz2 (qz cubed)
+
+ fpga_modular_mul(px, &t2, &t5); // pxz = px * qz2 (px z-adjusted)
+ fpga_modular_mul(qx, &t1, &t2); // qxz = qx * pz2 (qx z-adjusted)
+
+ fpga_modular_mul(py, &t4, &t6); // pyz = py * qz3 (py z-adjusted)
+ fpga_modular_mul(qy, &t3, &t4); // qyz = qy * pz3 (qy z-adjusted)
+
+ fpga_modular_sub(&t2, &t5, &t7); // dqpx = qxz - pxz (x-coordinate delta)
+ fpga_modular_sub(&t4, &t6, &t8); // dqpy = qyz - pyz (y-coordinate delta)
+
+ fpga_modular_mul(pz, qz, &t1); // pqz = pz * qz
+ fpga_modular_mul(&t7, &t1, rz); // rz = pqz * qdpx
+
+ fpga_modular_mul(&t8, &t8, &t2); // dqpy2 = dqpy * dqpy
+ fpga_modular_mul(&t7, &t7, &t3); // dqpx2 = dqpx * dqpx
+ fpga_modular_mul(&t7, &t3, &t4); // dqpx3 = dqpx * dqpx2
+
+ fpga_modular_sub(&t2, &t4, &t1); // t1 = dqpy2 - dqpx3
+ fpga_modular_mul(&t5, &t3, &t2); // t2 = pxz * dqpx2
+ fpga_modular_add(&t2, &t2, &t3); // t3 = 2 * t2 (= t2 + t2, which is faster)
+ fpga_modular_sub(&t1, &t3, rx); // rx = t1 - t3
+
+ fpga_modular_sub(&t2, rx, &t1); // t1 = t2 - rx
+ fpga_modular_mul(&t1, &t8, &t2); // t2 = t1 * dqpy
+ fpga_modular_mul(&t6, &t4, &t3); // t3 = pyz * dqpx3
+ fpga_modular_sub(&t2, &t3, ry); // ry = t2 - t3
- bool pz_is_zero = fpga_multiword_is_zero(pz); // Step 2.
-
- /* 3. */ fpga_multiword_copy(pz, &t1);
- fpga_modular_mul(pz, &t1, &t1);
- /* 4. */ fpga_modular_mul(pz, &t1, &t2);
- /* 5. */ fpga_modular_mul(&t1, &ECDSA_GX, &t1);
- /* 6. */ fpga_modular_mul(&t2, &ECDSA_GY, &t2);
- /* 7. */ fpga_modular_sub(&t1, px, &t1);
- /* 8. */ fpga_modular_sub(&t2, py, &t2);
-
- bool t1_is_zero = fpga_multiword_is_zero(&t1); // | Step 9.
- bool t2_is_zero = fpga_multiword_is_zero(&t2); // |
-
- /* 10. */ fpga_modular_mul(pz, &t1, rz);
- /* 11. */ fpga_multiword_copy(&t1, &t3);
- fpga_modular_mul(&t1, &t3, &t3);
- /* 12. */ fpga_modular_mul(&t1, &t3, &t4);
- /* 13. */ fpga_modular_mul(px, &t3, &t3);
- /* 14. */ fpga_modular_add(&t3, &t3, &t1);
- /* 15. */ fpga_multiword_copy(&t2, rx);
- fpga_modular_mul(rx, &t2, rx);
- /* 16. */ fpga_modular_sub(rx, &t1, rx);
- /* 17. */ fpga_modular_sub(rx, &t4, rx);
- /* 18. */ fpga_modular_sub(&t3, rx, &t3);
- /* 19. */ fpga_modular_mul(&t2, &t3, &t3);
- /* 20. */ fpga_modular_mul(py, &t4, &t4);
- /* 21. */ fpga_modular_sub(&t3, &t4, ry);
-
- //
- // final selection
- //
- if (pz_is_zero) // P at infinity ?
- { fpga_multiword_copy(&ECDSA_GX, rx);
- fpga_multiword_copy(&ECDSA_GY, ry);
- fpga_multiword_copy(&ECDSA_ONE, rz);
+ _DUMP_MODULAR_RESULTS = false;
+
+ // P == O
+ if (pz_is_zero)
+ { fpga_multiword_copy(qx, rx);
+ fpga_multiword_copy(qy, ry);
+ fpga_multiword_copy(qz, rz);
+ return;
}
- else if (t1_is_zero) // same x for P and Q ?
- {
- fpga_multiword_copy(t2_is_zero ? &ECDSA_HX : &ECDSA_ONE, rx); // | same y ? (P==Q => R=2*G) : (P==-Q => R=O)
- fpga_multiword_copy(t2_is_zero ? &ECDSA_HY : &ECDSA_ONE, ry); // |
- fpga_multiword_copy(t2_is_zero ? &ECDSA_ONE : &ECDSA_ZERO, rz); // |
+
+ // Q == O
+ if (qz_is_zero)
+ { fpga_multiword_copy(px, rx);
+ fpga_multiword_copy(py, ry);
+ fpga_multiword_copy(pz, rz);
+ return;
}
}
diff --git a/ecdsa_fpga_curve_microcode.cpp b/ecdsa_fpga_curve_microcode.cpp
index 553498c..128e087 100644
--- a/ecdsa_fpga_curve_microcode.cpp
+++ b/ecdsa_fpga_curve_microcode.cpp
@@ -6,7 +6,7 @@
//
// Authors: Pavel Shatov
//
-// Copyright (c) 2018 NORDUnet A/S
+// Copyright (c) 2018, 2021 NORDUnet A/S
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
@@ -51,205 +51,186 @@
//------------------------------------------------------------------------------
//
-// Doubles the point stored in CYCLE_R* and stores the result in CYCLE_S*.
+// Doubles the point stored in CYCLE_R0* and stores the result in CYCLE_T*.
//
//------------------------------------------------------------------------------
-void fpga_curve_double_jacobian_microcode()
+void fpga_curve_double_jacobian_microcode_r0()
//------------------------------------------------------------------------------
{
- // fpga_modular_mul(RZ, RZ, RZ2 ); // 2. RZ2 = RZ * RZ
- // fpga_modular_sub(RX, RZ2, T1 ); // 3. T1 = RX - RZ2
- // fpga_modular_add(RX, RZ2, T2 ); // 4. T2 = RX + RZ2
- // fpga_modular_mul(T1, T2, T3 ); // 5. T3 = T1 * T2
- // fpga_modular_add(T3, T3, T4 ); // 6a. T4 = T3 + T3
- // fpga_modular_add(T3, T4, A ); // 6b. A = T3 + T4
- // fpga_modular_add(RY, RY, B ); // 7. B = RY + RY
- // fpga_modular_mul(B, RZ, SZ ); // 8. SZ = B * RZ [output]
- // fpga_modular_mul(B, B, C ); // 9. C = B * B
- // fpga_modular_mul(C, RX, D ); // 10. D = C * RX
- // fpga_modular_mul(C, C, C2 ); // 11. C2 = C * C
- // fpga_modular_mul(C2, DELTA, C2_2); // 12. C2_2 = C / 2
- // fpga_modular_mul(A, A, A2 ); // 13. A2 = A * A
- // fpga_modular_add(D, D, T1 ); // 14. T1 = D + D
- // fpga_modular_sub(A2, T1, SX ); // 15. SX = A2 - T1 [output]
- // fpga_modular_sub(D, SX, T1 ); // 16. T1 = D - SX
- // fpga_modular_mul(A , T1, T2 ); // 17. T2 = A * T1
- // fpga_modular_sub(T2, C2_2, SY ); // 18. SY = T2 - C2_2 [output]
+ /* BEGIN_MICROCODE: CYCLE_DOUBLE_R0 */
+
+ uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, CYCLE_T1);
+ uop_calc(SUB, BANK_HI, CYCLE_R0X, CYCLE_T1, BANK_LO, CYCLE_T2);
+ uop_calc(ADD, BANK_HI, CYCLE_R0X, CYCLE_T1, BANK_LO, CYCLE_T3);
+ uop_calc(MUL, BANK_LO, CYCLE_T3, CYCLE_T2, BANK_HI, CYCLE_T4);
+ uop_calc(ADD, BANK_HI, CYCLE_T4, CYCLE_T4, BANK_LO, CYCLE_T1);
+
+ uop_move( BANK_HI, CYCLE_T4, BANK_LO, CYCLE_T4);
+
+ uop_calc(ADD, BANK_LO, CYCLE_T1, CYCLE_T4, BANK_HI, CYCLE_T2);
+ uop_calc(ADD, BANK_HI, CYCLE_R0Y, CYCLE_R0Y, BANK_LO, CYCLE_TY);
+ uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_TY, BANK_HI, CYCLE_TZ);
+ uop_calc(MUL, BANK_LO, CYCLE_TY, CYCLE_TY, BANK_HI, CYCLE_T1);
+ uop_calc(MUL, BANK_HI, CYCLE_R0X, CYCLE_T1, BANK_LO, CYCLE_T3);
+ uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T1, BANK_LO, CYCLE_T4);
+ uop_calc(MUL, BANK_LO, CYCLE_T4, CONST_DELTA, BANK_HI, CYCLE_T5);
+ uop_calc(MUL, BANK_HI, CYCLE_T2, CYCLE_T2, BANK_LO, CYCLE_T4);
+ uop_calc(ADD, BANK_LO, CYCLE_T3, CYCLE_T3, BANK_HI, CYCLE_T1);
+
+ uop_move( BANK_LO, CYCLE_T4, BANK_HI, CYCLE_T4);
+
+ uop_calc(SUB, BANK_HI, CYCLE_T4, CYCLE_T1, BANK_LO, CYCLE_TX);
+ uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_TX, BANK_HI, CYCLE_T1);
+ uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T2, BANK_LO, CYCLE_T3);
+
+ uop_move( BANK_LO, CYCLE_T3, BANK_HI, CYCLE_T3);
+
+ uop_calc(SUB, BANK_HI, CYCLE_T3, CYCLE_T5, BANK_LO, CYCLE_TY);
- /* BEGIN_MICROCODE: CYCLE_DOUBLE */
-
- FPGA_BUFFER TEMP;
-
- uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ, BANK_HI, CYCLE_Z2);
- uop_stor(BANK_HI, CYCLE_Z2, &TEMP); print_fpga_buffer("CYCLE_Z2 = ", &TEMP);
-
- uop_calc(SUB, BANK_HI, CYCLE_RX, CYCLE_Z2, BANK_LO, CYCLE_T1);
- uop_stor(BANK_LO, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
-
- uop_calc(ADD, BANK_HI, CYCLE_RX, CYCLE_Z2, BANK_LO, CYCLE_T2);
- uop_stor(BANK_LO, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP);
+ /* END_MICROCODE */
+}
- uop_calc(MUL, BANK_LO, CYCLE_T1, CYCLE_T2, BANK_HI, CYCLE_T3);
- uop_stor(BANK_HI, CYCLE_T3, &TEMP); print_fpga_buffer("CYCLE_T3 = ", &TEMP);
- uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T3, BANK_LO, CYCLE_T4);
- uop_stor(BANK_LO, CYCLE_T4, &TEMP); print_fpga_buffer("CYCLE_T4 = ", &TEMP);
+//------------------------------------------------------------------------------
+//
+// Doubles the point stored in CYCLE_R1* and stores the result in CYCLE_T*.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian_microcode_r1()
+//------------------------------------------------------------------------------
+{
+ /* BEGIN_MICROCODE: CYCLE_DOUBLE_R1 */
+
+ uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_R1Z, BANK_HI, CYCLE_T1);
+ uop_calc(SUB, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T2);
+ uop_calc(ADD, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T3);
+ uop_calc(MUL, BANK_LO, CYCLE_T3, CYCLE_T2, BANK_HI, CYCLE_T4);
+ uop_calc(ADD, BANK_HI, CYCLE_T4, CYCLE_T4, BANK_LO, CYCLE_T1);
+
+ uop_move( BANK_HI, CYCLE_T4, BANK_LO, CYCLE_T4);
+
+ uop_calc(ADD, BANK_LO, CYCLE_T1, CYCLE_T4, BANK_HI, CYCLE_T2);
+ uop_calc(ADD, BANK_HI, CYCLE_R1Y, CYCLE_R1Y, BANK_LO, CYCLE_TY);
+ uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_TY, BANK_HI, CYCLE_TZ);
+ uop_calc(MUL, BANK_LO, CYCLE_TY, CYCLE_TY, BANK_HI, CYCLE_T1);
+ uop_calc(MUL, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T3);
+ uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T1, BANK_LO, CYCLE_T4);
+ uop_calc(MUL, BANK_LO, CYCLE_T4, CONST_DELTA, BANK_HI, CYCLE_T5);
+ uop_calc(MUL, BANK_HI, CYCLE_T2, CYCLE_T2, BANK_LO, CYCLE_T4);
+ uop_calc(ADD, BANK_LO, CYCLE_T3, CYCLE_T3, BANK_HI, CYCLE_T1);
+
+ uop_move( BANK_LO, CYCLE_T4, BANK_HI, CYCLE_T4);
+
+ uop_calc(SUB, BANK_HI, CYCLE_T4, CYCLE_T1, BANK_LO, CYCLE_TX);
+ uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_TX, BANK_HI, CYCLE_T1);
+ uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T2, BANK_LO, CYCLE_T3);
+
+ uop_move( BANK_LO, CYCLE_T3, BANK_HI, CYCLE_T3);
+
+ uop_calc(SUB, BANK_HI, CYCLE_T3, CYCLE_T5, BANK_LO, CYCLE_TY);
- uop_move( BANK_LO, CYCLE_T4, BANK_HI, CYCLE_T4);
+ /* END_MICROCODE */
+}
- uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T4, BANK_LO, CYCLE_A);
- uop_stor(BANK_LO, CYCLE_A, &TEMP); print_fpga_buffer("CYCLE_A = ", &TEMP);
- uop_calc(ADD, BANK_HI, CYCLE_RY, CYCLE_RY, BANK_LO, CYCLE_B);
- uop_stor(BANK_LO, CYCLE_B, &TEMP); print_fpga_buffer("CYCLE_B = ", &TEMP);
+//------------------------------------------------------------------------------
+//
+// Adds the points in CYCLE_R0* and CYCLE_R1*, stores the result in CYCLE_S*.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian_microcode_2()
+{
- uop_calc(MUL, BANK_LO, CYCLE_B, CYCLE_RZ, BANK_HI, CYCLE_SZ);
- uop_stor(BANK_HI, CYCLE_SZ, &TEMP); print_fpga_buffer("CYCLE_SZ = ", &TEMP);
+ /* BEGIN_MICROCODE: CYCLE_ADD */
- uop_calc(MUL, BANK_LO, CYCLE_B, CYCLE_B, BANK_HI, CYCLE_C);
- uop_stor(BANK_HI, CYCLE_C, &TEMP); print_fpga_buffer("CYCLE_C = ", &TEMP);
+ uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, CYCLE_T1);
+ uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_R1Z, BANK_HI, CYCLE_T2);
+
+ uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1);
+ uop_move( BANK_HI, CYCLE_T2, BANK_LO, CYCLE_T2);
- uop_calc(MUL, BANK_HI, CYCLE_C, CYCLE_RX, BANK_LO, CYCLE_D);
- uop_stor(BANK_LO, CYCLE_D, &TEMP); print_fpga_buffer("CYCLE_D = ", &TEMP);
+ uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_T1, BANK_HI, CYCLE_T3);
+ uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_T2, BANK_HI, CYCLE_T4);
- uop_calc(MUL, BANK_HI, CYCLE_C, CYCLE_C, BANK_LO, CYCLE_C2);
- uop_stor(BANK_LO, CYCLE_C2, &TEMP); print_fpga_buffer("CYCLE_C2 = ", &TEMP);
+ uop_calc(MUL, BANK_HI, CYCLE_R0X, CYCLE_T2, BANK_LO, CYCLE_T5);
+ uop_calc(MUL, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T2);
- uop_calc(MUL, BANK_LO, CYCLE_C2, CONST_DELTA, BANK_HI, CYCLE_C2_2);
- uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer("CYCLE_C2_2 = ", &TEMP);
+ uop_calc(MUL, BANK_HI, CYCLE_R0Y, CYCLE_T4, BANK_LO, CYCLE_T6);
+ uop_calc(MUL, BANK_HI, CYCLE_R1Y, CYCLE_T3, BANK_LO, CYCLE_T4);
- uop_calc(MUL, BANK_LO, CYCLE_A, CYCLE_A, BANK_HI, CYCLE_A2);
- uop_stor(BANK_HI, CYCLE_A2, &TEMP); print_fpga_buffer("CYCLE_A2 = ", &TEMP);
+ uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T5, BANK_HI, CYCLE_T7);
+ uop_calc(SUB, BANK_LO, CYCLE_T4, CYCLE_T6, BANK_HI, CYCLE_T8);
- uop_calc(ADD, BANK_LO, CYCLE_D, CYCLE_D, BANK_HI, CYCLE_T1);
- uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+ uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R1Z, BANK_HI, CYCLE_T1);
- uop_calc(SUB, BANK_HI, CYCLE_A2, CYCLE_T1, BANK_LO, CYCLE_SX);
- uop_stor(BANK_LO, CYCLE_SX, &TEMP); print_fpga_buffer("CYCLE_SX = ", &TEMP);
+ uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1);
+ uop_move( BANK_HI, CYCLE_T7, BANK_LO, CYCLE_T7);
- uop_calc(SUB, BANK_LO, CYCLE_D, CYCLE_SX, BANK_HI, CYCLE_T1);
- uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+ uop_calc(MUL, BANK_LO, CYCLE_T7, CYCLE_T1, BANK_HI, CYCLE_SZ);
- uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1);
+ uop_calc(MUL, BANK_HI, CYCLE_T8, CYCLE_T8, BANK_LO, CYCLE_T2);
+ uop_calc(MUL, BANK_LO, CYCLE_T7, CYCLE_T7, BANK_HI, CYCLE_T3);
+ uop_calc(MUL, BANK_HI, CYCLE_T7, CYCLE_T3, BANK_LO, CYCLE_T4);
- uop_calc(MUL, BANK_LO, CYCLE_A, CYCLE_T1, BANK_HI, CYCLE_T2);
- uop_stor(BANK_HI, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP);
+ uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T4, BANK_HI, CYCLE_T1);
- uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_C2_2, BANK_LO, CYCLE_SY);
- uop_stor(BANK_LO, CYCLE_SY, &TEMP); print_fpga_buffer("CYCLE_SY = ", &TEMP);
+ uop_move( BANK_LO, CYCLE_T5, BANK_HI, CYCLE_T5);
- /* END_MICROCODE */
-}
+ uop_calc(MUL, BANK_HI, CYCLE_T5, CYCLE_T3, BANK_LO, CYCLE_T2);
+ uop_calc(ADD, BANK_LO, CYCLE_T2, CYCLE_T2, BANK_HI, CYCLE_T3);
+ uop_calc(SUB, BANK_HI, CYCLE_T1, CYCLE_T3, BANK_LO, CYCLE_SX);
+ uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_SX, BANK_HI, CYCLE_T1);
-//------------------------------------------------------------------------------
-//
-// Adds the base point G to the point stored in CYCLE_S* and stores the result
-// again in CYCLE_R*.
-//
-//------------------------------------------------------------------------------
-void fpga_curve_add_jacobian_microcode()
-{
- //fpga_modular_mul(SZ, SZ, A) ; // 3. A = SZ * SZ
- //fpga_modular_mul(A, SZ, B ); // 4. B = A * SZ
- //fpga_modular_mul(A, &ECDSA_GX, C ); // 5. C = A * GX
- //fpga_modular_mul(B, &ECDSA_GY, D ); // 6. D = B * GY
- //fpga_modular_sub(C, SX, E ); // 7. E = C - SX
- //fpga_modular_sub(D, SY, F ); // 8. F = D - SY
- //fpga_modular_mul(E, SZ, RZ); // 10. RZ = E * SZ [output]
- //fpga_modular_mul(E, E, G ); // 11. G = E * E
- //fpga_modular_mul(E, G, H ); // 12. H = E * G
- //fpga_modular_mul(G, SX, J ); // 13. J = G * SX
- //fpga_modular_add(J, J, T1); // 14. T1 = J + J
- //fpga_modular_mul(F, F, T2); // 15. T2 = F * F
- //fpga_modular_sub(T2, T1, T3); // 16. T3 = T2 - T1
- //fpga_modular_sub(T3, H, RX); // 17. RX = T3 - H [output]
- //fpga_modular_sub(J, RX, T1); // 18. T1 = J - RX
- //fpga_modular_mul(F, T1, T2); // 19. T2 = F * T1
- //fpga_modular_mul(H, SY, T3); // 20. T3 = H * SY
- //fpga_modular_sub(T2, T3, RY); // 21. RY = T2 - T3 [output]
+ uop_move( BANK_HI, CYCLE_T8, BANK_LO, CYCLE_T8);
+ uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1);
- /* BEGIN_MICROCODE: CYCLE_ADD */
+ uop_calc(MUL, BANK_LO, CYCLE_T1, CYCLE_T8, BANK_HI, CYCLE_T2);
+ uop_calc(MUL, BANK_LO, CYCLE_T6, CYCLE_T4, BANK_HI, CYCLE_T3);
+ uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_T3, BANK_LO, CYCLE_SY);
- uop_cmpz( BANK_HI, CYCLE_SZ);
- uop_move( BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_SZ);
- uop_calc(MUL, BANK_LO, CYCLE_SZ, CYCLE_SZ, BANK_HI, CYCLE_A);
- uop_calc(MUL, BANK_HI, CYCLE_A, CYCLE_SZ, BANK_LO, CYCLE_B);
- uop_move( BANK_LO, CYCLE_B, BANK_HI, CYCLE_B);
- uop_calc(MUL, BANK_HI, CYCLE_A, CONST_GX, BANK_LO, CYCLE_C);
- uop_calc(MUL, BANK_HI, CYCLE_B, CONST_GY, BANK_LO, CYCLE_D);
- uop_calc(SUB, BANK_LO, CYCLE_C, CYCLE_SX, BANK_HI, CYCLE_E);
- uop_calc(SUB, BANK_LO, CYCLE_D, CYCLE_SY, BANK_HI, CYCLE_F);
- uop_cmpz( BANK_HI, CYCLE_E);
- uop_cmpz( BANK_HI, CYCLE_F);
- uop_calc(MUL, BANK_HI, CYCLE_E, CYCLE_SZ, BANK_LO, CYCLE_RZ);
- uop_calc(MUL, BANK_HI, CYCLE_E, CYCLE_E, BANK_LO, CYCLE_G);
- uop_move( BANK_LO, CYCLE_G, BANK_HI, CYCLE_G);
- uop_calc(MUL, BANK_HI, CYCLE_E, CYCLE_G, BANK_LO, CYCLE_H);
- uop_calc(MUL, BANK_LO, CYCLE_G, CYCLE_SX, BANK_HI, CYCLE_J);
- uop_calc(ADD, BANK_HI, CYCLE_J, CYCLE_J, BANK_LO, CYCLE_T1);
- uop_calc(MUL, BANK_HI, CYCLE_F, CYCLE_F, BANK_LO, CYCLE_T2);
- uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T1, BANK_HI, CYCLE_T3);
- uop_move( BANK_HI, CYCLE_T3, BANK_LO, CYCLE_T3);
- uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_H, BANK_HI, CYCLE_RX);
- uop_calc(SUB, BANK_HI, CYCLE_J, CYCLE_RX, BANK_LO, CYCLE_T1);
- uop_move( BANK_HI, CYCLE_F, BANK_LO, CYCLE_F);
- uop_calc(MUL, BANK_LO, CYCLE_F, CYCLE_T1, BANK_HI, CYCLE_T2);
- uop_calc(MUL, BANK_LO, CYCLE_H, CYCLE_SY, BANK_HI, CYCLE_T3);
- uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_T3, BANK_LO, CYCLE_RY);
- uop_move( BANK_LO, CYCLE_RY, BANK_HI, CYCLE_RY);
+ uop_cmpz(BANK_LO, CYCLE_R0Z);
+ uop_cmpz(BANK_LO, CYCLE_R1Z);
/* END_MICROCODE */
//
// handle special corner cases
//
- if (uop_flagz_sz)
+
+ if (uop_flagz_r0z && !uop_flagz_r1z)
{
- /* BEGIN_MICROCODE: CYCLE_ADD_AT_INFINITY */
+ /* BEGIN_MICROCODE: CYCLE_ADD_R0_AT_INFINITY */
- uop_move(BANK_LO, CONST_GX, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CONST_GY, BANK_HI, CYCLE_RY);
- uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ);
+ uop_move(BANK_HI, CYCLE_R1X, BANK_LO, CYCLE_SX);
+ uop_move(BANK_HI, CYCLE_R1Y, BANK_LO, CYCLE_SY);
+ uop_move(BANK_LO, CYCLE_R1Z, BANK_HI, CYCLE_SZ);
/* END_MICROCODE */
+
+ return;
}
- else
+
+ if (!uop_flagz_r0z && uop_flagz_r1z)
{
- if (uop_flagz_e)
- {
- if (uop_flagz_f)
- {
- /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X_SAME_Y */
+ /* BEGIN_MICROCODE: CYCLE_ADD_R1_AT_INFINITY */
- uop_move(BANK_LO, CONST_HX, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CONST_HY, BANK_HI, CYCLE_RY);
- uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ);
+ uop_move(BANK_HI, CYCLE_R0X, BANK_LO, CYCLE_SX);
+ uop_move(BANK_HI, CYCLE_R0Y, BANK_LO, CYCLE_SY);
+ uop_move(BANK_LO, CYCLE_R0Z, BANK_HI, CYCLE_SZ);
- /* END_MICROCODE */
- }
- else
- {
- /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X */
+ /* END_MICROCODE */
- uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RY);
- uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ);
+ return;
+ }
- /* END_MICROCODE */
- }
- }
- else
- {
- /* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */
+ /* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */
- uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_T1);
- uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_T2);
- uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_T3);
+ uop_move(BANK_LO, CONST_GX, BANK_HI, CYCLE_SX);
+ uop_move(BANK_LO, CONST_GY, BANK_HI, CYCLE_SY);
+ uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_SZ);
- /* END_MICROCODE */
- }
- }
+ /* END_MICROCODE */
}
@@ -262,6 +243,13 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
FPGA_WORD k_word;
bool k_bit;
+#ifdef DUMP_CYCLE_STATES
+ FPGA_BUFFER r0x, r0y, r0z;
+ FPGA_BUFFER r1x, r1y, r1z;
+ FPGA_BUFFER sx, sy, sz;
+ FPGA_BUFFER tx, ty, tz;
+#endif
+
// initialize internal banks
fpga_multiword_copy(&ECDSA_ZERO, &BUF_LO[CONST_ZERO]);
fpga_multiword_copy(&ECDSA_ZERO, &BUF_HI[CONST_ZERO]);
@@ -278,61 +266,110 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
fpga_multiword_copy(&ECDSA_GY, &BUF_LO[CONST_GY]);
fpga_multiword_copy(&ECDSA_GY, &BUF_HI[CONST_GY]);
- fpga_multiword_copy(&ECDSA_HX, &BUF_LO[CONST_HX]);
- fpga_multiword_copy(&ECDSA_HX, &BUF_HI[CONST_HX]);
-
- fpga_multiword_copy(&ECDSA_HY, &BUF_LO[CONST_HY]);
- fpga_multiword_copy(&ECDSA_HY, &BUF_HI[CONST_HY]);
-
/* BEGIN_MICROCODE: PREPARE */
- // set initial value of R to point at infinity
- uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RY);
- uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ);
+ // set initial value of R0 to point at infinity
+ // set initial value of R1 to the base point
+
+ uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_R0X);
+ uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_R0Y);
+ uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_R0Z);
+
+ uop_move(BANK_LO, CONST_GX, BANK_HI, CYCLE_R1X);
+ uop_move(BANK_LO, CONST_GY, BANK_HI, CYCLE_R1Y);
+ uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_R1Z);
/* END_MICROCODE */
+
/* process bits of k left-to-right */
for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
{
k_word = k->words[word_count-1];
- k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0;
+ k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0;
- // Banks of working cycle operands
- // -------------------------------
- // RX: HI
- // RY: HI
- // RZ: LO
+#ifdef DUMP_CYCLE_STATES
+ dump_cycle_header(word_count, bit_count, k_bit);
+#endif
- // calculate S = 2 * R
- fpga_curve_double_jacobian_microcode();
+ //
+ // calculate S = R0 + R1
+ //
// Banks of working cycle operands
// -------------------------------
+ // R0|1X: HI
+ // R0|1Y: HI
+ // R0|1Z: LO
+
// SX: LO
// SY: LO
// SZ: HI
- // always calculate R = S * G for constant-time operation
- fpga_curve_add_jacobian_microcode();
+ fpga_curve_add_jacobian_microcode_2();
+
+ //
+ // calculate T = 2 * R0 or T = 2 * R1
+ //
// Banks of working cycle operands
// -------------------------------
- // RX: HI
- // RY: HI
- // RZ: LO
+ // R0|1X: HI
+ // R0|1Y: HI
+ // R0|1Z: LO
+
+ // TX: LO
+ // TY: LO
+ // TZ: HI
+ if (!k_bit)
+ fpga_curve_double_jacobian_microcode_r0();
+ else
+ fpga_curve_double_jacobian_microcode_r1();
+
+ //
+ // dump cycle state
+ //
+#ifdef DUMP_CYCLE_STATES
+ uop_stor(BANK_HI, CYCLE_R0X, &r0x);
+ uop_stor(BANK_HI, CYCLE_R0Y, &r0y);
+ uop_stor(BANK_LO, CYCLE_R0Z, &r0z);
+
+ uop_stor(BANK_HI, CYCLE_R1X, &r1x);
+ uop_stor(BANK_HI, CYCLE_R1Y, &r1y);
+ uop_stor(BANK_LO, CYCLE_R1Z, &r1z);
+
+ uop_stor(BANK_LO, CYCLE_SX, &sx);
+ uop_stor(BANK_LO, CYCLE_SY, &sy);
+ uop_stor(BANK_HI, CYCLE_SZ, &sz);
+
+ uop_stor(BANK_LO, CYCLE_TX, &tx);
+ uop_stor(BANK_LO, CYCLE_TY, &ty);
+ uop_stor(BANK_HI, CYCLE_TZ, &tz);
+
+ dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z,
+ &sx, &sy, &sz, &tx, &ty, &tz);
+#endif
+
+ //
+ // update working variables
+ //
if (!k_bit)
{
/* BEGIN_MICROCODE: CYCLE_K0 */
- // revert to the value of S before addition if the current bit of k is not set
- uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY);
- uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_RZ);
+ // R0 = 2 * R0 (double)
+ // R1 = R0 + R1 (add)
+
+ uop_move(BANK_LO, CYCLE_TX, BANK_HI, CYCLE_R0X);
+ uop_move(BANK_LO, CYCLE_TY, BANK_HI, CYCLE_R0Y);
+ uop_move(BANK_HI, CYCLE_TZ, BANK_LO, CYCLE_R0Z);
+
+ uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R1X);
+ uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R1Y);
+ uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_R1Z);
/* END_MICROCODE */
}
@@ -340,74 +377,20 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
{
/* BEGIN_MICROCODE: CYCLE_K1 */
- // do dummy overwrite for constant-time operation
- uop_move(BANK_HI, CYCLE_RX, BANK_LO, CYCLE_SX);
- uop_move(BANK_HI, CYCLE_RY, BANK_LO, CYCLE_SY);
- uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_SZ);
+ // R0 = R0 + R1 (add)
+ // R1 = 2 * R1 (double)
+
+ uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R0X);
+ uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R0Y);
+ uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_R0Z);
+
+ uop_move(BANK_LO, CYCLE_TX, BANK_HI, CYCLE_R1X);
+ uop_move(BANK_LO, CYCLE_TY, BANK_HI, CYCLE_R1Y);
+ uop_move(BANK_HI, CYCLE_TZ, BANK_LO, CYCLE_R1Z);
/* END_MICROCODE */
}
- FPGA_BUFFER TEMP;
-
- //printf("wc = %d, bc = %d\n", word_count-1, bit_count-1);
-
- uop_stor(BANK_LO, CYCLE_RX, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RX = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_RY, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RY = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_RZ, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RZ = ", &TEMP);
-
- uop_stor(BANK_LO, CYCLE_SX, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SX = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_SY, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SY = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_SZ, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SZ = ", &TEMP);
-
- uop_stor(BANK_LO, CYCLE_A, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_A2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A2 = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_B, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_B = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_C, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_C2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2 = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2_2 = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_D, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_D = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_E, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_E = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_F, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_F = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_G, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_G = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_H, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_H = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_J, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_J = ", &TEMP);
-
- uop_stor(BANK_LO, CYCLE_Z2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_Z2 = ", &TEMP);
-
- uop_stor(BANK_LO, CYCLE_T1, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T1 = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_T2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T2 = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_T3, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T3 = ", &TEMP);
- uop_stor(BANK_LO, CYCLE_T4, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T4 = ", &TEMP);
-
- uop_stor(BANK_HI, CYCLE_RX, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RX = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_RY, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RY = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_RZ, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RZ = ", &TEMP);
-
- uop_stor(BANK_HI, CYCLE_SX, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SX = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_SY, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SY = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_SZ, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SZ = ", &TEMP);
-
- uop_stor(BANK_HI, CYCLE_A, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_A2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A2 = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_B, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_B = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_C, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_C2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2 = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2_2 = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_D, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_D = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_E, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_E = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_F, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_F = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_G, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_G = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_H, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_H = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_J, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_J = ", &TEMP);
-
- uop_stor(BANK_HI, CYCLE_Z2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_Z2 = ", &TEMP);
-
- uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T1 = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_T2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T2 = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_T3, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T3 = ", &TEMP);
- uop_stor(BANK_HI, CYCLE_T4, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T4 = ", &TEMP);
-
}
// now convert to affine coordinates
@@ -415,18 +398,18 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
/* BEGIN_MICROCODE: CONVERT */
- uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_RX, BANK_LO, CYCLE_SX);
- uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_RY, BANK_LO, CYCLE_SY);
- uop_cmpz(BANK_LO, CYCLE_RZ);
+ uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_R0X, BANK_LO, CYCLE_SX);
+ uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_R0Y, BANK_LO, CYCLE_SY);
+ uop_cmpz(BANK_LO, CYCLE_R0Z);
/* END_MICROCODE */
- if (uop_flagz_rz)
+ if (uop_flagz_r0z)
{
/* BEGIN_MICROCODE: CONVERT_AT_INFINITY */
- uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RY);
+ uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_R0X);
+ uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_R0Y);
/* END_MICROCODE */
}
@@ -434,15 +417,15 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER
{
/* BEGIN_MICROCODE: CONVERT_REGULAR */
- uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX);
- uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY);
+ uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R0X);
+ uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R0Y);
/* END_MICROCODE */
}
// return
- uop_stor(BANK_HI, CYCLE_RX, qx);
- uop_stor(BANK_HI, CYCLE_RY, qy);
+ uop_stor(BANK_HI, CYCLE_R0X, qx);
+ uop_stor(BANK_HI, CYCLE_R0Y, qy);
}
#endif // USE_MICROCODE
@@ -456,36 +439,48 @@ void fpga_curve_double_jacobian_microcode_wrapper(const FPGA_BUFFER *rx,
FPGA_BUFFER *sz)
//------------------------------------------------------------------------------
{
- uop_load(rx, BANK_HI, CYCLE_RX);
- uop_load(ry, BANK_HI, CYCLE_RY);
- uop_load(rz, BANK_LO, CYCLE_RZ);
+ //
+ // we have two pieces of microcode to double either R0 or R1 (this
+ // depends on the current multiplier bit), here we can just always
+ // use the one meant for R0
- fpga_curve_double_jacobian_microcode();
+ uop_load(rx, BANK_HI, CYCLE_R0X);
+ uop_load(ry, BANK_HI, CYCLE_R0Y);
+ uop_load(rz, BANK_LO, CYCLE_R0Z);
- uop_stor(BANK_LO, CYCLE_SX, sx);
- uop_stor(BANK_LO, CYCLE_SY, sy);
- uop_stor(BANK_HI, CYCLE_SZ, sz);
+ fpga_curve_double_jacobian_microcode_r0();
+
+ uop_stor(BANK_LO, CYCLE_TX, sx);
+ uop_stor(BANK_LO, CYCLE_TY, sy);
+ uop_stor(BANK_HI, CYCLE_TZ, sz);
}
//------------------------------------------------------------------------------
-void fpga_curve_add_jacobian_microcode_wrapper(const FPGA_BUFFER *sx,
- const FPGA_BUFFER *sy,
- const FPGA_BUFFER *sz,
- FPGA_BUFFER *rx,
- FPGA_BUFFER *ry,
- FPGA_BUFFER *rz)
+void fpga_curve_add_jacobian_microcode_2_wrapper(const FPGA_BUFFER *px,
+ const FPGA_BUFFER *py,
+ const FPGA_BUFFER *pz,
+ const FPGA_BUFFER *qx,
+ const FPGA_BUFFER *qy,
+ const FPGA_BUFFER *qz,
+ FPGA_BUFFER *rx,
+ FPGA_BUFFER *ry,
+ FPGA_BUFFER *rz)
//------------------------------------------------------------------------------
{
- uop_load(sx, BANK_LO, CYCLE_SX);
- uop_load(sy, BANK_LO, CYCLE_SY);
- uop_load(sz, BANK_HI, CYCLE_SZ);
+ uop_load(px, BANK_HI, CYCLE_R0X); // first operand P goes into R0 (X, Y: HI; Z: LO)
+ uop_load(py, BANK_HI, CYCLE_R0Y);
+ uop_load(pz, BANK_LO, CYCLE_R0Z);
+
+ uop_load(qx, BANK_HI, CYCLE_R1X); // second operand Q goes into R1 (X, Y: HI; Z: LO)
+ uop_load(qy, BANK_HI, CYCLE_R1Y);
+ uop_load(qz, BANK_LO, CYCLE_R1Z);
- fpga_curve_add_jacobian_microcode();
+ fpga_curve_add_jacobian_microcode_2(); // computes S = R0 + R1
- uop_stor(BANK_HI, CYCLE_RX, rx);
- uop_stor(BANK_HI, CYCLE_RY, ry);
- uop_stor(BANK_LO, CYCLE_RZ, rz);
+ uop_stor(BANK_LO, CYCLE_SX, rx); // the sum S is left in SX: LO, SY: LO, SZ: HI,
+ uop_stor(BANK_LO, CYCLE_SY, ry); // i.e. the same banks the multiplier loop reads
+ uop_stor(BANK_HI, CYCLE_SZ, rz); // S from, so the result must be fetched from there
}