From 1e16303d718986e0e991444a7cdcab3c5c89b1f4 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Sun, 11 Apr 2021 17:42:52 +0300 Subject: Updated curve math layer to do multiplication using the Montgomery ladder method. Also added optional debugging output to help debug microcoded versions of double and add routines. --- ecdsa_fpga_curve_abstract.cpp | 326 +++++++++++++++++++++++------------------- 1 file changed, 178 insertions(+), 148 deletions(-) (limited to 'ecdsa_fpga_curve_abstract.cpp') diff --git a/ecdsa_fpga_curve_abstract.cpp b/ecdsa_fpga_curve_abstract.cpp index 5510ac1..2d25cfc 100644 --- a/ecdsa_fpga_curve_abstract.cpp +++ b/ecdsa_fpga_curve_abstract.cpp @@ -6,7 +6,7 @@ // // Authors: Pavel Shatov // -// Copyright (c) 2015-2016, 2018 NORDUnet A/S +// Copyright (c) 2015-2016, 2018, 2021 NORDUnet A/S // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: @@ -79,14 +79,7 @@ void fpga_curve_init() // Q(qx,qy) = k * G(px,py) // // Note, that Q is supposed to be in affine coordinates. Multiplication is done -// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve -// Cryptography". -// -// WARNING: Though this procedure always does the addition step, it only -// updates the result when current bit of k is set. It does not take any -// active measures to keep run-time constant. The main purpose of this model -// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near -// production! +// using the Montgomery ladder method. // //------------------------------------------------------------------------------ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy) @@ -94,42 +87,91 @@ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER { int word_count, bit_count; // counters - FPGA_BUFFER rx, ry, rz; // intermediate result - FPGA_BUFFER tx, ty, tz; // temporary variable + FPGA_BUFFER r0x, r0y, r0z; // intermediate result + FPGA_BUFFER r1x, r1y, r1z; // intermediate result + FPGA_BUFFER sx, sy, sz; // temporary variable + FPGA_BUFFER tx, ty, tz; // temporary variable + + /* set initial value of R0 to point at infinity, R1 to the base point */ + fpga_multiword_copy(&ECDSA_ONE, &r0x); + fpga_multiword_copy(&ECDSA_ONE, &r0y); + fpga_multiword_copy(&ECDSA_ZERO, &r0z); - /* set initial value of R to point at infinity */ - fpga_multiword_copy(&ECDSA_ONE, &rx); - fpga_multiword_copy(&ECDSA_ONE, &ry); - fpga_multiword_copy(&ECDSA_ZERO, &rz); + fpga_multiword_copy(&ECDSA_GX, &r1x); + fpga_multiword_copy(&ECDSA_GY, &r1y); + fpga_multiword_copy(&ECDSA_ONE, &r1z); + + /* handy vars */ + FPGA_WORD k_word_shifted; + bool k_bit; /* process bits of k left-to-right */ for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--) for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--) { - /* calculate T = 2 * R */ - fpga_curve_double_jacobian_abstract(&rx, &ry, &rz, &tx, &ty, &tz); - - /* always calculate R = T + P for constant-time */ - fpga_curve_add_jacobian_abstract(&tx, &ty, &tz, &rx, &ry, &rz); - - /* revert to the value of T before addition if the current bit of k is not set */ - if (!((k->words[word_count-1] >> (bit_count-1)) & 1)) - { fpga_multiword_copy(&tx, &rx); - fpga_multiword_copy(&ty, &ry); - fpga_multiword_copy(&tz, &rz); - } - + k_word_shifted = k->words[word_count-1] >> (bit_count-1); + k_bit = (k_word_shifted & 1) == 1; + +#ifdef DUMP_CYCLE_STATES + dump_cycle_header(word_count, bit_count, k_bit); +#endif + + /* calculate S = R0 + R */ + fpga_curve_add_jacobian_abstract_2(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, &sx, &sy, &sz); + + /* calculate T = 2 * (R0 | R1) */ + if (!k_bit) + fpga_curve_double_jacobian_abstract(&r0x, &r0y, &r0z, &tx, &ty, &tz); + else + fpga_curve_double_jacobian_abstract(&r1x, &r1y, &r1z, &tx, &ty, &tz); + + // + // dump cycle state + // +#ifdef DUMP_CYCLE_STATES + dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, + &sx, &sy, &sz, &tx, &ty, &tz); +#endif + + /* now update working variables */ + if (!k_bit) + { fpga_multiword_copy(&tx, &r0x); + fpga_multiword_copy(&ty, &r0y); + fpga_multiword_copy(&tz, &r0z); + + fpga_multiword_copy(&sx, &r1x); + fpga_multiword_copy(&sy, &r1y); + fpga_multiword_copy(&sz, &r1z); + } + else + { fpga_multiword_copy(&tx, &r1x); + fpga_multiword_copy(&ty, &r1y); + fpga_multiword_copy(&tz, &r1z); + + fpga_multiword_copy(&sx, &r0x); + fpga_multiword_copy(&sy, &r0y); + fpga_multiword_copy(&sz, &r0z); + } } + // + // we now need to convert the point to affine coordinates + // FPGA_BUFFER a2, a3; - fpga_modular_inv23(&rz, &a2, &a3); +#ifdef DUMP_UOP_OUTPUTS + _DUMP_MODULAR_RESULTS = true; +#endif + + fpga_modular_inv23(&r0z, &a2, &a3); + + fpga_modular_mul(&r0x, &a2, qx); // qx = px * (pz^-1)^2 (mod q) + fpga_modular_mul(&r0y, &a3, qy); // qy = py * (pz^-1)^3 (mod q) - fpga_modular_mul(&rx, &a2, qx); // qx = px * (pz^-1)^2 (mod q) - fpga_modular_mul(&ry, &a3, qy); // qy = py * (pz^-1)^3 (mod q) + _DUMP_MODULAR_RESULTS = false; // check, that rz is non-zero (not point at infinity) - bool rz_is_zero = fpga_multiword_is_zero(&rz); + bool rz_is_zero = fpga_multiword_is_zero(&r0z); // handle special case (result is point at infinity) if (rz_is_zero) @@ -154,21 +196,13 @@ void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER // faster, than multiplication. // // Note, that this routine also handles one special case, namely when P is at -// infinity. +// infinity. No actual extra "handling" is necessary, since when pz is zero, +// rz will also be zero (and that's what the "at infinity" check takes into +// account). // // Instead of actual modular division, multiplication by pre-computed constant // (2^-1 mod q) is done. // -// Note, that FPGA modular multiplier can't multiply a given buffer by itself, -// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome -// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and -// then fpga_modular_mul(pz, &t1, &t1) instead. -// -// WARNING: Though this procedure always does doubling steps, it does not take -// any active measures to keep run-time constant. The main purpose of this -// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere -// near production! -// //------------------------------------------------------------------------------ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px, const FPGA_BUFFER *py, @@ -178,41 +212,32 @@ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px, FPGA_BUFFER *rz) //------------------------------------------------------------------------------ { - FPGA_BUFFER t1, t2, t3; // temporary variables - - // check, whether P is at infinity - bool pz_is_zero = fpga_multiword_is_zero(pz); - - /* 2. */ fpga_multiword_copy(pz, &t1); - fpga_modular_mul(pz, &t1, &t1); - /* 3. */ fpga_modular_sub(px, &t1, &t2); - /* 4. */ fpga_modular_add(px, &t1, &t1); - /* 5. */ fpga_modular_mul(&t1, &t2, &t2); - /* 6. */ fpga_modular_add(&t2, &t2, &t1); - /* */ fpga_modular_add(&t1, &t2, &t2); - /* 7. */ fpga_modular_add(py, py, ry); - /* 8. */ fpga_modular_mul(pz, ry, rz); - /* 9. */ fpga_multiword_copy(ry, &t1); - fpga_multiword_copy(ry, &t3); - fpga_modular_mul(&t1, &t3, ry); - /* 10. */ fpga_modular_mul(px, ry, &t3); - /* 11. */ fpga_multiword_copy(ry, &t1); - fpga_modular_mul(ry, &t1, &t1); - /* 12. */ fpga_modular_mul(&t1, &ECDSA_DELTA, ry); - /* 13. */ fpga_multiword_copy(&t2, &t1); - fpga_modular_mul(&t1, &t2, rx); - /* 14. */ fpga_modular_add(&t3, &t3, &t1); - /* 15. */ fpga_modular_sub(rx, &t1, rx); - /* 16. */ fpga_modular_sub(&t3, rx, &t1); - /* 17. */ fpga_modular_mul(&t1, &t2, &t1); - /* 18. */ fpga_modular_sub(&t1, ry, ry); - - // handle special case (input point is at infinity) - if (pz_is_zero) - { fpga_multiword_copy(&ECDSA_ONE, rx); - fpga_multiword_copy(&ECDSA_ONE, ry); - fpga_multiword_copy(&ECDSA_ZERO, rz); - } + FPGA_BUFFER t1, t2, t3, t4, t5; // temporary variables + +#ifdef DUMP_UOP_OUTPUTS + _DUMP_MODULAR_RESULTS = true; +#endif + + fpga_modular_mul(pz, pz, &t1); + fpga_modular_sub(px, &t1, &t2); + fpga_modular_add(px, &t1, &t3); + fpga_modular_mul(&t3, &t2, &t4); + fpga_modular_add(&t4, &t4, &t1); + fpga_modular_add(&t1, &t4, &t2); + fpga_modular_add(py, py, ry); + fpga_modular_mul(pz, ry, rz); + fpga_modular_mul(ry, ry, &t1); + fpga_modular_mul(px, &t1, &t3); + fpga_modular_mul(&t1, &t1, &t4); + fpga_modular_mul(&t4, &ECDSA_DELTA, &t5); + fpga_modular_mul(&t2, &t2, &t4); + fpga_modular_add(&t3, &t3, &t1); + fpga_modular_sub(&t4, &t1, rx); + fpga_modular_sub(&t3, rx, &t1); + fpga_modular_mul(&t1, &t2, &t3); + fpga_modular_sub(&t3, &t5, ry); + + _DUMP_MODULAR_RESULTS = false; } @@ -220,89 +245,94 @@ void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px, // // Elliptic curve point addition routine. // -// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy) +// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy,qz) // -// Note, that P(px, py, pz) is supposed to be in projective Jacobian -// coordinates, while Q(qx,qy) is supposed to be in affine coordinates, -// R(rx, ry, rz) will be in projective Jacobian coordinates. Moreover, in this -// particular implementation Q is always the base point G. +// Note, that P(px, py, pz) and Q(qx, qy, qz) are supposed to be in projective +// Jacobian coordinates, R(rx, ry, rz) will be in projective Jacobian +// coordinates too. // -// This routine implements algorithm 3.22 from "Guide to Elliptic Curve -// Cryptography". Differences from the original algorithm: +// This routine implements the Point Addition algorithm from +// https://en.wikibooks.org/wiki/Cryptography/Prime_Curve/Jacobian_Coordinates // -// 1) Step 1. is omitted, because point Q is always the base point, which is -// not at infinity by definition. -// -// 2) Step 9.1 just returns the pre-computed double of the base point instead -// of actually doubling it. -// -// Note, that this routine also handles three special cases: -// -// 1) P is at infinity -// 2) P == Q -// 3) P == -Q -// -// Note, that FPGA modular multiplier can't multiply a given buffer by itself, -// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome -// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and -// then fpga_modular_mul(pz, &t1, &t1) instead. +// Since the routine is means to be used with Montgomery ladder, the invariant +// R1 - R0 = G means, that the two special cases P == Q and P == -Q can never +// happen and the checks are redundant. The checks for P === O and Q == O are +// necessary, however. Note, that P and Q can't be at infinity at the same time +// though. // // WARNING: This procedure does not take any active measures to keep run-time // constant. The main purpose of this model is to help debug Verilog code for // FPGA, so *DO NOT* use is anywhere near production! // //------------------------------------------------------------------------------ -void fpga_curve_add_jacobian_abstract(const FPGA_BUFFER *px, - const FPGA_BUFFER *py, - const FPGA_BUFFER *pz, - FPGA_BUFFER *rx, - FPGA_BUFFER *ry, - FPGA_BUFFER *rz) +void fpga_curve_add_jacobian_abstract_2(const FPGA_BUFFER *px, + const FPGA_BUFFER *py, + const FPGA_BUFFER *pz, + const FPGA_BUFFER *qx, + const FPGA_BUFFER *qy, + const FPGA_BUFFER *qz, + FPGA_BUFFER *rx, + FPGA_BUFFER *ry, + FPGA_BUFFER *rz) //------------------------------------------------------------------------------ { - FPGA_BUFFER t1, t2, t3, t4; // temporary variables + bool pz_is_zero = fpga_multiword_is_zero(pz); + bool qz_is_zero = fpga_multiword_is_zero(qz); + + FPGA_BUFFER t1, t2, t3, t4, t5, t6, t7, t8; + +#ifdef DUMP_UOP_OUTPUTS + _DUMP_MODULAR_RESULTS = true; +#endif + + fpga_modular_mul(pz, pz, &t1); // pz2 = pz * pz (pz squared) + fpga_modular_mul(qz, qz, &t2); // qz2 = qz * qz (qz squared) + + fpga_modular_mul(pz, &t1, &t3); // pz3 = pz * pz2 (pz cubed) + fpga_modular_mul(qz, &t2, &t4); // qz3 = qz * qz2 (qz cubed) + + fpga_modular_mul(px, &t2, &t5); // pxz = px * qz2 (px z-adjusted) + fpga_modular_mul(qx, &t1, &t2); // qxz = qx * pz2 (qx z-adjusted) + + fpga_modular_mul(py, &t4, &t6); // pyz = py * qz3 (py z-adjusted) + fpga_modular_mul(qy, &t3, &t4); // qyz = qy * pz3 (qy z-adjusted) + + fpga_modular_sub(&t2, &t5, &t7); // dqpx = qxz - pxz (x-coordinate delta) + fpga_modular_sub(&t4, &t6, &t8); // dqpy = qyz - pyz (y-coordinate delta) + + fpga_modular_mul(pz, qz, &t1); // pqz = pz * qz + fpga_modular_mul(&t7, &t1, rz); // rz = pqz * qdpx + + fpga_modular_mul(&t8, &t8, &t2); // dqpy2 = dqpy * dqpy + fpga_modular_mul(&t7, &t7, &t3); // dqpx2 = dqpx * dqpx + fpga_modular_mul(&t7, &t3, &t4); // dqpx3 = dqpx * dqpx2 + + fpga_modular_sub(&t2, &t4, &t1); // t1 = dqpy2 - dqpx3 + fpga_modular_mul(&t5, &t3, &t2); // t2 = pxz * dqpx2 + fpga_modular_add(&t2, &t2, &t3); // t3 = 2 * t2 (= t2 + t2, which is faster) + fpga_modular_sub(&t1, &t3, rx); // rx = t1 - t3 + + fpga_modular_sub(&t2, rx, &t1); // t1 = t2 - rx + fpga_modular_mul(&t1, &t8, &t2); // t2 = t1 * dqpy + fpga_modular_mul(&t6, &t4, &t3); // t3 = pyz * dqpx3 + fpga_modular_sub(&t2, &t3, ry); // ry = t2 - t3 - bool pz_is_zero = fpga_multiword_is_zero(pz); // Step 2. - - /* 3. */ fpga_multiword_copy(pz, &t1); - fpga_modular_mul(pz, &t1, &t1); - /* 4. */ fpga_modular_mul(pz, &t1, &t2); - /* 5. */ fpga_modular_mul(&t1, &ECDSA_GX, &t1); - /* 6. */ fpga_modular_mul(&t2, &ECDSA_GY, &t2); - /* 7. */ fpga_modular_sub(&t1, px, &t1); - /* 8. */ fpga_modular_sub(&t2, py, &t2); - - bool t1_is_zero = fpga_multiword_is_zero(&t1); // | Step 9. - bool t2_is_zero = fpga_multiword_is_zero(&t2); // | - - /* 10. */ fpga_modular_mul(pz, &t1, rz); - /* 11. */ fpga_multiword_copy(&t1, &t3); - fpga_modular_mul(&t1, &t3, &t3); - /* 12. */ fpga_modular_mul(&t1, &t3, &t4); - /* 13. */ fpga_modular_mul(px, &t3, &t3); - /* 14. */ fpga_modular_add(&t3, &t3, &t1); - /* 15. */ fpga_multiword_copy(&t2, rx); - fpga_modular_mul(rx, &t2, rx); - /* 16. */ fpga_modular_sub(rx, &t1, rx); - /* 17. */ fpga_modular_sub(rx, &t4, rx); - /* 18. */ fpga_modular_sub(&t3, rx, &t3); - /* 19. */ fpga_modular_mul(&t2, &t3, &t3); - /* 20. */ fpga_modular_mul(py, &t4, &t4); - /* 21. */ fpga_modular_sub(&t3, &t4, ry); - - // - // final selection - // - if (pz_is_zero) // P at infinity ? - { fpga_multiword_copy(&ECDSA_GX, rx); - fpga_multiword_copy(&ECDSA_GY, ry); - fpga_multiword_copy(&ECDSA_ONE, rz); + _DUMP_MODULAR_RESULTS = false; + + // P == O + if (pz_is_zero) + { fpga_multiword_copy(qx, rx); + fpga_multiword_copy(qy, ry); + fpga_multiword_copy(qz, rz); + return; } - else if (t1_is_zero) // same x for P and Q ? - { - fpga_multiword_copy(t2_is_zero ? &ECDSA_HX : &ECDSA_ONE, rx); // | same y ? (P==Q => R=2*G) : (P==-Q => R=O) - fpga_multiword_copy(t2_is_zero ? &ECDSA_HY : &ECDSA_ONE, ry); // | - fpga_multiword_copy(t2_is_zero ? &ECDSA_ONE : &ECDSA_ZERO, rz); // | + + // Q == O + if (qz_is_zero) + { fpga_multiword_copy(px, rx); + fpga_multiword_copy(py, ry); + fpga_multiword_copy(pz, rz); + return; } } -- cgit v1.2.3