//------------------------------------------------------------------------------ // // ecdsa_fpga_modular.cpp // ------------------------------------- // Modular arithmetic routines for ECDSA // // Authors: Pavel Shatov // // Copyright 2015-2016, 2018 NORDUnet A/S // Copyright 2021 The Commons Conservancy Cryptech Project // SPDX-License-Identifier: BSD-3-Clause // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // - Neither the name of the copyright holder nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
// //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // Headers //------------------------------------------------------------------------------ #include "ecdsa_fpga_model.h" //------------------------------------------------------------------------------ // Globals //------------------------------------------------------------------------------ FPGA_BUFFER ECDSA_Q; FPGA_BUFFER ECDSA_DELTA; //------------------------------------------------------------------------------ // Settings //------------------------------------------------------------------------------ bool _DUMP_MODULAR_RESULTS = false; //------------------------------------------------------------------------------ void fpga_modular_init() //------------------------------------------------------------------------------ { int w_src, w_dst; // word counters // temporary things FPGA_WORD TMP_Q [FPGA_OPERAND_NUM_WORDS] = ECDSA_Q_INIT; FPGA_WORD TMP_DELTA[FPGA_OPERAND_NUM_WORDS] = ECDSA_DELTA_INIT; /* fill buffers for large multi-word integers, we need to fill them in * reverse order because of the way C arrays are initialized */ for ( w_src = 0, w_dst = FPGA_OPERAND_NUM_WORDS - 1; w_src < FPGA_OPERAND_NUM_WORDS; w_src++, w_dst--) { ECDSA_Q.words[w_dst] = TMP_Q[w_src]; ECDSA_DELTA.words[w_dst] = TMP_DELTA[w_src]; } } //------------------------------------------------------------------------------ // // Modular addition. // // This routine implements algorithm 3. from "Ultra High Performance ECC over // NIST Primes on Commercial FPGAs". // // s = (a + b) mod q // // The naive approach is like the following: // // 1. s = a + b // 2. if (s >= q) s -= q // // The speed-up trick is to simultaneously calculate (a + b) and (a + b - q) // and then select the right variant. 
// //------------------------------------------------------------------------------ void fpga_modular_add(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *s) //------------------------------------------------------------------------------ { int w; // word counter FPGA_BUFFER ab, ab_n; // intermediate buffers bool c_in, c_out; // carries bool b_in, b_out; // borrows c_in = false; // first word has no carry b_in = false; // first word has no borrow // run parallel addition and subtraction for (w=0; wwords[w], b->words[w], c_in, &ab.words[w], &c_out); fpga_lowlevel_sub32(ab.words[w], ECDSA_Q.words[w], b_in, &ab_n.words[w], &b_out); c_in = c_out; // propagate carry b_in = b_out; // propagate borrow } // now select the right buffer /* * We select the right variant based on borrow and carry flags after * addition and subtraction of the very last pair of words. Note, that * we only need to select the first variant (a + b) when (a + b) < q. * This way if we get negative number after subtraction, we discard it * and use the output of the adder instead. The subtractor output is * negative when borrow flag is set *and* carry flag is not set. When * both borrow and carry are set, the number is non-negative, because * borrow and carry cancel each other out. */ for (w=0; wwords[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w]; if (_DUMP_MODULAR_RESULTS) dump_uop_output("ADD", s); } //------------------------------------------------------------------------------ // // Modular subtraction. // // This routine implements algorithm 3. from "Ultra High Performance ECC over // NIST Primes on Commercial FPGAs". // // d = (a - b) mod q // // The naive approach is like the following: // // 1. d = a - b // 2. if (a < b) d += q // // The speed-up trick is to simultaneously calculate (a - b) and (a - b + q) // and then select the right variant. 
// //------------------------------------------------------------------------------ void fpga_modular_sub(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *d) //------------------------------------------------------------------------------ { int w; // word counter FPGA_BUFFER ab, ab_n; // intermediate buffers bool c_in, c_out; // carries bool b_in, b_out; // borrows c_in = false; // first word has no carry b_in = false; // first word has no borrow // run parallel subtraction and addition for (w=0; wwords[w], b->words[w], b_in, &ab.words[w], &b_out); fpga_lowlevel_add32(ab.words[w], ECDSA_Q.words[w], c_in, &ab_n.words[w], &c_out); b_in = b_out; // propagate borrow c_in = c_out; // propagate carry } // now select the right buffer /* * We select the right variant based on borrow flag after subtraction * and addition of the very last pair of words. Note, that we only * need to select the second variant (a - b + q) when a < b. This way * if we get negative number after subtraction, we discard it * and use the output of the adder instead. The Subtractor output is * negative when borrow flag is set. */ for (w=0; wwords[w] = b_out ? ab_n.words[w] : ab.words[w]; if (_DUMP_MODULAR_RESULTS) dump_uop_output("SUB", d); } //------------------------------------------------------------------------------ // // Modular multiplication. // // This routine implements modular multiplication algorithm from "Ultra High // Performance ECC over NIST Primes on Commercial FPGAs". // // p = (a * b) mod q // // The complex algorithm is split into three parts: // // 1. Calculation of partial words // 2. Acccumulation of partial words into full-size product // 3. Modular reduction of the full-size product // // See comments for corresponding helper routines for more information. 
// //------------------------------------------------------------------------------ void fpga_modular_mul(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *p) //------------------------------------------------------------------------------ { FPGA_WORD_EXTENDED si[4*FPGA_OPERAND_NUM_WORDS-1]; // parts of intermediate product FPGA_WORD c[2*FPGA_OPERAND_NUM_WORDS]; // full-size intermediate product /* save debug flag */ bool _save_dump_modular_results = _DUMP_MODULAR_RESULTS; /* mask debug flag to not garble output */ _DUMP_MODULAR_RESULTS = false; /* multiply to get partial words */ fpga_modular_mul_helper_multiply(a, b, si); /* accumulate partial words into full-size product */ fpga_modular_mul_helper_accumulate(si, c); /* reduce full-size product using special routine */ fpga_modular_mul_helper_reduce(c, p); /* restore debug flag */ _DUMP_MODULAR_RESULTS = _save_dump_modular_results; /* now dump result if needed */ if (_DUMP_MODULAR_RESULTS) dump_uop_output("MUL", p); } //------------------------------------------------------------------------------ // // Parallelized multiplication. // // This routine implements the algorithm in Fig. 3. from "Ultra High // Performance ECC over NIST Primes on Commercial FPGAs". // // Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2 // bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide // inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS // multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial // output words si[]. // // The order of loading a and b into the multipliers is a bit complicated, // during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is // obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the // remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously. 
// //------------------------------------------------------------------------------ void fpga_modular_mul_helper_multiply(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si) //------------------------------------------------------------------------------ { int w; // counter int t, x; // more counters int j, i; // word indices FPGA_WORD p; // product // buffers for smaller words that multipliers can handle FPGA_WORD_REDUCED ai[2*FPGA_OPERAND_NUM_WORDS]; FPGA_WORD_REDUCED bj[2*FPGA_OPERAND_NUM_WORDS]; // split a and b into smaller words for (w=0; wwords[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)), bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2)); // accumulators FPGA_WORD_EXTENDED mac[2*FPGA_OPERAND_NUM_WORDS]; // clear accumulators for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) mac[w] = 0; // run the crazy algorithm :) for (t=0; t<(2*FPGA_OPERAND_NUM_WORDS); t++) { // save upper half of si[] (one word per cycle) if (t > 0) { si[4*FPGA_OPERAND_NUM_WORDS - (t+1)] = mac[t]; mac[t] = 0; } // update index j = 2*FPGA_OPERAND_NUM_WORDS - (t+1); // parallel multiplication for (x=0; x<(2*FPGA_OPERAND_NUM_WORDS); x++) { // update index i = t - x; if (i < 0) i += 2*FPGA_OPERAND_NUM_WORDS; // multiply... fpga_lowlevel_mul16(ai[i], bj[j], &p); // ...accumulate mac[x] += p; } } // now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once) for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) si[w] = mac[2*FPGA_OPERAND_NUM_WORDS - (w+1)]; } //------------------------------------------------------------------------------ // // Accumulation of partial words into full-size product. // // This routine implements the Algorithm 4. from "Ultra High Performance ECC // over NIST Primes on Commercial FPGAs". // // Input words si[] are accumulated into full-size product c[]. // // The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in // si[]. 
Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are // summed in full, odd words are split into two parts. During every cycle the // upper part of the previous word and the lower part of the following word are // summed too. // //------------------------------------------------------------------------------ void fpga_modular_mul_helper_accumulate(const FPGA_WORD_EXTENDED *si, FPGA_WORD *c) //------------------------------------------------------------------------------ { int w; // word counter FPGA_WORD_EXTENDED cw0, cw1; // intermediate sums FPGA_WORD_REDUCED cw_carry; // wide carry // clear carry cw_carry = 0; // execute the algorithm for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) { // handy flags bool w_is_first = (w == 0); bool w_is_last = (w == (2*FPGA_OPERAND_NUM_WORDS-1)); // accumulate full current even word... // ...and also the upper part of the previous odd word (if not the first word) fpga_lowlevel_add47(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0); // generate another word from "carry" part of the previous even word... // ...and also the lower part of the following odd word (if not the last word) cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2)); cw1 |= (FPGA_WORD_EXTENDED)cw_carry; // accumulate once again fpga_lowlevel_add47(cw0, cw1, &cw1); // store current word... c[w] = (FPGA_WORD)cw1; // ...and carry cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH); } } //------------------------------------------------------------------------------ // // Fast modular reduction for NIST prime P-256. // // p = c mod p256 // // This routine implements the algorithm 2.29 from "Guide to Elliptic Curve // Cryptography". // // Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c // on the other hand is the output of the parallelized Comba multiplier, so it // is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS). 
//
// To save FPGA resources, the calculation is done using only two adders and
// one subtractor. The algorithm is split into five steps.
//
//------------------------------------------------------------------------------
#if USE_CURVE == 1
void fpga_modular_mul_helper_reduce_p256(const FPGA_WORD *c, FPGA_BUFFER *p)
{
    enum
    {
        P256_WORDS = 8,     // words per "funny" word
        P256_TERMS = 9,     // number of "funny" words (s1..s9)
        XX         = -1     // table filler: this word is zero
    };

        // Which input word goes where. One row per "funny" word, columns run
        // from words[7] on the left down to words[0] on the right.
    static const int layout[P256_TERMS][P256_WORDS] =
    {
        /* s1 */ {  7,  6,  5,  4,  3,  2,  1,  0 },
        /* s2 */ { 15, 14, 13, 12, 11, XX, XX, XX },
        /* s3 */ { XX, 15, 14, 13, 12, XX, XX, XX },
        /* s4 */ { 15, 14, XX, XX, XX, 10,  9,  8 },
        /* s5 */ {  8, 13, 15, 14, 13, 11, 10,  9 },
        /* s6 */ { 10,  8, XX, XX, XX, 13, 12, 11 },
        /* s7 */ { 11,  9, XX, XX, 15, 14, 13, 12 },
        /* s8 */ { 12, XX, 10,  9,  8, 15, 14, 13 },
        /* s9 */ { 13, XX, 11, 10,  9, XX, 15, 14 }
    };

        // "funny" words, s[n] is sn from the book (s[0] is not used)
    FPGA_BUFFER s[P256_TERMS + 1];

        // intermediate results
    FPGA_BUFFER sum0, sum1, difference;

    int term, col;

        // compose "funny" words out of input words
    for (term = 1; term <= P256_TERMS; term++)
    {
        for (col = 0; col < P256_WORDS; col++)
        {
            const int src = layout[term - 1][col];

            s[term].words[P256_WORDS - 1 - col] = (src == XX) ? 0 : c[src];
        }
    }

    /* Step 1. */
    fpga_modular_add(&s[2], &s[2], &sum0);                      // sum0 = 2*s2
    fpga_modular_add(&s[3], &s[3], &sum1);                      // sum1 = 2*s3
    fpga_modular_sub(&ECDSA_ZERO, &s[6], &difference);          // difference = -s6

    /* Step 2. */
    fpga_modular_add(&sum0, &s[1], &sum0);                      // sum0 = s1 + 2*s2
    fpga_modular_add(&sum1, &s[4], &sum1);                      // sum1 = s4 + 2*s3
    fpga_modular_sub(&difference, &s[7], &difference);          // difference = -(s6 + s7)

    /* Step 3. */
    fpga_modular_add(&sum0, &s[5], &sum0);                      // sum0 = s1 + 2*s2 + s5
    fpga_modular_add(&sum1, &ECDSA_ZERO, &sum1);                // compulsory cycle to keep sum1 constant for next stage
    fpga_modular_sub(&difference, &s[8], &difference);          // difference = -(s6 + s7 + s8)

    /* Step 4. (the sum1 adder runs a dummy cycle, its result is ignored) */
    fpga_modular_add(&sum0, &sum1, &sum0);                      // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
    fpga_modular_sub(&difference, &s[9], &difference);          // difference = -(s6 + s7 + s8 + s9)

    /* Step 5. (the sum1 adder and the subtractor run dummy cycles, results are ignored) */
    fpga_modular_add(&sum0, &difference, p);                    // p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
}
#endif


//------------------------------------------------------------------------------
//
// Fast modular reduction for NIST prime P-384.
//
// p = c mod p384
//
// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
// Cryptography".
//
// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
// on the other hand is the output of the parallelized Comba multiplier, so it
// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
//
// To save FPGA resources, the calculation is done using only two adders and
// one subtractor. The algorithm is split into five steps.
//
//------------------------------------------------------------------------------
#if USE_CURVE == 2
void fpga_modular_mul_helper_reduce_p384(const FPGA_WORD *c, FPGA_BUFFER *p)
{
    enum
    {
        P384_WORDS = 12,    // words per "funny" word
        P384_TERMS = 10,    // number of "funny" words (s1..s10)
        XX         = -1     // table filler: this word is zero
    };

        // Which input word goes where. One row per "funny" word, columns run
        // from words[11] on the left down to words[0] on the right.
    static const int layout[P384_TERMS][P384_WORDS] =
    {
        /* s1  */ { 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0 },
        /* s2  */ { XX, XX, XX, XX, XX, 23, 22, 21, XX, XX, XX, XX },
        /* s3  */ { 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12 },
        /* s4  */ { 20, 19, 18, 17, 16, 15, 14, 13, 12, 23, 22, 21 },
        /* s5  */ { 19, 18, 17, 16, 15, 14, 13, 12, 20, XX, 23, XX },
        /* s6  */ { XX, XX, XX, XX, 23, 22, 21, 20, XX, XX, XX, XX },
        /* s7  */ { XX, XX, XX, XX, XX, XX, 23, 22, 21, XX, XX, 20 },
        /* s8  */ { 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 23 },
        /* s9  */ { XX, XX, XX, XX, XX, XX, XX, 23, 22, 21, 20, XX },
        /* s10 */ { XX, XX, XX, XX, XX, XX, XX, 23, 23, XX, XX, XX }
    };

        // "funny" words, s[n] is sn from the book (s[0] is not used)
    FPGA_BUFFER s[P384_TERMS + 1];

        // intermediate results
    FPGA_BUFFER sum0, sum1, difference;

    int term, col;

        // compose "funny" words out of input words
    for (term = 1; term <= P384_TERMS; term++)
    {
        for (col = 0; col < P384_WORDS; col++)
        {
            const int src = layout[term - 1][col];

            s[term].words[P384_WORDS - 1 - col] = (src == XX) ? 0 : c[src];
        }
    }

    /* Step 1. */
    fpga_modular_add(&s[1], &s[3], &sum0);                      // sum0 = s1 + s3
    fpga_modular_add(&s[2], &s[2], &sum1);                      // sum1 = 2*s2
    fpga_modular_sub(&ECDSA_ZERO, &s[8], &difference);          // difference = -s8

    /* Step 2. */
    fpga_modular_add(&sum0, &s[4], &sum0);                      // sum0 = s1 + s3 + s4
    fpga_modular_add(&sum1, &s[5], &sum1);                      // sum1 = 2*s2 + s5
    fpga_modular_sub(&difference, &s[9], &difference);          // difference = -(s8 + s9)

    /* Step 3. */
    fpga_modular_add(&sum0, &s[6], &sum0);                      // sum0 = s1 + s3 + s4 + s6
    fpga_modular_add(&sum1, &s[7], &sum1);                      // sum1 = 2*s2 + s5 + s7
    fpga_modular_sub(&difference, &s[10], &difference);         // difference = -(s8 + s9 + s10)

    /* Step 4. (the sum1 adder runs a dummy cycle, its result is ignored) */
    fpga_modular_add(&sum0, &sum1, &sum0);                      // sum0 = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7
    fpga_modular_sub(&difference, &ECDSA_ZERO, &difference);    // compulsory cycle to keep difference constant for next stage

    /* Step 5. (the sum1 adder and the subtractor run dummy cycles, results are ignored) */
    fpga_modular_add(&sum0, &difference, p);                    // p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
}
#endif


#if USE_CURVE == 1 || USE_CURVE == 2
//------------------------------------------------------------------------------
//
// Helpers for the inversion addition chains below.
//
// The chains never multiply in place. The running value lives in one of two
// scratch buffers and every multiplication writes into the other one, after
// which the two swap roles. *cur always points at the buffer that holds the
// running value, *alt at the one that gets overwritten next.
//
//------------------------------------------------------------------------------

// running value <- running value * factor (factor may be the running value itself)
static void inv23_mul_pingpong(FPGA_BUFFER **cur, FPGA_BUFFER **alt, const FPGA_BUFFER *factor)
{
    FPGA_BUFFER *produced = *alt;

    fpga_modular_mul(*cur, factor, produced);

    *alt = *cur;
    *cur = produced;
}

// running value <- running value squared 'count' times in a row
static void inv23_sqr_pingpong(FPGA_BUFFER **cur, FPGA_BUFFER **alt, int count)
{
    int n;

    for (n = 0; n < count; n++)
        inv23_mul_pingpong(cur, alt, *cur);
}

// out <- (base squared 'count' times) * factor, using r1 and r2 as scratch;
// base is first copied into r1, so the squarings always start from r1
static void inv23_chain_link(const FPGA_BUFFER *base, int count, const FPGA_BUFFER *factor,
                             FPGA_BUFFER *out, FPGA_BUFFER *r1, FPGA_BUFFER *r2)
{
    FPGA_BUFFER *cur = r1;
    FPGA_BUFFER *alt = r2;

    fpga_multiword_copy(base, r1);
    inv23_sqr_pingpong(&cur, &alt, count);
    fpga_modular_mul(cur, factor, out);
}
#endif


#if USE_CURVE == 1
//------------------------------------------------------------------------------
//
// This uses the addition chain from
//
// < https://briansmith.org/ecc-inversion-addition-chains-01 >
//
// to calculate A2 = A^-2 and A3 = A^-3.
//
// Xn below stands for A^(2^n - 1). A2 is stored before A is read for the
// last time, the order of these operations follows the original sequence.
//
//------------------------------------------------------------------------------
void fpga_modular_inv23_p256(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
//------------------------------------------------------------------------------
{
        // scratch buffers
    FPGA_BUFFER R1, R2;

        // intermediate helper quantities
    FPGA_BUFFER X1, X2, X3, X6, X12, X15, X30, X32;

        // ping-pong state for the final part of the chain
    FPGA_BUFFER *cur, *alt;

        // X1
    fpga_multiword_copy(A, &X1);

        // X2 = X1^2 * X1
    fpga_modular_mul(&X1, &X1, &R1);
    fpga_modular_mul(&R1, &X1, &X2);

        // X3 = X2^2 * X1
    fpga_modular_mul(&X2, &X2, &R1);
    fpga_modular_mul(&R1, &X1, &X3);

        // every next quantity is an earlier one squared several times and
        // multiplied by another earlier one
    inv23_chain_link(&X3,   3, &X3,  &X6,  &R1, &R2);   // X6
    inv23_chain_link(&X6,   6, &X6,  &X12, &R1, &R2);   // X12
    inv23_chain_link(&X12,  3, &X3,  &X15, &R1, &R2);   // X15
    inv23_chain_link(&X15, 15, &X15, &X30, &R1, &R2);   // X30
    inv23_chain_link(&X30,  2, &X2,  &X32, &R1, &R2);   // X32

        // now compute the final results, starting from X32 placed in R1
    cur = &R1;
    alt = &R2;
    fpga_multiword_copy(&X32, cur);

    inv23_sqr_pingpong(&cur, &alt,  32);
    inv23_mul_pingpong(&cur, &alt, &X1);

    inv23_sqr_pingpong(&cur, &alt, 128);
    inv23_mul_pingpong(&cur, &alt, &X32);

    inv23_sqr_pingpong(&cur, &alt,  32);
    inv23_mul_pingpong(&cur, &alt, &X32);

    inv23_sqr_pingpong(&cur, &alt,  30);
    inv23_mul_pingpong(&cur, &alt, &X30);

    inv23_sqr_pingpong(&cur, &alt,   2);

        // A2 obtained
    fpga_multiword_copy(cur, A2);

        // now compute inverse cubed from inverse squared: A3 = A2^2 * A
    inv23_sqr_pingpong(&cur, &alt, 1);
    inv23_mul_pingpong(&cur, &alt, A);

        // A3 obtained
    fpga_multiword_copy(cur, A3);
}
#endif


#if USE_CURVE == 2
//------------------------------------------------------------------------------
//
// This uses the addition chain from
//
// < https://briansmith.org/ecc-inversion-addition-chains-01 >
//
// to calculate A2 = A^-2 and A3 = A^-3.
//
// Xn below stands for A^(2^n - 1). A2 is stored before A is read for the
// last time, the order of these operations follows the original sequence.
//
//------------------------------------------------------------------------------
void fpga_modular_inv23_p384(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
//------------------------------------------------------------------------------
{
        // scratch buffers
    FPGA_BUFFER R1, R2;

        // intermediate helper quantities
    FPGA_BUFFER X1, X2, X3, X6, X12, X15, X30, X60, X120;

        // ping-pong state for the final part of the chain
    FPGA_BUFFER *cur, *alt;

        // X1
    fpga_multiword_copy(A, &X1);

        // X2 = X1^2 * X1
    fpga_modular_mul(&X1, &X1, &R1);
    fpga_modular_mul(&R1, &X1, &X2);

        // X3 = X2^2 * X1
    fpga_modular_mul(&X2, &X2, &R1);
    fpga_modular_mul(&R1, &X1, &X3);

        // every next quantity is an earlier one squared several times and
        // multiplied by another earlier one
    inv23_chain_link(&X3,   3, &X3,  &X6,   &R1, &R2);  // X6
    inv23_chain_link(&X6,   6, &X6,  &X12,  &R1, &R2);  // X12
    inv23_chain_link(&X12,  3, &X3,  &X15,  &R1, &R2);  // X15
    inv23_chain_link(&X15, 15, &X15, &X30,  &R1, &R2);  // X30
    inv23_chain_link(&X30, 30, &X30, &X60,  &R1, &R2);  // X60
    inv23_chain_link(&X60, 60, &X60, &X120, &R1, &R2);  // X120

        // now compute the final results, starting from X120 placed in R1
    cur = &R1;
    alt = &R2;
    fpga_multiword_copy(&X120, cur);

    inv23_sqr_pingpong(&cur, &alt, 120);
    inv23_mul_pingpong(&cur, &alt, &X120);

    inv23_sqr_pingpong(&cur, &alt,  15);
    inv23_mul_pingpong(&cur, &alt, &X15);

    inv23_sqr_pingpong(&cur, &alt,  31);
    inv23_mul_pingpong(&cur, &alt, &X30);

    inv23_sqr_pingpong(&cur, &alt,   2);
    inv23_mul_pingpong(&cur, &alt, &X2);

    inv23_sqr_pingpong(&cur, &alt,  94);
    inv23_mul_pingpong(&cur, &alt, &X30);

    inv23_sqr_pingpong(&cur, &alt,   2);

        // A2 obtained
    fpga_multiword_copy(cur, A2);

        // now compute inverse cubed from inverse squared: A3 = A2^2 * A
    inv23_sqr_pingpong(&cur, &alt, 1);
    inv23_mul_pingpong(&cur, &alt, A);

        // A3 obtained
    fpga_multiword_copy(cur, A3);
}
#endif


//------------------------------------------------------------------------------
// End-of-File
//------------------------------------------------------------------------------