//------------------------------------------------------------------------------ // // curve25519_fpga_modular.cpp // ------------------------------------------ // Modular arithmetic routines for Curve25519 // // Authors: Pavel Shatov // // Copyright (c) 2015-2016, 2018 NORDUnet A/S // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // - Neither the name of the NORDUnet nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // //------------------------------------------------------------------------------ //------------------------------------------------------------------------------ // Headers //------------------------------------------------------------------------------ #include "curve25519_fpga_model.h" //------------------------------------------------------------------------------ // Globals //------------------------------------------------------------------------------ FPGA_BUFFER CURVE25519_1P; FPGA_BUFFER CURVE25519_2P; //------------------------------------------------------------------------------ void fpga_modular_init() //------------------------------------------------------------------------------ { int w_src, w_dst; // word counters // temporary things FPGA_WORD TMP_1P[FPGA_OPERAND_NUM_WORDS] = CURVE25519_1P_INIT; FPGA_WORD TMP_2P[FPGA_OPERAND_NUM_WORDS] = CURVE25519_2P_INIT; /* fill buffers for large multi-word integers, we need to fill them in * reverse order because of the way C arrays are initialized */ for ( w_src = 0, w_dst = FPGA_OPERAND_NUM_WORDS - 1; w_src < FPGA_OPERAND_NUM_WORDS; w_src++, w_dst--) { CURVE25519_1P.words[w_dst] = TMP_1P[w_src]; CURVE25519_2P.words[w_dst] = TMP_2P[w_src]; } } //------------------------------------------------------------------------------ // // Modular addition. // // This routine implements algorithm 3. from "Ultra High Performance ECC over // NIST Primes on Commercial FPGAs". // // s = (a + b) mod q // // The naive approach is like the following: // // 1. s = a + b // 2. if (s >= q) s -= q // // The speed-up trick is to simultaneously calculate (a + b) and (a + b - q) // and then select the right variant. // //------------------------------------------------------------------------------ void fpga_modular_add(const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_BUFFER *S, const FPGA_BUFFER *N) //------------------------------------------------------------------------------ { int w; // word counter FPGA_BUFFER AB, AB_N; // intermediate buffers bool c_in, c_out; // carries bool b_in, b_out; // borrows c_in = false; // first word has no carry b_in = false; // first word has no borrow // run parallel addition and subtraction for (w=0; wwords[w], B->words[w], c_in, &AB.words[w], &c_out); fpga_lowlevel_sub32(AB.words[w], N->words[w], b_in, &AB_N.words[w], &b_out); c_in = c_out; // propagate carry b_in = b_out; // propagate borrow } // now select the right buffer /* * We select the right variant based on borrow and carry flags after * addition and subtraction of the very last pair of words. Note, that * we only need to select the first variant (a + b) when (a + b) < q. * This way if we get negative number after subtraction, we discard it * and use the output of the adder instead. The subtractor output is * negative when borrow flag is set *and* carry flag is not set. When * both borrow and carry are set, the number is non-negative, because * borrow and carry cancel each other out. */ for (w=0; wwords[w] = (b_out && !c_out) ? AB.words[w] : AB_N.words[w]; } //------------------------------------------------------------------------------ // // Modular subtraction. // // This routine implements algorithm 3. from "Ultra High Performance ECC over // NIST Primes on Commercial FPGAs". // // d = (a - b) mod q // // The naive approach is like the following: // // 1. d = a - b // 2. if (a < b) d += q // // The speed-up trick is to simultaneously calculate (a - b) and (a - b + q) // and then select the right variant. // //------------------------------------------------------------------------------ void fpga_modular_sub(const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_BUFFER *D, const FPGA_BUFFER *N) //------------------------------------------------------------------------------ { int w; // word counter FPGA_BUFFER AB, AB_N; // intermediate buffers bool c_in, c_out; // carries bool b_in, b_out; // borrows c_in = false; // first word has no carry b_in = false; // first word has no borrow // run parallel subtraction and addition for (w=0; wwords[w], B->words[w], b_in, &AB.words[w], &b_out); fpga_lowlevel_add32(AB.words[w], N->words[w], c_in, &AB_N.words[w], &c_out); b_in = b_out; // propagate borrow c_in = c_out; // propagate carry } // now select the right buffer /* * We select the right variant based on borrow flag after subtraction * and addition of the very last pair of words. Note, that we only * need to select the second variant (a - b + q) when a < b. This way * if we get negative number after subtraction, we discard it * and use the output of the adder instead. The Subtractor output is * negative when borrow flag is set. */ for (w=0; wwords[w] = b_out ? AB_N.words[w] : AB.words[w]; } //------------------------------------------------------------------------------ // // Modular multiplication for Curve25519. // // p = (a * b) mod q // // The complex algorithm is split into three parts: // // 1. Calculation of partial words // 2. Acccumulation of partial words into full-size product // 3. Modular reduction of the full-size product // // See comments for corresponding helper routines for more information. // //------------------------------------------------------------------------------ void fpga_modular_mul(const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_BUFFER *P, const FPGA_BUFFER *N) //------------------------------------------------------------------------------ { FPGA_WORD_EXTENDED SI[4*FPGA_OPERAND_NUM_WORDS-1]; // parts of intermediate product FPGA_WORD C[2*FPGA_OPERAND_NUM_WORDS]; // full-size intermediate product /* multiply to get partial words */ fpga_modular_mul_helper_multiply(A, B, SI); /* accumulate partial words into full-size product */ fpga_modular_mul_helper_accumulate(SI, C); /* reduce full-size product using special routine */ fpga_modular_mul_helper_reduce(C, P, N); } //------------------------------------------------------------------------------ // // Modular reduction for Curve25519. // // Note, that this routine reduces 512-bit product modulo 2*P, i.e. // 2 * (2^255 -19) = 2^256 - 38. It is computationally more effective to not // fully reduce the result until the very end of X25519 calculation. // // See the "Special Reduction" section of "High-Performance Modular // Multiplication on the Cell Processor" by Joppe W. Bos for more information // about the math behind reduction: http://joppebos.com/files/waifi09.pdf // //------------------------------------------------------------------------------ void fpga_modular_mul_helper_reduce(const FPGA_WORD *C, FPGA_BUFFER *P, const FPGA_BUFFER *N) //------------------------------------------------------------------------------ { int w; // word counter // handy vars FPGA_WORD y; FPGA_WORD x1_msb, x1_lsb; FPGA_WORD x2_msb, x2_lsb; FPGA_WORD x5_msb, x5_lsb; FPGA_WORD n_word; // S1 is 262-bit result after the first reduction attempt // S2 is 257-bit result after the second reduction attempt FPGA_WORD S1[FPGA_OPERAND_NUM_WORDS + 1]; FPGA_WORD S2[FPGA_OPERAND_NUM_WORDS + 1]; // carries during the first and the second stages FPGA_WORD_REDUCED t_carry1; FPGA_WORD_EXTENDED t_carry2; // borrows for the final stage bool b_in, b_out; // temporary result of the final stage FPGA_WORD S2_N[FPGA_OPERAND_NUM_WORDS + 1]; // outputs of adders FPGA_WORD_EXTENDED T1[FPGA_OPERAND_NUM_WORDS + 1]; FPGA_WORD_EXTENDED T2[FPGA_OPERAND_NUM_WORDS + 1]; FPGA_WORD_EXTENDED T3[FPGA_OPERAND_NUM_WORDS + 1]; FPGA_WORD_EXTENDED T4[FPGA_OPERAND_NUM_WORDS + 1]; // parts of full input product FPGA_BUFFER P_LO, P_HI; // split 512-bit input C into two 256-bit parts for (w=0; w 0) ? (FPGA_WORD)(P_HI.words[w-1] >> (32 - 1)) : 0; x2_msb = (w > 0) ? (FPGA_WORD)(P_HI.words[w-1] >> (32 - 2)) : 0; x5_msb = (w > 0) ? (FPGA_WORD)(P_HI.words[w-1] >> (32 - 5)) : 0; // lower parts of shifted copies of P_HI[w] x1_lsb = (w < FPGA_OPERAND_NUM_WORDS) ? (FPGA_WORD)(P_HI.words[w] << 1) : 0; x2_lsb = (w < FPGA_OPERAND_NUM_WORDS) ? (FPGA_WORD)(P_HI.words[w] << 2) : 0; x5_lsb = (w < FPGA_OPERAND_NUM_WORDS) ? (FPGA_WORD)(P_HI.words[w] << 5) : 0; // take care of uppers parts from the previous cycle x1_lsb |= x1_msb; x2_lsb |= x2_msb; x5_lsb |= x5_msb; // current word of P_LO y = (w < FPGA_OPERAND_NUM_WORDS) ? P_LO.words[w] : 0; // run addition fpga_lowlevel_add47(x1_lsb, x2_lsb, &T1[w]); // T1 obtained after clock cycle w fpga_lowlevel_add47(x5_lsb, y, &T2[w]); // T2 obtained after clock cycle w fpga_lowlevel_add47(T1[w], T2[w], &T3[w]); // T3 obtained after clock cycle w+1 (when T1 and T2 are available) fpga_lowlevel_add47(T3[w], t_carry1, &T4[w]); // T4 obtained after clock cycle w+2 (when T3 is available) // store carry t_carry1 = (FPGA_WORD_REDUCED)(T4[w] >> FPGA_WORD_WIDTH); // store word of sum S1[w] = (FPGA_WORD)T4[w]; } /* now repeat what we've just done once again with S1, but this time S1_HI * is 6-bit wide at most, so we can calculate S1_HI * 38 beforehand, * add it to the lowest word of S1_LO and then propagate the carry */ t_carry2 = 0; t_carry2 += (FPGA_WORD_EXTENDED)S1[FPGA_OPERAND_NUM_WORDS] << 1; t_carry2 += (FPGA_WORD_EXTENDED)S1[FPGA_OPERAND_NUM_WORDS] << 2; t_carry2 += (FPGA_WORD_EXTENDED)S1[FPGA_OPERAND_NUM_WORDS] << 5; for (w=0; w<(FPGA_OPERAND_NUM_WORDS+1); w++) { // current word of S1_LO y = (w < FPGA_OPERAND_NUM_WORDS) ? S1[w] : 0; // do addition fpga_lowlevel_add47(t_carry2, y, &T1[w]); // store carry t_carry2 = (FPGA_WORD_REDUCED)(T1[w] >> FPGA_WORD_WIDTH); // store word of sum S2[w] = (FPGA_WORD)T1[w]; } /* So we've ended up with 257-bit result in S2. Note that there can only be * two situations, given our modulus N is 2^256 - 38: * * a) 0 <= S < N * b) N <= S < 2*N * * This is because we've obtained S2 by adding 256-bit quantity S1_LO and * 12-bit quantity 38 * S_HI. S1_LO is at most 2^256 - 1 or N + 37, while * the 12-bit quantity is at most 4095, so the largest possible value of * S2 is N + 4132, which is obviously less than 2*N. * * What we now do is we try subtracting N from S2 to obtain S2_N, if we end up * with a negative number, we return S2 (reduction was not necessary), otherwise * we return S2_N. */ b_in = false; // first word has no borrow // run parallel subtraction for (w=0; w<=FPGA_OPERAND_NUM_WORDS; w++) { // current word of N n_word = (w < FPGA_OPERAND_NUM_WORDS) ? N->words[w] : 0; // do subtraction fpga_lowlevel_sub32(S2[w], n_word, b_in, &S2_N[w], &b_out); // propagate borrow b_in = b_out; } // copy result to output buffer for (w=0; wwords[w] = b_out ? S2[w] : S2_N[w]; } } //------------------------------------------------------------------------------ // // Parallelized multiplication. // // This routine implements the algorithm in Fig. 3. from "Ultra High // Performance ECC over NIST Primes on Commercial FPGAs". // // Inputs A and B are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2 // bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide // inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS // multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial // output words SI[]. // // The order of loading A and B into the multipliers is a bit complicated, // during the first 2*OPERAND_NUM_WORDS-1 cycles one SI word per cycle is // obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the // remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously. // //------------------------------------------------------------------------------ void fpga_modular_mul_helper_multiply(const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_WORD_EXTENDED *SI) //------------------------------------------------------------------------------ { int w; // counter int t, x; // more counters int j, i; // word indices FPGA_WORD p; // product // buffers for smaller words that multipliers can handle FPGA_WORD_REDUCED AI[2*FPGA_OPERAND_NUM_WORDS]; FPGA_WORD_REDUCED BJ[2*FPGA_OPERAND_NUM_WORDS]; // split a and b into smaller words for (w=0; wwords[w], AI[2*w + 1] = (FPGA_WORD_REDUCED)(A->words[w] >> (FPGA_WORD_WIDTH / 2)), BJ[2*w] = (FPGA_WORD_REDUCED)B->words[w], BJ[2*w + 1] = (FPGA_WORD_REDUCED)(B->words[w] >> (FPGA_WORD_WIDTH / 2)); // accumulators FPGA_WORD_EXTENDED mac[2*FPGA_OPERAND_NUM_WORDS]; // clear accumulators for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) mac[w] = 0; // run the crazy algorithm :) for (t=0; t<(2*FPGA_OPERAND_NUM_WORDS); t++) { // save upper half of si[] (one word per cycle) if (t > 0) { SI[4*FPGA_OPERAND_NUM_WORDS - (t+1)] = mac[t]; mac[t] = 0; } // update index j = 2*FPGA_OPERAND_NUM_WORDS - (t+1); // parallel multiplication for (x=0; x<(2*FPGA_OPERAND_NUM_WORDS); x++) { // update index i = t - x; if (i < 0) i += 2*FPGA_OPERAND_NUM_WORDS; // multiply... fpga_lowlevel_mul16(AI[i], BJ[j], &p); // ...accumulate mac[x] += p; } } // now finally save lower half of SI[] (2*OPERAND_NUM_WORDS words at once) for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) SI[w] = mac[2*FPGA_OPERAND_NUM_WORDS - (w+1)]; } //------------------------------------------------------------------------------ // // Accumulation of partial words into full-size product. // // This routine implements the Algorithm 4. from "Ultra High Performance ECC // over NIST Primes on Commercial FPGAs". // // Input words SI[] are accumulated into full-size product C[]. // // The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in // SI[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are // summed in full, odd words are split into two parts. During every cycle the // upper part of the previous word and the lower part of the following word are // summed too. // //------------------------------------------------------------------------------ void fpga_modular_mul_helper_accumulate(const FPGA_WORD_EXTENDED *SI, FPGA_WORD *C) //------------------------------------------------------------------------------ { int w; // word counter FPGA_WORD_EXTENDED cw0, cw1; // intermediate sums FPGA_WORD_REDUCED cw_carry; // wide carry // clear carry cw_carry = 0; // execute the algorithm for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) { // handy flags bool w_is_first = (w == 0); bool w_is_last = (w == (2*FPGA_OPERAND_NUM_WORDS-1)); // accumulate full current even word... // ...and also the upper part of the previous odd word (if not the first word) fpga_lowlevel_add47(SI[2*w], w_is_first ? 0 : SI[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0); // generate another word from "carry" part of the previous even word... // ...and also the lower part of the following odd word (if not the last word) cw1 = w_is_last ? 0 : (FPGA_WORD)(SI[2*w+1] << (FPGA_WORD_WIDTH / 2)); cw1 |= (FPGA_WORD_EXTENDED)cw_carry; // accumulate once again fpga_lowlevel_add47(cw0, cw1, &cw1); // store current word... C[w] = (FPGA_WORD)cw1; // ...and carry cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH); } } //------------------------------------------------------------------------------ // // Custom modular inversion. // // This uses Fermat's little theorem to calculate A1 = A ^ -1. // // The corresponding addition chain which Bernstein calls "straightforward // sequence of 254 squarings and 11 multiplications" in his original paper was // taken from here: // https://briansmith.org/ecc-inversion-addition-chains-01 // //------------------------------------------------------------------------------ void fpga_modular_inv_abstract(const FPGA_BUFFER *A, FPGA_BUFFER *A1, const FPGA_BUFFER *N) { int cyc_count; // counter // temporary variables FPGA_BUFFER R1, R2; FPGA_BUFFER T_1; FPGA_BUFFER T_10; FPGA_BUFFER T_1001; FPGA_BUFFER T_1011; FPGA_BUFFER T_X5; FPGA_BUFFER T_X10; FPGA_BUFFER T_X20; FPGA_BUFFER T_X40; FPGA_BUFFER T_X50; FPGA_BUFFER T_X100; // let's go fpga_multiword_copy(A, &T_1); // fpga_modular_mul(&T_1, &T_1, &T_10, &CURVE25519_2P); // fpga_modular_mul(&T_10, &T_10, &R1, &CURVE25519_2P); fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); fpga_modular_mul(&R2, &T_1, &T_1001, &CURVE25519_2P); // fpga_modular_mul(&T_1001, &T_10, &T_1011, &CURVE25519_2P); // fpga_modular_mul(&T_1011, &T_1011, &R1, &CURVE25519_2P); fpga_modular_mul(&R1, &T_1001, &T_X5, &CURVE25519_2P); // fpga_multiword_copy(&T_X5, &R1); for (cyc_count=0; cyc_count<5; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } fpga_modular_mul(&R2, &T_X5, &T_X10, &CURVE25519_2P); // fpga_multiword_copy(&T_X10, &R1); for (cyc_count=0; cyc_count<10; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } fpga_modular_mul(&R1, &T_X10, &T_X20, &CURVE25519_2P); // fpga_multiword_copy(&T_X20, &R1); for (cyc_count=0; cyc_count<20; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } fpga_modular_mul(&R1, &T_X20, &T_X40, &CURVE25519_2P); // fpga_multiword_copy(&T_X40, &R1); for (cyc_count=0; cyc_count<10; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } fpga_modular_mul(&R1, &T_X10, &T_X50, &CURVE25519_2P); // fpga_multiword_copy(&T_X50, &R1); for (cyc_count=0; cyc_count<50; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } fpga_modular_mul(&R1, &T_X50, &T_X100, &CURVE25519_2P); // fpga_multiword_copy(&T_X100, &R1); for (cyc_count=0; cyc_count<100; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } // fpga_modular_mul(&R1, &T_X100, &R2, &CURVE25519_2P); // for (cyc_count=0; cyc_count<50; cyc_count++) { if ((cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); // inverse order here! else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } // fpga_modular_mul(&R2, &T_X50, &R1, &CURVE25519_2P); // for (cyc_count=0; cyc_count<5; cyc_count++) { if (!(cyc_count % 2)) fpga_modular_mul(&R1, &R1, &R2, &CURVE25519_2P); else fpga_modular_mul(&R2, &R2, &R1, &CURVE25519_2P); } // fpga_modular_mul(&R2, &T_1011, A1, &CURVE25519_2P); } //------------------------------------------------------------------------------ // End-of-File //------------------------------------------------------------------------------