From 71fba5c6662d6bbe74366c03b531a5f86b25bbbb Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Mon, 26 Feb 2018 13:05:17 +0300 Subject: Initial commit of C reference model for ECDH cores. --- fpga_modular.cpp | 838 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 838 insertions(+) create mode 100644 fpga_modular.cpp (limited to 'fpga_modular.cpp') diff --git a/fpga_modular.cpp b/fpga_modular.cpp new file mode 100644 index 0000000..5f7f5b7 --- /dev/null +++ b/fpga_modular.cpp @@ -0,0 +1,838 @@ +//------------------------------------------------------------------------------ +// +// fpga_modular.cpp +// --------------------------- +// Modular arithmetic routines +// +// Authors: Pavel Shatov +// +// Copyright (c) 2015-2016, 2018 NORDUnet A/S +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// - Neither the name of the NORDUnet nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ + + +//------------------------------------------------------------------------------ +// Headers +//------------------------------------------------------------------------------ +#include +#include "ecdh_fpga_model.h" +#include "fpga_lowlevel.h" +#include "fpga_modular.h" + + +//------------------------------------------------------------------------------ +// Globals +//------------------------------------------------------------------------------ +FPGA_BUFFER ecdsa_q; +FPGA_BUFFER ecdsa_zero; +FPGA_BUFFER ecdsa_one; +FPGA_BUFFER ecdsa_delta; + + +//------------------------------------------------------------------------------ +void fpga_modular_init() +//------------------------------------------------------------------------------ +{ + int w; // word counter + + FPGA_BUFFER tmp_q = ECDSA_Q; + FPGA_BUFFER tmp_zero = ECDSA_ZERO; + FPGA_BUFFER tmp_one = ECDSA_ONE; + FPGA_BUFFER tmp_delta = ECDSA_DELTA; + + /* fill buffers for large multi-word integers */ + for (w=0; w= q) s -= q +// +// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q) +// and then select the right variant. +// +//------------------------------------------------------------------------------ +void fpga_modular_add(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s) +//------------------------------------------------------------------------------ +{ + int w; // word counter + FPGA_BUFFER ab, ab_n; // intermediate buffers + bool c_in, c_out; // carries + bool b_in, b_out; // borrows + + c_in = false; // first word has no carry + b_in = false; // first word has no borrow + + // run parallel addition and subtraction + for (w=0; wwords[w], b->words[w], c_in, &ab.words[w], &c_out); + fpga_lowlevel_sub32(ab.words[w], ecdsa_q.words[w], b_in, &ab_n.words[w], &b_out); + + c_in = c_out; // propagate carry + b_in = b_out; // propagate borrow + } + + // now select the right buffer + + /* + * We select the right variant based on borrow and carry flags after + * addition and subtraction of the very last pair of words. Note, that + * we only need to select the first variant (a + b) when (a + b) < q. + * This way if we get negative number after subtraction, we discard it + * and use the output of the adder instead. The subtractor output is + * negative when borrow flag is set *and* carry flag is not set. When + * both borrow and carry are set, the number is non-negative, because + * borrow and carry cancel each other out. + */ + for (w=0; wwords[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w]; +} + + +//------------------------------------------------------------------------------ +// +// Modular subtraction. +// +// This routine implements algorithm 3. from "Ultra High Performance ECC over +// NIST Primes on Commercial FPGAs". +// +// d = (a - b) mod q +// +// The naive approach is like the following: +// +// 1. d = a - b +// 2. if (a < b) d += q +// +// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q) +// and then select the right variant. +// +//------------------------------------------------------------------------------ +void fpga_modular_sub(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d) +//------------------------------------------------------------------------------ +{ + int w; // word counter + FPGA_BUFFER ab, ab_n; // intermediate buffers + bool c_in, c_out; // carries + bool b_in, b_out; // borrows + + c_in = false; // first word has no carry + b_in = false; // first word has no borrow + + // run parallel subtraction and addition + for (w=0; wwords[w], b->words[w], b_in, &ab.words[w], &b_out); + fpga_lowlevel_add32(ab.words[w], ecdsa_q.words[w], c_in, &ab_n.words[w], &c_out); + + b_in = b_out; // propagate borrow + c_in = c_out; // propagate carry + } + + // now select the right buffer + + /* + * We select the right variant based on borrow flag after subtraction + * and addition of the very last pair of words. Note, that we only + * need to select the second variant (a - b + q) when a < b. This way + * if we get negative number after subtraction, we discard it + * and use the output of the adder instead. The Subtractor output is + * negative when borrow flag is set. + */ + for (w=0; wwords[w] = b_out ? ab_n.words[w] : ab.words[w]; +} + + +//------------------------------------------------------------------------------ +// +// Modular multiplication. +// +// This routine implements modular multiplication algorithm from "Ultra High +// Performance ECC over NIST Primes on Commercial FPGAs". +// +// p = (a * b) mod q +// +// The complex algorithm is split into three parts: +// +// 1. Calculation of partial words +// 2. Acccumulation of partial words into full-size product +// 3. Modular reduction of the full-size product +// +// See comments for corresponding helper routines for more information. +// +//------------------------------------------------------------------------------ +void fpga_modular_mul(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p) +//------------------------------------------------------------------------------ +{ + FPGA_WORD_EXTENDED si[4*OPERAND_NUM_WORDS-1]; // parts of intermediate product + FPGA_WORD c[2*OPERAND_NUM_WORDS]; // full-size intermediate product + + /* multiply to get partial words */ + fpga_modular_mul_helper_multiply(a, b, si); + + /* accumulate partial words into full-size product */ + fpga_modular_mul_helper_accumulate(si, c); + + /* reduce full-size product using special routine */ + fpga_modular_mul_helper_reduce(c, p); +} + + +//------------------------------------------------------------------------------ +// +// Modular multiplicative inversion procedure. +// +// a1 = a^-1 (mod n) +// +// This routine implements the algorithm from "Constant Time Modular +// Inversion" by Joppe W. Bos (http://www.joppebos.com/files/CTInversion.pdf) +// +// The algorithm has two phases: 1) calculation of "almost" modular inverse, +// which is a^-1*2^k and 2) removal of redundant factor 2^k. +// +// The first stage has four temporary variables: r, s, u, v; they are updated +// at every iteration. Depending on flags there can be four branches, FPGA +// will pre-calculate all possible values in parallel and then use a multiplexor +// to select the next value. This implementation also calculates all possible +// outcomes. This is done for debugging purposes, *NOT* for constant run-time! +// +// The second part only works with s and k. +// +// Note, that k is a simple counter, and it can't exceed 2*OPERAND_WIDTH. +// +// The complex inversion algorithm uses helper routines. Note, that width of the +// intermediate results can temporarily exceed OPERAND_WIDTH, so all the helper +// routines process OPERAND_NUM_WORDS+1 words. +// +//------------------------------------------------------------------------------ +void fpga_modular_inv(FPGA_BUFFER *a, FPGA_BUFFER *a1) +{ + int i; // round counter + int w; // word counter + int k; // redundant power of 2 + + /* q, 1 */ + FPGA_WORD buf_q[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_1[OPERAND_NUM_WORDS+1]; + + /* r, s */ + FPGA_WORD buf_r[OPERAND_NUM_WORDS+1], buf_s[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_r_double[OPERAND_NUM_WORDS+1], buf_s_double[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_r_new[OPERAND_NUM_WORDS+1], buf_s_new[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_r_plus_s[OPERAND_NUM_WORDS+1], buf_s_plus_r[OPERAND_NUM_WORDS+1]; + + /* u, v */ + FPGA_WORD buf_u[OPERAND_NUM_WORDS+1], buf_v[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_u_half[OPERAND_NUM_WORDS+1], buf_v_half[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_u_minus_v[OPERAND_NUM_WORDS+1], buf_v_minus_u[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_u_minus_v_half[OPERAND_NUM_WORDS+1], buf_v_minus_u_half[OPERAND_NUM_WORDS+1]; + FPGA_WORD buf_u_new[OPERAND_NUM_WORDS+1], buf_v_new[OPERAND_NUM_WORDS+1]; + + /* comparison */ + int cmp_v_1, cmp_u_v; + + /* clear buffers */ + for (w=0; w<=OPERAND_NUM_WORDS; w++) + buf_r[w] = 0, buf_s[w] = 0, + buf_u[w] = 0, buf_v[w] = 0, + buf_q[w] = 0, buf_1[w] = 0; + + /* initialize q, 1 */ + for (w=0; wwords[w]; + + /* initialize k */ + k = 0; + + /* flags for the first stage */ + bool v_is_1, u_is_greater_than_v, u_is_even, v_is_even; + + /* first stage */ + for (i=0; i<(2*OPERAND_WIDTH); i++) + { + /* pre-calculate all possible values for r and s */ + fpga_modular_inv_helper_shl(buf_r, buf_r_double); // r_double = 2 * r + fpga_modular_inv_helper_shl(buf_s, buf_s_double); // s_double = 2 * s + fpga_modular_inv_helper_add(buf_r, buf_s, buf_r_plus_s); // r_plus_s = r + s + fpga_modular_inv_helper_add(buf_s, buf_r, buf_s_plus_r); // s_plus_r = s + r + + /* pre-calculate all possible values for u and v */ + fpga_modular_inv_helper_shr(buf_u, buf_u_half); // u_half = u / 2 + fpga_modular_inv_helper_shr(buf_v, buf_v_half); // v_half = v / 2 + fpga_modular_inv_helper_sub(buf_u, buf_v, buf_u_minus_v); // u_minus_v = u - v + fpga_modular_inv_helper_sub(buf_v, buf_u, buf_v_minus_u); // v_minus_u = v - u + fpga_modular_inv_helper_shr(buf_u_minus_v, buf_u_minus_v_half); // u_minus_v_half = u_minus_v / 2 + fpga_modular_inv_helper_shr(buf_v_minus_u, buf_v_minus_u_half); // v_minus_u_half = v_minus_u / 2 + + /* compare */ + fpga_modular_inv_helper_cmp(buf_v, buf_1, &cmp_v_1); + fpga_modular_inv_helper_cmp(buf_u, buf_v, &cmp_u_v); + + /* assign flags */ + v_is_1 = (cmp_v_1 == 0); + u_is_greater_than_v = (cmp_u_v > 0); + u_is_even = !(buf_u[0] & 1); + v_is_even = !(buf_v[0] & 1); + + /* select u */ + if ( u_is_even) fpga_modular_inv_helper_cpy(buf_u_new, buf_u_half); + if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_u_new, buf_u); + if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_u_new, u_is_greater_than_v ? buf_u_minus_v_half : buf_u); + + /* select v */ + if ( u_is_even) fpga_modular_inv_helper_cpy(buf_v_new, buf_v); + if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_v_new, buf_v_half); + if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_v_new, u_is_greater_than_v ? buf_v : buf_v_minus_u_half); + + /* select r */ + if ( u_is_even) fpga_modular_inv_helper_cpy(buf_r_new, buf_r); + if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_r_new, buf_r_double); + if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_r_new, u_is_greater_than_v ? buf_r_plus_s : buf_r_double); + + /* select s */ + if ( u_is_even) fpga_modular_inv_helper_cpy(buf_s_new, buf_s_double); + if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_s_new, buf_s); + if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_s_new, u_is_greater_than_v ? buf_s_double : buf_s_plus_r); + + /* update values */ + if (!v_is_1) + { fpga_modular_inv_helper_cpy(buf_u, buf_u_new); + fpga_modular_inv_helper_cpy(buf_v, buf_v_new); + fpga_modular_inv_helper_cpy(buf_r, buf_r_new); + fpga_modular_inv_helper_cpy(buf_s, buf_s_new); + } + + /* update k */ + if (!v_is_1) k++; + } + + // + // Note, that to save FPGA resources, the second stage re-uses buffers + // used in the first stage. + // + + /* flags for the second stage */ + bool k_is_0, s_is_odd; + + /* second stage */ + for (i=0; i<(2*OPERAND_WIDTH); i++) + { + /* pre-calculate all possible values */ + fpga_modular_inv_helper_shr(buf_s, buf_u); + fpga_modular_inv_helper_add(buf_s, buf_q, buf_r); + fpga_modular_inv_helper_shr(buf_r, buf_v); + + // + // assign flags + // + s_is_odd = buf_s[0] & 1; + k_is_0 = (k == 0); + + // + // select new values based on flags + // + fpga_modular_inv_helper_cpy(buf_s_new, s_is_odd ? buf_v : buf_u); + + /* update s */ + if (! k_is_0) + fpga_modular_inv_helper_cpy(buf_s, buf_s_new); + + /* update k */ + if (! k_is_0) k--; + } + + /* done, copy s into output buffer */ + for (w=0; wwords[w] = buf_s[w]; +} + + +//------------------------------------------------------------------------------ +// +// Parallelized multiplication. +// +// This routine implements the algorithm in Fig. 3. from "Ultra High +// Performance ECC over NIST Primes on Commercial FPGAs". +// +// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2 +// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide +// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS +// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial +// output words si[]. +// +// The order of loading a and b into the multipliers is a bit complicated, +// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is +// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the +// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously. +// +//------------------------------------------------------------------------------ +void fpga_modular_mul_helper_multiply(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si) +//------------------------------------------------------------------------------ +{ + int w; // counter + int t, x; // more counters + int j, i; // word indices + FPGA_WORD p; // product + + // buffers for smaller words that multipliers can handle + FPGA_WORD_REDUCED ai[2*OPERAND_NUM_WORDS]; + FPGA_WORD_REDUCED bj[2*OPERAND_NUM_WORDS]; + + // split a and b into smaller words + for (w=0; wwords[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)), + bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2)); + + // accumulators + FPGA_WORD_EXTENDED mac[2*OPERAND_NUM_WORDS]; + + // clear accumulators + for (w=0; w<(2*OPERAND_NUM_WORDS); w++) mac[w] = 0; + + // run the crazy algorithm :) + for (t=0; t<(2*OPERAND_NUM_WORDS); t++) + { + // save upper half of si[] (one word per cycle) + if (t > 0) + { si[4*OPERAND_NUM_WORDS - (t+1)] = mac[t]; + mac[t] = 0; + } + + // update index + j = 2*OPERAND_NUM_WORDS - (t+1); + + // parallel multiplication + for (x=0; x<(2*OPERAND_NUM_WORDS); x++) + { + // update index + i = t - x; + if (i < 0) i += 2*OPERAND_NUM_WORDS; + + // multiply... + fpga_lowlevel_mul16(ai[i], bj[j], &p); + + // ...accumulate + mac[x] += p; + } + + } + + // now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once) + for (w=0; w<(2*OPERAND_NUM_WORDS); w++) + si[w] = mac[2*OPERAND_NUM_WORDS - (w+1)]; +} + + +//------------------------------------------------------------------------------ +// +// Accumulation of partial words into full-size product. +// +// This routine implements the Algorithm 4. from "Ultra High Performance ECC +// over NIST Primes on Commercial FPGAs". +// +// Input words si[] are accumulated into full-size product c[]. +// +// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in +// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are +// summed in full, odd words are split into two parts. During every cycle the +// upper part of the previous word and the lower part of the following word are +// summed too. +// +//------------------------------------------------------------------------------ +void fpga_modular_mul_helper_accumulate(FPGA_WORD_EXTENDED *si, FPGA_WORD *c) +//------------------------------------------------------------------------------ +{ + int w; // word counter + FPGA_WORD_EXTENDED cw0, cw1; // intermediate sums + FPGA_WORD_REDUCED cw_carry; // wide carry + + // clear carry + cw_carry = 0; + + // execute the algorithm + for (w=0; w<(2*OPERAND_NUM_WORDS); w++) + { + // handy flags + bool w_is_first = (w == 0); + bool w_is_last = (w == (2*OPERAND_NUM_WORDS-1)); + + // accumulate full current even word... + // ...and also the upper part of the previous odd word (if not the first word) + fpga_lowlevel_add48(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0); + + // generate another word from "carry" part of the previous even word... + // ...and also the lower part of the following odd word (if not the last word) + cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2)); + cw1 |= (FPGA_WORD_EXTENDED)cw_carry; + + // accumulate once again + fpga_lowlevel_add48(cw0, cw1, &cw1); + + // store current word... + c[w] = (FPGA_WORD)cw1; + + // ...and carry + cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH); + } +} + + +//------------------------------------------------------------------------------ +// +// Fast modular reduction for NIST prime P-256. +// +// p = c mod p256 +// +// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve +// Cryptography". +// +// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c +// on the other hand is the output of the parallelized Comba multiplier, so it +// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS). +// +// To save FPGA resources, the calculation is done using only two adders and +// one subtractor. The algorithm is split into five steps. +// +//------------------------------------------------------------------------------ +#if USE_CURVE == 1 +void fpga_modular_mul_helper_reduce_p256(FPGA_WORD *c, FPGA_BUFFER *p) +{ + // "funny" words + FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9; + + // compose "funny" words out of input words + s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0]; + s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0, s2.words[1] = 0, s2.words[0] = 0; + s3.words[7] = 0, s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0, s3.words[1] = 0, s3.words[0] = 0; + s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0, s4.words[4] = 0, s4.words[3] = 0, s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8]; + s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9]; + s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0, s6.words[4] = 0, s6.words[3] = 0, s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11]; + s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0, s7.words[4] = 0, s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12]; + s8.words[7] = c[12], s8.words[6] = 0, s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13]; + s9.words[7] = c[13], s9.words[6] = 0, s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0, s9.words[1] = c[15], s9.words[0] = c[14]; + + // intermediate results + FPGA_BUFFER sum0, sum1, difference; + + /* Step 1. */ + fpga_modular_add(&s2, &s2, &sum0); // sum0 = 2*s2 + fpga_modular_add(&s3, &s3, &sum1); // sum1 = 2*s3 + fpga_modular_sub(&ecdsa_zero, &s6, &difference); // difference = -s6 + + /* Step 2. */ + fpga_modular_add(&sum0, &s1, &sum0); // sum0 = s1 + 2*s2 + fpga_modular_add(&sum1, &s4, &sum1); // sum1 = s4 + 2*s3 + fpga_modular_sub(&difference, &s7, &difference); // difference = -(s6 + s7) + + /* Step 3. */ + fpga_modular_add(&sum0, &s5, &sum0); // sum0 = s1 + 2*s2 + s5 + fpga_modular_add(&sum1, &ecdsa_zero, &sum1); // compulsory cycle to keep sum1 constant for next stage + fpga_modular_sub(&difference, &s8, &difference); // difference = -(s6 + s7 + s8) + + /* Step 4. */ + fpga_modular_add(&sum0, &sum1, &sum0); // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5 +// fpga_modular_add(, , &sum1); // dummy cycle, result ignored + fpga_modular_sub(&difference, &s9, &difference); // difference = -(s6 + s7 + s8 + s9) + + /* Step 5. */ + fpga_modular_add(&sum0, &difference, p); // p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9 +// fpga_modular_add(, , &sum1); // dummy cycle, result ignored +// fpga_modular_add(, , &difference); // dummy cycle, result ignored +} +#endif + + +//------------------------------------------------------------------------------ +// +// Fast modular reduction for NIST prime P-384. +// +// p = c mod p384 +// +// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve +// Cryptography". +// +// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c +// on the other hand is the output of the parallelized Comba multiplier, so it +// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS). +// +// To save FPGA resources, the calculation is done using only two adders and +// one subtractor. The algorithm is split into five steps. +// +//------------------------------------------------------------------------------ +#if USE_CURVE == 2 +void fpga_modular_mul_helper_reduce_p384(FPGA_WORD *c, FPGA_BUFFER *p) +{ + // "funny" words + FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + // compose "funny" words + s1.words[11] = c[11], s1.words[10] = c[10], s1.words[ 9] = c[ 9], s1.words[ 8] = c[ 8], s1.words[ 7] = c[ 7], s1.words[ 6] = c[ 6], s1.words[ 5] = c[ 5], s1.words[ 4] = c[ 4], s1.words[ 3] = c[ 3], s1.words[ 2] = c[ 2], s1.words[ 1] = c[ 1], s1.words[ 0] = c[ 0]; + s2.words[11] = 0, s2.words[10] = 0, s2.words[ 9] = 0, s2.words[ 8] = 0, s2.words[ 7] = 0, s2.words[ 6] = c[23], s2.words[ 5] = c[22], s2.words[ 4] = c[21], s2.words[ 3] = 0, s2.words[ 2] = 0, s2.words[ 1] = 0, s2.words[ 0] = 0; + s3.words[11] = c[23], s3.words[10] = c[22], s3.words[ 9] = c[21], s3.words[ 8] = c[20], s3.words[ 7] = c[19], s3.words[ 6] = c[18], s3.words[ 5] = c[17], s3.words[ 4] = c[16], s3.words[ 3] = c[15], s3.words[ 2] = c[14], s3.words[ 1] = c[13], s3.words[ 0] = c[12]; + s4.words[11] = c[20], s4.words[10] = c[19], s4.words[ 9] = c[18], s4.words[ 8] = c[17], s4.words[ 7] = c[16], s4.words[ 6] = c[15], s4.words[ 5] = c[14], s4.words[ 4] = c[13], s4.words[ 3] = c[12], s4.words[ 2] = c[23], s4.words[ 1] = c[22], s4.words[ 0] = c[21]; + s5.words[11] = c[19], s5.words[10] = c[18], s5.words[ 9] = c[17], s5.words[ 8] = c[16], s5.words[ 7] = c[15], s5.words[ 6] = c[14], s5.words[ 5] = c[13], s5.words[ 4] = c[12], s5.words[ 3] = c[20], s5.words[ 2] = 0, s5.words[ 1] = c[23], s5.words[ 0] = 0; + s6.words[11] = 0, s6.words[10] = 0, s6.words[ 9] = 0, s6.words[ 8] = 0, s6.words[ 7] = c[23], s6.words[ 6] = c[22], s6.words[ 5] = c[21], s6.words[ 4] = c[20], s6.words[ 3] = 0, s6.words[ 2] = 0, s6.words[ 1] = 0, s6.words[ 0] = 0; + s7.words[11] = 0, s7.words[10] = 0, s7.words[ 9] = 0, s7.words[ 8] = 0, s7.words[ 7] = 0, s7.words[ 6] = 0, s7.words[ 5] = c[23], s7.words[ 4] = c[22], s7.words[ 3] = c[21], s7.words[ 2] = 0, s7.words[ 1] = 0, s7.words[ 0] = c[20]; + s8.words[11] = c[22], s8.words[10] = c[21], s8.words[ 9] = c[20], s8.words[ 8] = c[19], s8.words[ 7] = c[18], s8.words[ 6] = c[17], s8.words[ 5] = c[16], s8.words[ 4] = c[15], s8.words[ 3] = c[14], s8.words[ 2] = c[13], s8.words[ 1] = c[12], s8.words[ 0] = c[23]; + s9.words[11] = 0, s9.words[10] = 0, s9.words[ 9] = 0, s9.words[ 8] = 0, s9.words[ 7] = 0, s9.words[ 6] = 0, s9.words[ 5] = 0, s9.words[ 4] = c[23], s9.words[ 3] = c[22], s9.words[ 2] = c[21], s9.words[ 1] = c[20], s9.words[ 0] = 0; + s10.words[11] = 0, s10.words[10] = 0, s10.words[ 9] = 0, s10.words[ 8] = 0, s10.words[ 7] = 0, s10.words[ 6] = 0, s10.words[ 5] = 0, s10.words[ 4] = c[23], s10.words[ 3] = c[23], s10.words[ 2] = 0, s10.words[ 1] = 0, s10.words[ 0] = 0; + + // intermediate results + FPGA_BUFFER sum0, sum1, difference; + + /* Step 1. */ + fpga_modular_add(&s1, &s3, &sum0); // sum0 = s1 + s3 + fpga_modular_add(&s2, &s2, &sum1); // sum1 = 2*s2 + fpga_modular_sub(&ecdsa_zero, &s8, &difference); // difference = -s8 + + /* Step 2. */ + fpga_modular_add(&sum0, &s4, &sum0); // sum0 = s1 + s3 + s4 + fpga_modular_add(&sum1, &s5, &sum1); // sum1 = 2*s2 + s5 + fpga_modular_sub(&difference, &s9, &difference); // difference = -(s8 + s9) + + /* Step 3. */ + fpga_modular_add(&sum0, &s6, &sum0); // sum0 = s1 + s3 + s4 + s6 + fpga_modular_add(&sum1, &s7, &sum1); // sum1 = 2*s2 + s5 + s7 + fpga_modular_sub(&difference, &s10, &difference); // difference = -(s8 + s9 + s10) + + /* Step 4. */ + fpga_modular_add(&sum0, &sum1, &sum0); // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5 +// fpga_modular_add(, , &sum1); // dummy cycle, result ignored + fpga_modular_sub(&difference, &ecdsa_zero, &difference); // compulsory cycle to keep difference constant for next stage + + /* Step 5. */ + fpga_modular_add(&sum0, &difference, p); // p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10 +// fpga_modular_add(, , &sum1); // dummy cycle, result ignored +// fpga_modular_add(, , &difference); // dummy cycle, result ignored +} +#endif + + +//------------------------------------------------------------------------------ +// +// Multi-word shift to the left by 1 bit. +// +// y = x << 1 +// +//------------------------------------------------------------------------------ +void fpga_modular_inv_helper_shl(FPGA_WORD *x, FPGA_WORD *y) +//------------------------------------------------------------------------------ +{ + int w; // word counter + FPGA_WORD carry_in, carry_out; // carries + + carry_in = 0; // first word has no carry + + // shift word-by-word + for (w=0; w<=OPERAND_NUM_WORDS; w++) + carry_out = x[w] >> (FPGA_WORD_WIDTH - 1), // store next carry + y[w] = x[w] << 1, // shift + y[w] |= carry_in, // process carry + carry_in = carry_out; // propagate carry +} + + +//------------------------------------------------------------------------------ +// +// Multi-word shift to the right by 1 bit. +// +// y = x >> 1 +// +//------------------------------------------------------------------------------ +void fpga_modular_inv_helper_shr(FPGA_WORD *x, FPGA_WORD *y) +//------------------------------------------------------------------------------ +{ + int w; // word counter + FPGA_WORD carry_in, carry_out; // carries + + carry_in = 0; // first word has no carry + + // shift word-by-word + for (w=OPERAND_NUM_WORDS; w>=0; w--) + carry_out = x[w], // store next carry + y[w] = x[w] >> 1, // shift + y[w] |= carry_in << (FPGA_WORD_WIDTH - 1), // process carry + carry_in = carry_out; // propagate carry +} + + +//------------------------------------------------------------------------------ +// +// Multi-word addition. +// +// s = x + y +// +//------------------------------------------------------------------------------ +void fpga_modular_inv_helper_add(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s) +//------------------------------------------------------------------------------ +{ + int w; // word counter + bool carry_in, carry_out; // carries + + // lowest word has no carry + carry_in = false; + + // sum a and b word-by-word + for (w=0; w<=OPERAND_NUM_WORDS; w++) + { + // low-level addition + fpga_lowlevel_add32(x[w], y[w], carry_in, &s[w], &carry_out); + + // propagate carry bit + carry_in = carry_out; + } +} + + +//------------------------------------------------------------------------------ +// +// Multi-word subtraction . +// +// d = x - y +// +//------------------------------------------------------------------------------ +void fpga_modular_inv_helper_sub(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d) +//------------------------------------------------------------------------------ +{ + int w; // word counter + bool borrow_in, borrow_out; // borrows + + // lowest word has no borrow + borrow_in = false; + + // subtract b from a word-by-word + for (w=0; w<=OPERAND_NUM_WORDS; w++) + { + // low-level subtraction + fpga_lowlevel_sub32(x[w], y[w], borrow_in, &d[w], &borrow_out); + + // propagate borrow bit + borrow_in = borrow_out; + } + +} + + +//------------------------------------------------------------------------------ +// +// Multi-word copy. +// +// dst = src +// +//------------------------------------------------------------------------------ +void fpga_modular_inv_helper_cpy(FPGA_WORD *dst, FPGA_WORD *src) +//------------------------------------------------------------------------------ +{ + int w; // word counter + + // copy all the words from src into dst + for (w=0; w<=OPERAND_NUM_WORDS; w++) + dst[w] = src[w]; +} + + +//------------------------------------------------------------------------------ +// +// Multi-word comparison. +// +// The return value is -1 when ab. +// +//------------------------------------------------------------------------------ +void fpga_modular_inv_helper_cmp(FPGA_WORD *a, FPGA_WORD *b, int *c) +//------------------------------------------------------------------------------ +{ + int w; // word counter + int r, r0, ra, rb; // result + bool borrow; // borrow + FPGA_WORD d; // partial difference + + // result is unknown so far + r = 0; + + // clear borrow for the very first word + borrow = false; + + // compare a and b word-by-word + for (w=OPERAND_NUM_WORDS; w>=0; w--) + { + // save result + r0 = r; + + // subtract current words + fpga_lowlevel_sub32(a[w], b[w], false, &d, &borrow); + + // analyze flags + rb = borrow ? -1 : 0; // a[w] < b[w] + ra = (!borrow && (d != 0)) ? 1 : 0; // a[w] > b[w] + + // + // Note, that ra is either 0 or 1, rb is either 0 or -1 and they + // can never be non-zero at the same time. + // + // Note, that r can only be updated if comparison result has not + // been resolved yet. Even if we already know comparison result, + // we continue doing dummy subtractions to keep run-time constant. + // + + // update result + r = (r == 0) ? ra + rb : r0; + } + + // done + *c = r; +} + + +//------------------------------------------------------------------------------ +// End-of-File +//------------------------------------------------------------------------------ -- cgit v1.2.3