// // modexp_fpga_model_montgomery.cpp // ------------------------------------------------------------- // Montgomery modular multiplication and exponentiation routines // // Authors: Pavel Shatov // Copyright (c) 2017, NORDUnet A/S // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // - Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // - Neither the name of the NORDUnet nor the names of its contributors may // be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // //---------------------------------------------------------------- // Headers //---------------------------------------------------------------- #include "modexp_fpga_model.h" #include "modexp_fpga_model_pe.h" #include "modexp_fpga_model_montgomery.h" //---------------------------------------------------------------- // Montgomery modular multiplier //---------------------------------------------------------------- void montgomery_multiply(const FPGA_WORD *A, const FPGA_WORD *B, const FPGA_WORD *N, const FPGA_WORD *N_COEFF, FPGA_WORD *R, size_t len) //---------------------------------------------------------------- // // R = A * B * 2^-len mod N // // The high-level algorithm is: // // 1. AB = A * B // 2. Q = AB * N_COEFF // 3. QN = Q * N // 4. S = AB + QN // 5. SN = S - N // 6. R = (SN < 0) ? S : SN // 7. R = R >> len // //---------------------------------------------------------------- { size_t i, j, k; // counters bool select_s; // flag FPGA_WORD t_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // accumulators FPGA_WORD t_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD t_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD s_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // intermediate products FPGA_WORD s_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD s_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD c_in_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // input carries FPGA_WORD c_in_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD c_in_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD c_out_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // output carries FPGA_WORD c_out_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD c_out_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // FPGA_WORD c_in_s; // 1-bit carry and borrow FPGA_WORD b_in_sn; // FPGA_WORD c_out_s; // FPGA_WORD b_out_sn; // FPGA_WORD AB[2 * MAX_OPERAND_WORDS]; // final products FPGA_WORD Q [2 * MAX_OPERAND_WORDS]; // FPGA_WORD QN[2 * MAX_OPERAND_WORDS]; // FPGA_WORD S [2 * MAX_OPERAND_WORDS]; // final sum FPGA_WORD SN[2 * MAX_OPERAND_WORDS]; // final difference // number of systolic cycles needed to multiply entire B by one word of A size_t num_systolic_cycles = len / SYSTOLIC_NUM_WORDS; // initialize arrays of accumulators and carries to zeroes for (i=0; i 0) t_ab[k-1][SYSTOLIC_NUM_WORDS-1] = s_ab[k][0], t_q [k-1][SYSTOLIC_NUM_WORDS-1] = s_q [k][0], t_qn[k-1][SYSTOLIC_NUM_WORDS-1] = s_qn[k][0]; } // now it's time to simultaneously add and subtract // current operand words FPGA_WORD QNi = QN[i]; FPGA_WORD Ni = (i < len) ? 0 : N[i-len]; // add, subtract pe_add(AB[i], QNi, c_in_s, &S[i], &c_out_s); pe_sub(S [i], Ni, b_in_sn, &SN[i], &b_out_sn); // propagate carry and borrow c_in_s = c_out_s; b_in_sn = b_out_sn; } // flag select the right result select_s = b_out_sn && !c_out_s; // copy product into output buffer for (i=0; i 0) ? 0 : 1, P[word_cnt] = A[word_cnt]; FPGA_WORD M_PP[MAX_OPERAND_WORDS]; // intermediate buffer for next power FPGA_WORD M_RP[MAX_OPERAND_WORDS]; // intermediate buffer for next result // scan all bits of the exponent for (bit_cnt=0; bit_cnt<(len * CHAR_BIT * sizeof(FPGA_WORD)); bit_cnt++) { montgomery_multiply(P, P, N, N_COEFF, M_PP, len); // M_PP = P * P montgomery_multiply(R, P, N, N_COEFF, M_RP, len); // M_RP = R * P word_index = bit_cnt / (CHAR_BIT * sizeof(FPGA_WORD)); bit_index = bit_cnt & ((CHAR_BIT * sizeof(FPGA_WORD)) - 1); mask = 1 << bit_index; // current bit of exponent // whether we need to update R (non-zero exponent bit) flag_update_r = (B[word_index] & mask) == mask; // always update P for (word_cnt=0; word_cnt 0) ? 0 : 1; // do the math for (i=0; i<(2 * len * CHAR_BIT * sizeof(FPGA_WORD)); i++) { // clear carry and borrow carry_in = 0, borrow_in = 0; // calculate f1 = f << 1, f2 = f1 - n for (j=0; j> (sizeof(FPGA_WORD) * CHAR_BIT - 1); // | M <<= 1 FACTOR[j] <<= 1, FACTOR[j] |= carry_in; // | pe_sub(FACTOR[j], N[j], borrow_in, &FACTOR_N[j], &borrow_out); // MN = M - N carry_in = carry_out, borrow_in = borrow_out; // propagate carry & borrow } // obtain flag flag_keep_f = (borrow_out && !carry_out); // now select the right value for (j=0; j 0) ? 0 : 1; // nw = 1 pe_add(~N[i], nw, carry_in, &NN[i], &carry_out); // NN = ~N + nw carry_in = carry_out; // propagate carry } // R = 1 for (i=0; i 0) ? 0 : 1; // calculate T = R * NN for (k=1; k<(len * sizeof(FPGA_WORD) * CHAR_BIT); k++) { // T = 0 for (i=0; i