aboutsummaryrefslogtreecommitdiff
path: root/modexp_fpga_model_montgomery.cpp
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-06-13 20:11:58 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2017-06-13 20:11:58 +0300
commit53e92c5355aca120eab8d59e6904282c9e3b4ab1 (patch)
tree62c6b65e52c01bbbfdac063ec8ba791a9d073203 /modexp_fpga_model_montgomery.cpp
Initial commit of faster modular exponentiation model based on systolic architecture.
Diffstat (limited to 'modexp_fpga_model_montgomery.cpp')
-rw-r--r--modexp_fpga_model_montgomery.cpp417
1 files changed, 417 insertions, 0 deletions
diff --git a/modexp_fpga_model_montgomery.cpp b/modexp_fpga_model_montgomery.cpp
new file mode 100644
index 0000000..d1cca60
--- /dev/null
+++ b/modexp_fpga_model_montgomery.cpp
@@ -0,0 +1,417 @@
+//
+// modexp_fpga_model_montgomery.cpp
+// -------------------------------------------------------------
+// Montgomery modular multiplication and exponentiation routines
+//
+// Authors: Pavel Shatov
+// Copyright (c) 2017, NORDUnet A/S
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+
+//----------------------------------------------------------------
+// Headers
+//----------------------------------------------------------------
+#include "modexp_fpga_model.h"
+#include "modexp_fpga_model_pe.h"
+#include "modexp_fpga_model_montgomery.h"
+
+
+//----------------------------------------------------------------
+// Montgomery modular multiplier
+//----------------------------------------------------------------
+void montgomery_multiply(const FPGA_WORD *A, const FPGA_WORD *B, const FPGA_WORD *N, const FPGA_WORD *N_COEFF, FPGA_WORD *R, size_t len)
+//----------------------------------------------------------------
+//
+// R = A * B * 2^-len mod N
+//
+// The high-level algorithm is:
+//
+// 1. AB = A * B
+// 2. Q = AB * N_COEFF
+// 3. QN = Q * N
+// 4. S = AB + QN
+// 5. SN = S - N
+// 6. R = (SN < 0) ? S : SN
+// 7. R = R >> len
+//
+//----------------------------------------------------------------
+{
+ size_t i, j, k; // counters
+
+ bool select_s; // flag
+
+ FPGA_WORD t_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // accumulators
+ FPGA_WORD t_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+ FPGA_WORD t_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+
+ FPGA_WORD s_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // intermediate products
+ FPGA_WORD s_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+ FPGA_WORD s_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+
+ FPGA_WORD c_in_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // input carries
+ FPGA_WORD c_in_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+ FPGA_WORD c_in_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+ FPGA_WORD c_out_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; // output carries
+ FPGA_WORD c_out_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+ FPGA_WORD c_out_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS]; //
+
+ FPGA_WORD c_in_s; // 1-bit carry and borrow
+ FPGA_WORD b_in_sn; //
+ FPGA_WORD c_out_s; //
+ FPGA_WORD b_out_sn; //
+
+ FPGA_WORD AB[2 * MAX_OPERAND_WORDS]; // final products
+ FPGA_WORD Q [2 * MAX_OPERAND_WORDS]; //
+ FPGA_WORD QN[2 * MAX_OPERAND_WORDS]; //
+
+ FPGA_WORD S [2 * MAX_OPERAND_WORDS]; // final sum
+ FPGA_WORD SN[2 * MAX_OPERAND_WORDS]; // final difference
+
+ // number of systolic cycles needed to multiply entire B by one word of A
+ size_t num_systolic_cycles = len / SYSTOLIC_NUM_WORDS;
+
+ // initialize arrays of accumulators and carries to zeroes
+ for (i=0; i<num_systolic_cycles; i++)
+ for (j=0; j<SYSTOLIC_NUM_WORDS; j++)
+ c_in_ab[i][j] = 0, c_in_q [i][j] = 0, c_in_qn[i][j] = 0,
+ t_ab[i][j] = 0, t_q [i][j] = 0, t_qn[i][j] = 0;
+
+ // initialize 1-bit carry and borrow to zeroes too
+ c_in_s = 0, b_in_sn = 0;
+
+ // simultaneously calculate AB, Q, QN, S, SN
+ for (i = 0; i < (2 * len); i++)
+ {
+ // multiply entire B by current word of A to get AB
+ // multiply entire N_COEFF by current word of AB to get Q
+ // multiply entire N by current word of Q to get QN
+ for (k = 0; k < num_systolic_cycles; k++)
+ {
+ // simulate how a systolic array would work
+ for (j = 0; j < SYSTOLIC_NUM_WORDS; j++)
+ {
+ // current words of B, N_COEFF, N
+ FPGA_WORD Bj = B [k * SYSTOLIC_NUM_WORDS + j];
+ FPGA_WORD N_COEFFj = N_COEFF[k * SYSTOLIC_NUM_WORDS + j];
+ FPGA_WORD Nj = N [k * SYSTOLIC_NUM_WORDS + j];
+
+ // current word of A
+ FPGA_WORD Aj_ab = (i < len) ? A[i] : 0;
+
+ // AB = A * B
+ pe_mul(Aj_ab, Bj, t_ab[k][j], c_in_ab[k][j], &s_ab[k][j], &c_out_ab[k][j]);
+
+ // store current word of AB
+ if ((k == 0) && (j == 0)) AB[i] = s_ab[0][0];
+
+ // current word of AB
+ FPGA_WORD Aj_q = (i < len) ? AB[i] : 0;
+
+ // Q = AB * N
+ pe_mul(Aj_q, N_COEFFj, t_q[k][j], c_in_q[k][j], &s_q[k][j], &c_out_q[k][j]);
+
+ // store current word of Q
+ if ((k == 0) && (j == 0)) Q[i] = s_q[0][0];
+
+ // current word of Q
+ FPGA_WORD Aj_qn = (i < len) ? Q[i] : 0;
+
+ // QN = Q * N
+ pe_mul(Aj_qn, Nj, t_qn[k][j], c_in_qn[k][j], &s_qn[k][j], &c_out_qn[k][j]);
+
+ // store next word of QN
+ if ((k == 0) && (j == 0)) QN[i] = s_qn[0][0];
+ }
+
+ // propagate carries
+ for (j=0; j<SYSTOLIC_NUM_WORDS; j++)
+ c_in_ab[k][j] = c_out_ab[k][j],
+ c_in_q [k][j] = c_out_q [k][j],
+ c_in_qn[k][j] = c_out_qn[k][j];
+
+ // update accumulators
+ for (j=1; j<SYSTOLIC_NUM_WORDS; j++)
+ {
+ t_ab[k][j-1] = s_ab[k][j];
+ t_q [k][j-1] = s_q [k][j];
+ t_qn[k][j-1] = s_qn[k][j];
+ }
+
+ // update accumulators
+ if (k > 0)
+ t_ab[k-1][SYSTOLIC_NUM_WORDS-1] = s_ab[k][0],
+ t_q [k-1][SYSTOLIC_NUM_WORDS-1] = s_q [k][0],
+ t_qn[k-1][SYSTOLIC_NUM_WORDS-1] = s_qn[k][0];
+
+ }
+
+ // now it's time to simultaneously add and subtract
+
+ // current operand words
+ FPGA_WORD QNi = QN[i];
+ FPGA_WORD Ni = (i < len) ? 0 : N[i-len];
+
+ // add, subtract
+ pe_add(AB[i], QNi, c_in_s, &S[i], &c_out_s);
+ pe_sub(S [i], Ni, b_in_sn, &SN[i], &b_out_sn);
+
+ // propagate carry and borrow
+ c_in_s = c_out_s;
+ b_in_sn = b_out_sn;
+ }
+
+ // flag select the right result
+ select_s = b_out_sn && !c_out_s;
+
+ // copy product into output buffer
+ for (i=0; i<len; i++)
+ R[i] = select_s ? S[i+len] : SN[i+len];
+}
+
+
+//----------------------------------------------------------------
+// Classic binary exponentiation
+//----------------------------------------------------------------
+void montgomery_exponentiate(const FPGA_WORD *A, const FPGA_WORD *B, const FPGA_WORD *N, const FPGA_WORD *N_COEFF, FPGA_WORD *R, size_t len)
+//----------------------------------------------------------------
+//
+// R = A ** B mod N
+//
+//----------------------------------------------------------------
+{
+ size_t word_cnt, bit_cnt; // counters
+ size_t word_index, bit_index; // indices
+
+ bool flag_update_r; // flag
+
+ FPGA_WORD P[MAX_OPERAND_WORDS]; // power of A
+ FPGA_WORD mask; // mask
+
+ // R = 1, P = 1
+ for (word_cnt=0; word_cnt<len; word_cnt++)
+ R[word_cnt] = (word_cnt > 0) ? 0 : 1,
+ P[word_cnt] = A[word_cnt];
+
+ FPGA_WORD M_PP[MAX_OPERAND_WORDS]; // intermediate buffer for next power
+ FPGA_WORD M_RP[MAX_OPERAND_WORDS]; // intermediate buffer for next result
+
+ // scan all bits of the exponent
+ for (bit_cnt=0; bit_cnt<(len * CHAR_BIT * sizeof(FPGA_WORD)); bit_cnt++)
+ {
+ montgomery_multiply(P, P, N, N_COEFF, M_PP, len); // M_PP = P * P
+ montgomery_multiply(R, P, N, N_COEFF, M_RP, len); // M_RP = R * P
+
+ word_index = bit_cnt / (CHAR_BIT * sizeof(FPGA_WORD));
+ bit_index = bit_cnt & ((CHAR_BIT * sizeof(FPGA_WORD)) - 1);
+
+ mask = 1 << bit_index; // current bit of exponent
+
+ // whether we need to update R (non-zero exponent bit)
+ flag_update_r = (B[word_index] & mask) == mask;
+
+ // always update P
+ for (word_cnt=0; word_cnt<len; word_cnt++)
+ P[word_cnt] = M_PP[word_cnt];
+
+ // only update R when necessary
+ if (flag_update_r)
+ {
+ for (word_cnt=0; word_cnt<len; word_cnt++)
+ R[word_cnt] = M_RP[word_cnt];
+ }
+ }
+}
+
+
+//----------------------------------------------------------------
+// Montgomery factor calculation
+//----------------------------------------------------------------
+void montgomery_calc_factor(const FPGA_WORD *N, FPGA_WORD *FACTOR, size_t len)
+//----------------------------------------------------------------
+//
+// FACTOR = 2 ** (2*len) mod N
+//
+// This routine calculates the factor that is necessary to bring
+// numbers into Montgomery domain. The high-level algorithm is:
+//
+// 1. f = 1
+// 2. for i=0 to 2*len-1
+// 3. f1 = f << 1
+// 4. f2 = f1 - n
+// 5. f = (f2 < 0) ? f1 : f2
+//
+//----------------------------------------------------------------
+{
+ size_t i, j; // counters
+
+ bool flag_keep_f; // flag
+
+ // temporary buffer
+ FPGA_WORD FACTOR_N[MAX_OPERAND_WORDS];
+
+ // carry and borrow
+ FPGA_WORD carry_in, carry_out;
+ FPGA_WORD borrow_in, borrow_out;
+
+ // FACTOR = 1
+ for (i=0; i<len; i++)
+ FACTOR[i] = (i > 0) ? 0 : 1;
+
+ // do the math
+ for (i=0; i<(2 * len * CHAR_BIT * sizeof(FPGA_WORD)); i++)
+ {
+ // clear carry and borrow
+ carry_in = 0, borrow_in = 0;
+
+ // calculate f1 = f << 1, f2 = f1 - n
+ for (j=0; j<len; j++)
+ {
+ carry_out = FACTOR[j] >> (sizeof(FPGA_WORD) * CHAR_BIT - 1); // | M <<= 1
+ FACTOR[j] <<= 1, FACTOR[j] |= carry_in; // |
+
+ pe_sub(FACTOR[j], N[j], borrow_in, &FACTOR_N[j], &borrow_out); // MN = M - N
+
+ carry_in = carry_out, borrow_in = borrow_out; // propagate carry & borrow
+ }
+
+ // obtain flag
+ flag_keep_f = (borrow_out && !carry_out);
+
+ // now select the right value
+ for (j=0; j<len; j++)
+ FACTOR[j] = flag_keep_f ? FACTOR[j] : FACTOR_N[j];
+ }
+}
+
+
+//----------------------------------------------------------------
+// Montgomery modulus-dependent coefficient calculation
+//----------------------------------------------------------------
+void montgomery_calc_n_coeff(const FPGA_WORD *N, FPGA_WORD *N_COEFF, size_t len)
+//----------------------------------------------------------------
+//
+// N_COEFF = -N ** -1 mod 2 ** len
+//
+// This routine calculates the coefficient that is used during the reduction
+// phase of Montgomery multiplication to zero out the lower half of product.
+//
+// The high-level algorithm is:
+//
+// 1. R = 1
+// 2. NN = ~N + 1
+// 3. for i=1 to len-1
+// 4. T = R * NN mod 2 ** len
+// 5. if T[i] then
+// 6. R = R + (1 << i)
+//
+//----------------------------------------------------------------
+{
+ size_t i, j, k; // counters
+
+ FPGA_WORD NN[MAX_OPERAND_WORDS]; // temporary buffers
+ FPGA_WORD T [MAX_OPERAND_WORDS]; //
+ FPGA_WORD R [MAX_OPERAND_WORDS]; //
+ FPGA_WORD R1[MAX_OPERAND_WORDS]; //
+
+ bool flag_update_r; // flag
+
+ FPGA_WORD nw, pwr; //
+ FPGA_WORD sum_c_in, sum_c_out; //
+ FPGA_WORD carry_in, carry_out; //
+ FPGA_WORD mul_s, mul_c_in, mul_c_out; //
+
+ // NN = -N mod 2 ** len = ~N + 1 mod 2 ** len
+ carry_in = 0;
+ for (i=0; i<len; i++)
+ { nw = (i > 0) ? 0 : 1; // nw = 1
+ pe_add(~N[i], nw, carry_in, &NN[i], &carry_out); // NN = ~N + nw
+ carry_in = carry_out; // propagate carry
+ }
+
+ // R = 1
+ for (i=0; i<len; i++)
+ R[i] = (i > 0) ? 0 : 1;
+
+ // calculate T = R * NN
+ for (k=1; k<(len * sizeof(FPGA_WORD) * CHAR_BIT); k++)
+ {
+ // T = 0
+ for (i=0; i<len; i++) T[i] = 0;
+
+ // T = NN * R
+ for (i=0; i<len; i++)
+ {
+ mul_c_in = 0;
+
+ for (j=0; j<(len-i); j++)
+ {
+
+ pe_mul(R[j], NN[i], T[i+j], mul_c_in, &mul_s, &mul_c_out);
+
+ T[i+j] = mul_s;
+
+ mul_c_in = mul_c_out;
+ }
+ }
+
+ // get word and index indices
+ size_t word_index = k / (CHAR_BIT * sizeof(FPGA_WORD));
+ size_t bit_index = k & ((CHAR_BIT * sizeof(FPGA_WORD)) - 1);
+
+ // update bit mask
+ FPGA_WORD bit_mask = (1 << bit_index);
+
+ sum_c_in = 0; // clear carry
+ flag_update_r = false; // reset flag
+
+ // calculate R1 = R + 1 << (2 * len)
+ for (i=0; i<len; i++)
+ {
+ if (i == word_index) flag_update_r = (T[i] & bit_mask) == bit_mask;
+
+ pwr = (i == word_index) ? bit_mask : 0;
+ pe_add(R[i], pwr, sum_c_in, &R1[i], &sum_c_out);
+ carry_in = carry_out;
+ }
+
+ // update r
+ for (i=0; i<len; i++)
+ R[i] = flag_update_r ? R1[i] : R[i];
+ }
+
+ // store output
+ for (i=0; i<len; i++)
+ N_COEFF[i] = R[i];
+}
+
+
+//----------------------------------------------------------------
+// End of file
+//----------------------------------------------------------------