1 files changed, 417 insertions, 0 deletions
diff --git a/modexp_fpga_model_montgomery.cpp b/modexp_fpga_model_montgomery.cpp
new file mode 100644
index 0000000..d1cca60
--- /dev/null
+++ b/modexp_fpga_model_montgomery.cpp
@@ -0,0 +1,417 @@
+//
+// modexp_fpga_model_montgomery.cpp
+// -------------------------------------------------------------
+// Montgomery modular multiplication and exponentiation routines
+//
+// Authors: Pavel Shatov
+// Copyright (c) 2017, NORDUnet A/S
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+//   notice, this list of conditions and the following disclaimer in the
+//   documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+//   be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+
+//----------------------------------------------------------------
+// Headers
+//----------------------------------------------------------------
+#include "modexp_fpga_model.h"
+#include "modexp_fpga_model_pe.h"
+#include "modexp_fpga_model_montgomery.h"
+
+
+//----------------------------------------------------------------
+// Montgomery modular multiplier
+//----------------------------------------------------------------
+void montgomery_multiply(const FPGA_WORD *A, const FPGA_WORD *B, const FPGA_WORD *N, const FPGA_WORD *N_COEFF, FPGA_WORD *R, size_t len)
+//----------------------------------------------------------------
+//
+// R = A * B * 2^-len mod N
+//
+// The high-level algorithm is:
+//
+// 1. AB =  A * B
+// 2. Q  = AB * N_COEFF
+// 3. QN =  Q * N
+// 4. S  = AB + QN
+// 5. SN =  S - N
+// 6. R  = (SN < 0) ? S : SN
+// 7. R  = R >> len
+//
+//----------------------------------------------------------------
+{
+	size_t i, j, k;													// counters
+
+	bool select_s;													// flag
+
+	FPGA_WORD t_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		// accumulators
+	FPGA_WORD t_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		//
+	FPGA_WORD t_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		//
+
+	FPGA_WORD s_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		// intermediate products
+	FPGA_WORD s_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		//
+	FPGA_WORD s_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		//
+
+	FPGA_WORD c_in_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		// input carries
+	FPGA_WORD c_in_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		//
+	FPGA_WORD c_in_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];		//
+	FPGA_WORD c_out_ab[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];	// output carries
+	FPGA_WORD c_out_q [MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];	//
+	FPGA_WORD c_out_qn[MAX_SYSTOLIC_CYCLES][SYSTOLIC_NUM_WORDS];	//
+
+	FPGA_WORD c_in_s;												// 1-bit carry and borrow
+	FPGA_WORD b_in_sn;												//
+	FPGA_WORD c_out_s;												//
+	FPGA_WORD b_out_sn;												//
+
+	FPGA_WORD AB[2 * MAX_OPERAND_WORDS];							// final products
+	FPGA_WORD Q [2 * MAX_OPERAND_WORDS];							//
+	FPGA_WORD QN[2 * MAX_OPERAND_WORDS];							//
+
+	FPGA_WORD S [2 * MAX_OPERAND_WORDS];							// final sum
+	FPGA_WORD SN[2 * MAX_OPERAND_WORDS];							// final difference
+
+		// number of systolic cycles needed to multiply entire B by one word of A
+	size_t num_systolic_cycles = len / SYSTOLIC_NUM_WORDS;
+
+		// initialize arrays of accumulators and carries to zeroes
+	for (i=0; i<num_systolic_cycles; i++)
+		for (j=0; j<SYSTOLIC_NUM_WORDS; j++)
+			c_in_ab[i][j] = 0, c_in_q [i][j] = 0, c_in_qn[i][j] = 0,
+			t_ab[i][j]    = 0, t_q [i][j]    = 0, t_qn[i][j]    = 0;
+
+		// initialize 1-bit carry and borrow to zeroes too
+	c_in_s = 0, b_in_sn = 0;
+
+		// simultaneously calculate AB, Q, QN, S, SN
+	for (i = 0; i < (2 * len); i++)
+	{
+			// multiply entire B by current word of A to get AB
+			// multiply entire N_COEFF by current word of AB to get Q
+			// multiply entire N by current word of Q to get QN
+		for (k = 0; k < num_systolic_cycles; k++)
+		{
+				// simulate how a systolic array would work
+			for (j = 0; j < SYSTOLIC_NUM_WORDS; j++)
+			{
+					// current words of B, N_COEFF, N
+				FPGA_WORD Bj       = B      [k * SYSTOLIC_NUM_WORDS + j];
+				FPGA_WORD N_COEFFj = N_COEFF[k * SYSTOLIC_NUM_WORDS + j];
+				FPGA_WORD Nj       = N      [k * SYSTOLIC_NUM_WORDS + j];
+
+					// current word of A
+				FPGA_WORD Aj_ab = (i < len) ? A[i] : 0;
+
+					// AB = A * B		
+				pe_mul(Aj_ab, Bj, t_ab[k][j], c_in_ab[k][j], &s_ab[k][j], &c_out_ab[k][j]);
+
+					// store current word of AB
+				if ((k == 0) && (j == 0)) AB[i] = s_ab[0][0];
+
+					// current word of AB
+				FPGA_WORD Aj_q = (i < len) ? AB[i] : 0;
+
+					// Q = AB * N		
+				pe_mul(Aj_q, N_COEFFj, t_q[k][j], c_in_q[k][j], &s_q[k][j], &c_out_q[k][j]);
+
+					// store current word of Q
+				if ((k == 0) && (j == 0)) Q[i] = s_q[0][0];
+
+					// current word of Q
+				FPGA_WORD Aj_qn = (i < len) ? Q[i] : 0;
+
+					// QN = Q * N
+				pe_mul(Aj_qn, Nj, t_qn[k][j], c_in_qn[k][j], &s_qn[k][j], &c_out_qn[k][j]);
+
+					// store next word of QN
+				if ((k == 0) && (j == 0)) QN[i] = s_qn[0][0];
+			}
+
+				// propagate carries
+			for (j=0; j<SYSTOLIC_NUM_WORDS; j++)
+				c_in_ab[k][j] = c_out_ab[k][j],
+				c_in_q [k][j] = c_out_q [k][j],
+				c_in_qn[k][j] = c_out_qn[k][j];
+
+				// update accumulators
+			for (j=1; j<SYSTOLIC_NUM_WORDS; j++)
+			{
+				t_ab[k][j-1] = s_ab[k][j];
+				t_q [k][j-1] = s_q [k][j];
+				t_qn[k][j-1] = s_qn[k][j];
+			}
+			
+				// update accumulators
+			if (k > 0)
+				t_ab[k-1][SYSTOLIC_NUM_WORDS-1] = s_ab[k][0],
+				t_q [k-1][SYSTOLIC_NUM_WORDS-1] = s_q [k][0],
+				t_qn[k-1][SYSTOLIC_NUM_WORDS-1] = s_qn[k][0];
+
+		}
+	
+			// now it's time to simultaneously add and subtract
+
+			// current operand words
+		FPGA_WORD QNi = QN[i];
+		FPGA_WORD Ni  = (i < len) ? 0 : N[i-len];
+
+			// add, subtract
+		pe_add(AB[i], QNi, c_in_s,  &S[i],  &c_out_s);
+		pe_sub(S [i], Ni,  b_in_sn, &SN[i], &b_out_sn);
+
+			// propagate carry and borrow
+		c_in_s  = c_out_s;
+		b_in_sn = b_out_sn;
+	}
+
+		// flag select the right result
+	select_s = b_out_sn && !c_out_s;
+
+		// copy product into output buffer
+	for (i=0; i<len; i++)
+		R[i] = select_s ? S[i+len] : SN[i+len];
+}
+
+
+//----------------------------------------------------------------
+// Classic binary exponentiation
+//----------------------------------------------------------------
+void montgomery_exponentiate(const FPGA_WORD *A, const FPGA_WORD *B, const FPGA_WORD *N, const FPGA_WORD *N_COEFF, FPGA_WORD *R, size_t len)
+//----------------------------------------------------------------
+//
+// R = A ** B mod N
+//
+//----------------------------------------------------------------
+{
+	size_t word_cnt,   bit_cnt;			// counters
+	size_t word_index, bit_index;		// indices
+
+	bool flag_update_r;					// flag
+
+	FPGA_WORD P[MAX_OPERAND_WORDS];		// power of A
+	FPGA_WORD mask;						// mask		
+
+		// R = 1, P = 1
+	for (word_cnt=0; word_cnt<len; word_cnt++)
+		R[word_cnt] = (word_cnt > 0) ? 0 : 1,
+		P[word_cnt] = A[word_cnt];
+
+	FPGA_WORD M_PP[MAX_OPERAND_WORDS];	// intermediate buffer for next power
+	FPGA_WORD M_RP[MAX_OPERAND_WORDS];	// intermediate buffer for next result
+
+		// scan all bits of the exponent
+	for (bit_cnt=0; bit_cnt<(len * CHAR_BIT * sizeof(FPGA_WORD)); bit_cnt++)
+	{
+		montgomery_multiply(P, P, N, N_COEFF, M_PP, len);	// M_PP = P * P
+		montgomery_multiply(R, P, N, N_COEFF, M_RP, len);	// M_RP = R * P
+		
+		word_index = bit_cnt / (CHAR_BIT * sizeof(FPGA_WORD));
+		bit_index = bit_cnt & ((CHAR_BIT * sizeof(FPGA_WORD)) - 1);
+
+		mask = 1 << bit_index;	// current bit of exponent
+
+			// whether we need to update R (non-zero exponent bit)
+		flag_update_r = (B[word_index] & mask) == mask;
+
+			// always update P
+		for (word_cnt=0; word_cnt<len; word_cnt++)
+			P[word_cnt] = M_PP[word_cnt];
+
+			// only update R when necessary
+		if (flag_update_r)
+		{
+			for (word_cnt=0; word_cnt<len; word_cnt++)
+				R[word_cnt] = M_RP[word_cnt];
+		}
+	}
+}
+
+
+//----------------------------------------------------------------
+// Montgomery factor calculation
+//----------------------------------------------------------------
+void montgomery_calc_factor(const FPGA_WORD *N, FPGA_WORD *FACTOR, size_t len)
+//----------------------------------------------------------------
+//
+// FACTOR = 2 ** (2*len) mod N
+//
+// This routine calculates the factor that is necessary to bring
+// numbers into Montgomery domain. The high-level algorithm is:
+//
+// 1. f = 1
+// 2. for i=0 to 2*len-1
+// 3.   f1 = f << 1
+// 4.   f2 = f1 - n
+// 5.   f = (f2 < 0) ? f1 : f2
+//
+//----------------------------------------------------------------
+{
+	size_t i, j;		// counters
+
+	bool flag_keep_f;	// flag
+	
+		// temporary buffer
+	FPGA_WORD FACTOR_N[MAX_OPERAND_WORDS];
+
+		// carry and borrow
+	FPGA_WORD carry_in, carry_out;
+	FPGA_WORD borrow_in, borrow_out;
+
+		// FACTOR = 1
+	for (i=0; i<len; i++)
+		FACTOR[i] = (i > 0) ? 0 : 1;
+		
+		// do the math
+	for (i=0; i<(2 * len * CHAR_BIT * sizeof(FPGA_WORD)); i++)
+	{
+			// clear carry and borrow
+		carry_in = 0, borrow_in = 0;
+		
+			// calculate f1 = f << 1, f2 = f1 - n
+		for (j=0; j<len; j++)
+		{
+			carry_out = FACTOR[j] >> (sizeof(FPGA_WORD) * CHAR_BIT - 1);		// | M <<= 1
+			FACTOR[j] <<= 1, FACTOR[j] |= carry_in;								// |
+
+			pe_sub(FACTOR[j], N[j], borrow_in, &FACTOR_N[j], &borrow_out);		// MN = M - N
+	
+			carry_in = carry_out, borrow_in = borrow_out;						// propagate carry & borrow
+		}
+
+			// obtain flag
+		flag_keep_f = (borrow_out && !carry_out);
+
+			// now select the right value
+		for (j=0; j<len; j++)
+			FACTOR[j] = flag_keep_f ? FACTOR[j] : FACTOR_N[j];
+	}
+}
+
+
+//----------------------------------------------------------------
+// Montgomery modulus-dependent coefficient calculation
+//----------------------------------------------------------------
+void montgomery_calc_n_coeff(const FPGA_WORD *N, FPGA_WORD *N_COEFF, size_t len)
+//----------------------------------------------------------------
+//
+// N_COEFF = -N ** -1 mod 2 ** len
+//
+// This routine calculates the coefficient that is used during the reduction
+// phase of Montgomery multiplication to zero out the lower half of product.
+//
+// The high-level algorithm is:
+//
+// 1. R = 1
+// 2. NN = ~N + 1
+// 3. for i=1 to len-1
+// 4.   T = R * NN mod 2 ** len
+// 5.   if T[i] then
+// 6.     R = R + (1 << i)
+//
+//----------------------------------------------------------------
+{
+	size_t i, j, k;							// counters
+
+	FPGA_WORD NN[MAX_OPERAND_WORDS];		// temporary buffers
+	FPGA_WORD T [MAX_OPERAND_WORDS];		//
+	FPGA_WORD R [MAX_OPERAND_WORDS];		//
+	FPGA_WORD R1[MAX_OPERAND_WORDS];		//
+
+	bool flag_update_r;						// flag
+
+	FPGA_WORD nw, pwr;						//
+	FPGA_WORD sum_c_in, sum_c_out;			//
+	FPGA_WORD carry_in, carry_out;			//
+	FPGA_WORD mul_s, mul_c_in, mul_c_out;	//
+
+		// NN = -N mod 2 ** len = ~N + 1 mod 2 ** len
+	carry_in = 0;
+	for (i=0; i<len; i++)
+	{	nw = (i > 0) ? 0 : 1;								// nw = 1
+		pe_add(~N[i], nw, carry_in, &NN[i], &carry_out);	// NN = ~N + nw
+		carry_in = carry_out;								// propagate carry
+	}
+
+		// R = 1
+	for (i=0; i<len; i++)
+		R[i] = (i > 0) ? 0 : 1;
+
+		// calculate T = R * NN
+	for (k=1; k<(len * sizeof(FPGA_WORD) * CHAR_BIT); k++)
+	{
+			// T = 0
+		for (i=0; i<len; i++) T[i] = 0;
+
+			// T = NN * R
+		for (i=0; i<len; i++)
+		{
+			mul_c_in = 0;
+
+			for (j=0; j<(len-i); j++)
+			{
+			
+				pe_mul(R[j], NN[i], T[i+j], mul_c_in, &mul_s, &mul_c_out);
+				
+				T[i+j] = mul_s;
+				
+				mul_c_in = mul_c_out;
+			}
+		}
+
+			// get word and index indices
+		size_t word_index = k / (CHAR_BIT * sizeof(FPGA_WORD));
+		size_t bit_index = k & ((CHAR_BIT * sizeof(FPGA_WORD)) - 1);
+
+			// update bit mask
+		FPGA_WORD bit_mask = (1 << bit_index);
+
+		sum_c_in = 0;			// clear carry
+		flag_update_r = false;	// reset flag
+
+			// calculate R1 = R + 1 << (2 * len)
+		for (i=0; i<len; i++)
+		{
+			if (i == word_index) flag_update_r = (T[i] & bit_mask) == bit_mask;
+
+			pwr = (i == word_index) ? bit_mask : 0;
+			pe_add(R[i], pwr, sum_c_in,  &R1[i], &sum_c_out);
+			carry_in = carry_out;
+		}
+
+			// update r
+		for (i=0; i<len; i++)
+			R[i] = flag_update_r ? R1[i] : R[i];
+	}
+
+		// store output
+	for (i=0; i<len; i++)
+		N_COEFF[i] = R[i];
+}
+
+
+//----------------------------------------------------------------
+// End of file
+//----------------------------------------------------------------