1 files changed, 189 insertions, 0 deletions
diff --git a/stm32/modexpng_util.c b/stm32/modexpng_util.c
new file mode 100644
index 0000000..27afb19
--- /dev/null
+++ b/stm32/modexpng_util.c
@@ -0,0 +1,189 @@
+//
+// helper precomputation routines for the "modexpng" core
+//
+#include "modexpng_util.h"
+
+
+//
+// internal buffers
+//
+static uint32_t MOD_FACTOR_N[BUF_NUM_WORDS];
+static uint32_t MOD_NN[BUF_NUM_WORDS+1];
+static uint32_t MOD_T[BUF_NUM_WORDS+1];
+
+static void _add32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
+static void _sub32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
+static void _mul32(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
+
+
+//
+// calculation of the Montgomery factor
+//
+void _calc_montgomery_factor(uint32_t num_words, const uint32_t *N, uint32_t *N_FACTOR)
+{
+		// counters
+		uint32_t i, j;		
+
+		// flag
+		uint32_t flag_keep;
+
+		// carry and borrow
+		uint32_t cry_in, cry_out;
+		uint32_t brw_in, brw_out;
+
+		// initially set N_FACTOR = 1
+		for (i=0; i<num_words; i++)
+				N_FACTOR[i] = i ? 0 : 1;
+		
+		// do the math
+		for (i=0; i<2*(num_words * UINT32_BITS + UINT16_BITS); i++)
+		{	
+				// clear carry and borrow
+				cry_in = 0, brw_in = 0;
+		
+				// calculate N_FACTOR = N_FACTOR << 1, MOD_FACTOR_N = N_FACTOR - N
+				for (j=0; j<num_words; j++)
+				{
+						cry_out = N_FACTOR[j] >> (UINT32_BITS - 1);		// | N_FACTOR <<= 1
+						N_FACTOR[j] <<= 1; N_FACTOR[j] |= cry_in;			// |
+
+						_sub32(N_FACTOR[j], N[j], brw_in, &MOD_FACTOR_N[j], &brw_out);	// MOD_FACTOR_N = N_FACTOR - N
+	
+						// propagate carry & borrow
+						cry_in = cry_out, brw_in = brw_out;
+				}
+
+				// obtain flag
+				flag_keep = brw_out && !cry_out;
+
+				// now select the right value
+				for (j=0; j<num_words; j++)
+						N_FACTOR[j] = flag_keep ? N_FACTOR[j] : MOD_FACTOR_N[j];
+		}
+}
+
+
+//
+// calculation of the modulus-dependent speed-up coefficient
+//
+void _calc_modulus_coeff(uint32_t num_words, const uint32_t *N, uint32_t *N_COEFF)
+{
+		// counters
+		uint32_t i, j, k, jk;
+
+		// indices
+		uint32_t word_index, bit_index;
+
+		// flag
+		uint32_t flag_update;
+	
+		// carries
+		uint32_t cry_in, cry_out;
+	
+		// temporary variables
+		uint32_t mod_p, add_s, b_word;
+	
+		// initially set N_COEFF to 1
+		for (i=0; i<=num_words; i++)
+				N_COEFF[i] = i ? 0 : 1;
+
+		// also set NN to ~N+1
+		// note that since N must be odd, ~N is even, so adding 1 to it doesn't need
+		// any carry propagation
+		for (i=0; i<num_words; i++) MOD_NN[i] = ~N[i];
+		MOD_NN[0] += 1;
+		MOD_NN[num_words] = 0xffffffff;
+        
+		// do the math
+		for (i=1; i<(num_words * UINT32_BITS + UINT16_BITS); i++)
+		{
+				word_index = i / UINT32_BITS;
+			  bit_index = i & (UINT32_BITS - 1);
+
+				// clear T
+				for (j=0; j<=num_words; j++) MOD_T[j] = 0;
+				
+				// T = N_COEFF * NN mod 2 ** (modulus_length + 16)
+				/*
+			   * Note, that we only need the lower half of the product T, so in
+			   * the outer loop we always scan entire N_COEFF, but the inner
+				 * loop only scans entire NN during the first iteration, and then
+			   * keeps skipping one more word every iteration, during the last
+				 * iteration we only scan one word of NN.
+			   *
+			   */
+				for (j=0; j<=num_words; j++)
+				{   cry_in = 0;
+						for (k=0; k<=(num_words-j); k++)
+						{		jk = j + k;
+								_mul32(N_COEFF[j], MOD_NN[k], MOD_T[jk], cry_in, &mod_p, &cry_out);
+								MOD_T[jk] = mod_p;
+								cry_in = cry_out;
+								if (word_index == jk)
+										flag_update = MOD_T[jk] & (1 << bit_index) ? 1 : 0;
+						}
+				}
+				if (flag_update)
+				{		cry_in = 0;
+						for (j=0; j<=num_words; j++)
+						{		b_word = (j == word_index) ? (1 << bit_index) : 0;
+								_add32(b_word, N_COEFF[j], cry_in, &add_s, &cry_out);
+								N_COEFF[j] = add_s;
+								cry_in = cry_out;
+						}
+				}
+		}
+}
+
+
+//
+// low-level addition w/ carry
+//
+static void _add32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *s, uint32_t *c_out)
+{
+		uint64_t t;															// intermediate var
+	
+		t = (uint64_t)a + (uint64_t)b;					// obtain "wide" difference
+		t += (uint64_t)(c_in & 1);							// take borrow into account
+	
+		*s = (uint32_t)t;												// return the lower part of result
+		*c_out = (uint32_t)(t >> UINT32_BITS);	// return the higher part of result, ...
+		*c_out &= (uint32_t)1;									// ...but truncate it to 1 bit
+}
+
+
+//
+// low-level subtraction w/ borrow
+//
+static void _sub32(uint32_t a, uint32_t b, uint32_t b_in, uint32_t *d, uint32_t *b_out)
+{
+		uint64_t t;															// intermediate var
+	
+		t = (uint64_t)a - (uint64_t)b;					// obtain "wide" difference
+		t -= (uint64_t)(b_in & 1);							// take borrow into account
+	
+		*d = (uint32_t)t;												// return the lower part of result
+		*b_out = (uint32_t)(t >> UINT32_BITS);	// return the higher part of result, ...
+		*b_out &= (uint32_t)1;									// ...but truncate it to 1 bit
+}
+
+
+//
+// low-level multiplication w/ carry and pre-adder
+//
+static void _mul32(uint32_t a, uint32_t b, uint32_t t, uint32_t c_in, uint32_t *p, uint32_t *c_out)
+{
+		uint64_t r;															// intermediate result
+	
+		r = (uint64_t)a * (uint64_t)b;					// obtain wide product
+		r += (uint64_t)t;												// handle pre-addition
+		r += (uint64_t)c_in;										// take carry into account
+	
+		*p = (uint32_t)r;												// return the lower part of result
+		*c_out = (uint32_t)(r >> UINT32_BITS);	// return the higher part of result, ...
+}
+
+
+//
+// end-of-file
+//