aboutsummaryrefslogtreecommitdiff
path: root/stm32/modexpng_util.c
diff options
context:
space:
mode:
Diffstat (limited to 'stm32/modexpng_util.c')
-rw-r--r--stm32/modexpng_util.c189
1 files changed, 189 insertions, 0 deletions
diff --git a/stm32/modexpng_util.c b/stm32/modexpng_util.c
new file mode 100644
index 0000000..27afb19
--- /dev/null
+++ b/stm32/modexpng_util.c
@@ -0,0 +1,189 @@
+//
+// helper precomputation routines for the "modexpng" core
+//
+#include "modexpng_util.h"
+
+
+//
+// internal scratch buffers (file scope, shared by every call into this file)
+//
+static uint32_t MOD_FACTOR_N[BUF_NUM_WORDS];   // N_FACTOR - N, used by _calc_montgomery_factor()
+static uint32_t MOD_NN[BUF_NUM_WORDS+1];       // ~N + 1 plus one extra all-ones word, used by _calc_modulus_coeff()
+static uint32_t MOD_T[BUF_NUM_WORDS+1];        // truncated product N_COEFF * NN, used by _calc_modulus_coeff()
+
+//
+// low-level word arithmetic helpers
+//
+static void _add32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *s, uint32_t *c_out);
+static void _sub32(uint32_t a, uint32_t b, uint32_t b_in, uint32_t *d, uint32_t *b_out);
+static void _mul32(uint32_t a, uint32_t b, uint32_t t, uint32_t c_in, uint32_t *p, uint32_t *c_out);
+
+
+//
+// calculation of the Montgomery factor
+//
+// Starts from N_FACTOR = 1 and doubles it modulo N once per round, for
+// 2 * (num_words * UINT32_BITS + UINT16_BITS) rounds. Provided N > 1, the
+// result is 2 ** (number of rounds) mod N.
+//
+//   num_words - number of 32-bit words in N and in N_FACTOR
+//   N         - modulus, least significant word first
+//   N_FACTOR  - output buffer, receives num_words words
+//
+// The static buffer MOD_FACTOR_N is used as scratch space. A call with
+// num_words == 0, or with num_words larger than that buffer, returns right
+// away without touching N_FACTOR.
+//
+void _calc_montgomery_factor(uint32_t num_words, const uint32_t *N, uint32_t *N_FACTOR)
+{
+    // counters
+    uint32_t i, j;
+
+    // flag
+    uint32_t flag_keep;
+
+    // carry and borrow
+    uint32_t cry_in, cry_out;
+    uint32_t brw_in, brw_out;
+
+    // number of words the scratch buffer can hold
+    const uint32_t max_words = (uint32_t)(sizeof(MOD_FACTOR_N) / sizeof(MOD_FACTOR_N[0]));
+
+    // an empty operand would leave the final carry/borrow unset, an
+    // oversized one would run past the end of MOD_FACTOR_N
+    if (num_words == 0 || num_words > max_words)
+        return;
+
+    // initially set N_FACTOR = 1
+    for (i=0; i<num_words; i++)
+        N_FACTOR[i] = i ? 0 : 1;
+
+    // do the math
+    for (i=0; i<2*(num_words * UINT32_BITS + UINT16_BITS); i++)
+    {
+        // clear carry and borrow
+        cry_in  = 0, brw_in  = 0;
+        cry_out = 0, brw_out = 0;
+
+        // calculate N_FACTOR = N_FACTOR << 1, MOD_FACTOR_N = N_FACTOR - N
+        for (j=0; j<num_words; j++)
+        {
+            cry_out = N_FACTOR[j] >> (UINT32_BITS - 1);    // | N_FACTOR <<= 1
+            N_FACTOR[j] <<= 1; N_FACTOR[j] |= cry_in;      // |
+
+            _sub32(N_FACTOR[j], N[j], brw_in, &MOD_FACTOR_N[j], &brw_out);    // MOD_FACTOR_N = N_FACTOR - N
+
+            // propagate carry & borrow
+            cry_in = cry_out, brw_in = brw_out;
+        }
+
+        // obtain flag: the shifted value is kept only when it is smaller than N,
+        // i.e. the subtraction borrowed and the shift did not overflow
+        flag_keep = brw_out && !cry_out;
+
+        // now select the right value
+        for (j=0; j<num_words; j++)
+            N_FACTOR[j] = flag_keep ? N_FACTOR[j] : MOD_FACTOR_N[j];
+    }
+}
+
+
+//
+// calculation of the modulus-dependent speed-up coefficient
+//
+// Builds N_COEFF one bit at a time so that, once the main loop is done,
+//
+//   N_COEFF * (-N) == 1  modulo 2 ** (num_words * UINT32_BITS + UINT16_BITS)
+//
+// Every round multiplies the current N_COEFF by NN = -N, looks at bit i of
+// the product and, if that bit is set, adds 2 ** i to N_COEFF, which clears
+// the bit because NN is odd.
+//
+//   num_words - number of 32-bit words in N
+//   N         - modulus, least significant word first, must be odd
+//   N_COEFF   - output buffer, receives num_words + 1 words
+//
+// The static buffers MOD_NN and MOD_T are used as scratch space. A call with
+// num_words too large for those buffers returns right away without touching
+// N_COEFF.
+//
+void _calc_modulus_coeff(uint32_t num_words, const uint32_t *N, uint32_t *N_COEFF)
+{
+    // counters
+    uint32_t i, j, k, jk;
+
+    // indices
+    uint32_t word_index, bit_index;
+
+    // flag
+    uint32_t flag_update;
+
+    // carries
+    uint32_t cry_in, cry_out;
+
+    // temporary variables
+    uint32_t mod_p, add_s, b_word;
+
+    // sizes of the scratch buffers, in words
+    const uint32_t nn_words = (uint32_t)(sizeof(MOD_NN) / sizeof(MOD_NN[0]));
+    const uint32_t t_words  = (uint32_t)(sizeof(MOD_T)  / sizeof(MOD_T[0]));
+
+    // words 0..num_words of both buffers get accessed below, so each buffer
+    // must hold at least num_words + 1 words
+    if (num_words >= nn_words || num_words >= t_words)
+        return;
+
+    // initially set N_COEFF to 1
+    for (i=0; i<=num_words; i++)
+        N_COEFF[i] = i ? 0 : 1;
+
+    // also set NN to ~N+1
+    // note that since N must be odd, ~N is even, so adding 1 to it doesn't need
+    // any carry propagation
+    for (i=0; i<num_words; i++) MOD_NN[i] = ~N[i];
+    MOD_NN[0] += 1;
+    MOD_NN[num_words] = 0xffffffff;
+
+    // do the math
+    for (i=1; i<(num_words * UINT32_BITS + UINT16_BITS); i++)
+    {
+        word_index = i / UINT32_BITS;
+        bit_index = i & (UINT32_BITS - 1);
+
+        // the flag is overwritten while T is computed, start from a known value
+        flag_update = 0;
+
+        // clear T
+        for (j=0; j<=num_words; j++) MOD_T[j] = 0;
+
+        // T = N_COEFF * NN mod 2 ** (modulus_length + 16)
+        /*
+         * Note, that we only need the lower half of the product T, so in
+         * the outer loop we always scan entire N_COEFF, but the inner
+         * loop only scans entire NN during the first iteration, and then
+         * keeps skipping one more word every iteration, during the last
+         * iteration we only scan one word of NN.
+         *
+         */
+        for (j=0; j<=num_words; j++)
+        {
+            cry_in = 0;
+            for (k=0; k<=(num_words-j); k++)
+            {
+                jk = j + k;
+                _mul32(N_COEFF[j], MOD_NN[k], MOD_T[jk], cry_in, &mod_p, &cry_out);
+                MOD_T[jk] = mod_p;
+                cry_in = cry_out;
+
+                // the last write to word_index happens in row j == word_index,
+                // after which that word of T is final
+                // (unsigned one: shifting a signed 1 by 31 bits is undefined)
+                if (word_index == jk)
+                    flag_update = (MOD_T[jk] & ((uint32_t)1 << bit_index)) ? 1 : 0;
+            }
+        }
+
+        // N_COEFF += 2 ** i when bit i of T is set
+        if (flag_update)
+        {
+            cry_in = 0;
+            for (j=0; j<=num_words; j++)
+            {
+                b_word = (j == word_index) ? ((uint32_t)1 << bit_index) : 0;
+                _add32(b_word, N_COEFF[j], cry_in, &add_s, &cry_out);
+                N_COEFF[j] = add_s;
+                cry_in = cry_out;
+            }
+        }
+    }
+}
+
+
+//
+// low-level addition w/ carry
+//
+// computes a + b + (c_in & 1), stores the low 32 bits of the sum in *s and
+// the carry out (0 or 1) in *c_out
+//
+static void _add32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *s, uint32_t *c_out)
+{
+    // add in 64 bits, so that the carry out of the word is not lost
+    const uint64_t wide_sum = (uint64_t)a + b + (c_in & 1u);
+
+    *s     = (uint32_t)wide_sum;                            // sum word
+    *c_out = (uint32_t)((wide_sum >> UINT32_BITS) & 1u);    // single carry bit
+}
+
+
+//
+// low-level subtraction w/ borrow
+//
+// computes a - b - (b_in & 1), stores the low 32 bits of the difference in *d
+// and the borrow out (0 or 1) in *b_out
+//
+static void _sub32(uint32_t a, uint32_t b, uint32_t b_in, uint32_t *d, uint32_t *b_out)
+{
+    // subtract in 64 bits: when the result goes below zero it wraps around
+    // and the bit right above the word ends up set
+    const uint64_t wide_diff = (uint64_t)a - b - (b_in & 1u);
+
+    *d     = (uint32_t)wide_diff;                            // difference word
+    *b_out = (uint32_t)((wide_diff >> UINT32_BITS) & 1u);    // single borrow bit
+}
+
+
+//
+// low-level multiplication w/ carry and pre-adder
+//
+// computes a * b + t + c_in, stores the low 32 bits of the result in *p and
+// the part above them in *c_out (a full word, not just one bit)
+//
+static void _mul32(uint32_t a, uint32_t b, uint32_t t, uint32_t c_in, uint32_t *p, uint32_t *c_out)
+{
+    // the largest possible value, (2**32 - 1)**2 + 2 * (2**32 - 1), still fits into 64 bits
+    const uint64_t wide_prod = (uint64_t)a * b + t + c_in;
+
+    *p     = (uint32_t)wide_prod;                     // product word
+    *c_out = (uint32_t)(wide_prod >> UINT32_BITS);    // carry word
+}
+
+
+//
+// end-of-file
+//