//
// helper precomputation routines for the "modexpng" core
//
#include "modexpng_util.h"
//
// internal buffers
//
static uint32_t MOD_FACTOR_N[BUF_NUM_WORDS];
static uint32_t MOD_NN[BUF_NUM_WORDS+1];
static uint32_t MOD_T[BUF_NUM_WORDS+1];
static void _add32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
static void _sub32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
static void _mul32(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
//
// calculation of the Montgomery factor
//
void _calc_montgomery_factor(uint32_t num_words, const uint32_t *N, uint32_t *N_FACTOR)
{
// counters
uint32_t i, j;
// flag
uint32_t flag_keep;
// carry and borrow
uint32_t cry_in, cry_out;
uint32_t brw_in, brw_out;
// initially set N_FACTOR = 1
for (i=0; i<num_words; i++)
N_FACTOR[i] = i ? 0 : 1;
// do the math
for (i=0; i<2*(num_words * UINT32_BITS + UINT16_BITS); i++)
{
// clear carry and borrow
cry_in = 0, brw_in = 0;
// calculate N_FACTOR = N_FACTOR << 1, MOD_FACTOR_N = N_FACTOR - N
for (j=0; j<num_words; j++)
{
cry_out = N_FACTOR[j] >> (UINT32_BITS - 1); // | N_FACTOR <<= 1
N_FACTOR[j] <<= 1; N_FACTOR[j] |= cry_in; // |
_sub32(N_FACTOR[j], N[j], brw_in, &MOD_FACTOR_N[j], &brw_out); // MOD_FACTOR_N = N_FACTOR - N
// propagate carry & borrow
cry_in = cry_out, brw_in = brw_out;
}
// obtain flag
flag_keep = brw_out && !cry_out;
// now select the right value
for (j=0; j<num_words; j++)
N_FACTOR[j] = flag_keep ? N_FACTOR[j] : MOD_FACTOR_N[j];
}
}
//
// calculation of the modulus-dependent speed-up coefficient
//
void _calc_modulus_coeff(uint32_t num_words, const uint32_t *N, uint32_t *N_COEFF)
{
// counters
uint32_t i, j, k, jk;
// indices
uint32_t word_index, bit_index;
// flag
uint32_t flag_update;
// carries
uint32_t cry_in, cry_out;
// temporary variables
uint32_t mod_p, add_s, b_word;
// initially set N_COEFF to 1
for (i=0; i<=num_words; i++)
N_COEFF[i] = i ? 0 : 1;
// also set NN to ~N+1
// note that since N must be odd, ~N is even, so adding 1 to it doesn't need
// any carry propagation
for (i=0; i<num_words; i++) MOD_NN[i] = ~N[i];
MOD_NN[0] += 1;
MOD_NN[num_words] = 0xffffffff;
// do the math
for (i=1; i<(num_words * UINT32_BITS + UINT16_BITS); i++)
{
word_index = i / UINT32_BITS;
bit_index = i & (UINT32_BITS - 1);
// clear T
for (j=0; j<=num_words; j++) MOD_T[j] = 0;
// T = N_COEFF * NN mod 2 ** (modulus_length + 16)
/*
* Note, that we only need the lower half of the product T, so in
* the outer loop we always scan entire N_COEFF, but the inner
* loop only scans entire NN during the first iteration, and then
* keeps skipping one more word every iteration, during the last
* iteration we only scan one word of NN.
*
*/
for (j=0; j<=num_words; j++)
{ cry_in = 0;
for (k=0; k<=(num_words-j); k++)
{ jk = j + k;
_mul32(N_COEFF[j], MOD_NN[k], MOD_T[jk], cry_in, &mod_p, &cry_out);
MOD_T[jk] = mod_p;
cry_in = cry_out;
if (word_index == jk)
flag_update = MOD_T[jk] & (1 << bit_index) ? 1 : 0;
}
}
if (flag_update)
{ cry_in = 0;
for (j=0; j<=num_words; j++)
{ b_word = (j == word_index) ? (1 << bit_index) : 0;
_add32(b_word, N_COEFF[j], cry_in, &add_s, &cry_out);
N_COEFF[j] = add_s;
cry_in = cry_out;
}
}
}
}
//
// low-level addition w/ carry
//
static void _add32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *s, uint32_t *c_out)
{
uint64_t t; // intermediate var
t = (uint64_t)a + (uint64_t)b; // obtain "wide" difference
t += (uint64_t)(c_in & 1); // take borrow into account
*s = (uint32_t)t; // return the lower part of result
*c_out = (uint32_t)(t >> UINT32_BITS); // return the higher part of result, ...
*c_out &= (uint32_t)1; // ...but truncate it to 1 bit
}
//
// low-level subtraction w/ borrow
//
static void _sub32(uint32_t a, uint32_t b, uint32_t b_in, uint32_t *d, uint32_t *b_out)
{
uint64_t t; // intermediate var
t = (uint64_t)a - (uint64_t)b; // obtain "wide" difference
t -= (uint64_t)(b_in & 1); // take borrow into account
*d = (uint32_t)t; // return the lower part of result
*b_out = (uint32_t)(t >> UINT32_BITS); // return the higher part of result, ...
*b_out &= (uint32_t)1; // ...but truncate it to 1 bit
}
//
// low-level multiplication w/ carry and pre-adder
//
static void _mul32(uint32_t a, uint32_t b, uint32_t t, uint32_t c_in, uint32_t *p, uint32_t *c_out)
{
uint64_t r; // intermediate result
r = (uint64_t)a * (uint64_t)b; // obtain wide product
r += (uint64_t)t; // handle pre-addition
r += (uint64_t)c_in; // take carry into account
*p = (uint32_t)r; // return the lower part of result
*c_out = (uint32_t)(r >> UINT32_BITS); // return the higher part of result, ...
}
//
// end-of-file
//