path: root/stm32/modexpng_util.c



//
// helper precomputation routines for the "modexpng" core
//
#include "modexpng_util.h"


//
// internal buffers
//
static uint32_t MOD_FACTOR_N[BUF_NUM_WORDS];
static uint32_t MOD_NN[BUF_NUM_WORDS+1];
static uint32_t MOD_T[BUF_NUM_WORDS+1];

static void _add32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
static void _sub32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
static void _mul32(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);


//
// calculation of the Montgomery factor
//
void _calc_montgomery_factor(uint32_t num_words, const uint32_t *N, uint32_t *N_FACTOR)
{
		// counters
		uint32_t i, j;		

		// flag
		uint32_t flag_keep;

		// carry and borrow
		uint32_t cry_in, cry_out;
		uint32_t brw_in, brw_out;

		// initially set N_FACTOR = 1
		for (i=0; i<num_words; i++)
				N_FACTOR[i] = i ? 0 : 1;
		
		// do the math
		for (i=0; i<2*(num_words * UINT32_BITS + UINT16_BITS); i++)
		{	
				// clear carry and borrow
				cry_in = 0, brw_in = 0;
		
				// calculate N_FACTOR = N_FACTOR << 1, MOD_FACTOR_N = N_FACTOR - N
				for (j=0; j<num_words; j++)
				{
						cry_out = N_FACTOR[j] >> (UINT32_BITS - 1);		// | N_FACTOR <<= 1
						N_FACTOR[j] <<= 1; N_FACTOR[j] |= cry_in;			// |

						_sub32(N_FACTOR[j], N[j], brw_in, &MOD_FACTOR_N[j], &brw_out);	// MOD_FACTOR_N = N_FACTOR - N
	
						// propagate carry & borrow
						cry_in = cry_out, brw_in = brw_out;
				}

				// obtain flag
				flag_keep = brw_out && !cry_out;

				// now select the right value
				for (j=0; j<num_words; j++)
						N_FACTOR[j] = flag_keep ? N_FACTOR[j] : MOD_FACTOR_N[j];
		}
}


//
// calculation of the modulus-dependent speed-up coefficient
//
void _calc_modulus_coeff(uint32_t num_words, const uint32_t *N, uint32_t *N_COEFF)
{
		// counters
		uint32_t i, j, k, jk;

		// indices
		uint32_t word_index, bit_index;

		// flag
		uint32_t flag_update;
	
		// carries
		uint32_t cry_in, cry_out;
	
		// temporary variables
		uint32_t mod_p, add_s, b_word;
	
		// initially set N_COEFF to 1
		for (i=0; i<=num_words; i++)
				N_COEFF[i] = i ? 0 : 1;

		// also set NN to ~N+1
		// note that since N must be odd, ~N is even, so adding 1 to it doesn't need
		// any carry propagation
		for (i=0; i<num_words; i++) MOD_NN[i] = ~N[i];
		MOD_NN[0] += 1;
		MOD_NN[num_words] = 0xffffffff;
        
		// do the math
		for (i=1; i<(num_words * UINT32_BITS + UINT16_BITS); i++)
		{
				word_index = i / UINT32_BITS;
			  bit_index = i & (UINT32_BITS - 1);

				// clear T
				for (j=0; j<=num_words; j++) MOD_T[j] = 0;
				
				// T = N_COEFF * NN mod 2 ** (modulus_length + 16)
				/*
			   * Note, that we only need the lower half of the product T, so in
			   * the outer loop we always scan entire N_COEFF, but the inner
				 * loop only scans entire NN during the first iteration, and then
			   * keeps skipping one more word every iteration, during the last
				 * iteration we only scan one word of NN.
			   *
			   */
				for (j=0; j<=num_words; j++)
				{   cry_in = 0;
						for (k=0; k<=(num_words-j); k++)
						{		jk = j + k;
								_mul32(N_COEFF[j], MOD_NN[k], MOD_T[jk], cry_in, &mod_p, &cry_out);
								MOD_T[jk] = mod_p;
								cry_in = cry_out;
								if (word_index == jk)
										flag_update = MOD_T[jk] & (1 << bit_index) ? 1 : 0;
						}
				}
				if (flag_update)
				{		cry_in = 0;
						for (j=0; j<=num_words; j++)
						{		b_word = (j == word_index) ? (1 << bit_index) : 0;
								_add32(b_word, N_COEFF[j], cry_in, &add_s, &cry_out);
								N_COEFF[j] = add_s;
								cry_in = cry_out;
						}
				}
		}
}


//
// low-level addition w/ carry
//
static void _add32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *s, uint32_t *c_out)
{
		uint64_t t;															// intermediate var
	
		t = (uint64_t)a + (uint64_t)b;					// obtain "wide" difference
		t += (uint64_t)(c_in & 1);							// take borrow into account
	
		*s = (uint32_t)t;												// return the lower part of result
		*c_out = (uint32_t)(t >> UINT32_BITS);	// return the higher part of result, ...
		*c_out &= (uint32_t)1;									// ...but truncate it to 1 bit
}


//
// low-level subtraction w/ borrow
//
static void _sub32(uint32_t a, uint32_t b, uint32_t b_in, uint32_t *d, uint32_t *b_out)
{
		uint64_t t;															// intermediate var
	
		t = (uint64_t)a - (uint64_t)b;					// obtain "wide" difference
		t -= (uint64_t)(b_in & 1);							// take borrow into account
	
		*d = (uint32_t)t;												// return the lower part of result
		*b_out = (uint32_t)(t >> UINT32_BITS);	// return the higher part of result, ...
		*b_out &= (uint32_t)1;									// ...but truncate it to 1 bit
}


//
// low-level multiplication w/ carry and pre-adder
//
static void _mul32(uint32_t a, uint32_t b, uint32_t t, uint32_t c_in, uint32_t *p, uint32_t *c_out)
{
		uint64_t r;															// intermediate result
	
		r = (uint64_t)a * (uint64_t)b;					// obtain wide product
		r += (uint64_t)t;												// handle pre-addition
		r += (uint64_t)c_in;										// take carry into account
	
		*p = (uint32_t)r;												// return the lower part of result
		*c_out = (uint32_t)(r >> UINT32_BITS);	// return the higher part of result, ...
}


//
// end-of-file
//
//
// helper precomputation routines for the "modexpng" core
//
#include "modexpng_util.h"


//
// internal buffers
//
static uint32_t MOD_FACTOR_N[BUF_NUM_WORDS];
static uint32_t MOD_NN[BUF_NUM_WORDS+1];
static uint32_t MOD_T[BUF_NUM_WORDS+1];

static void _add32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
static void _sub32(uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);
static void _mul32(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t *, uint32_t *);


//
// calculation of the Montgomery factor
//
void _calc_montgomery_factor(uint32_t num_words, const uint32_t *N, uint32_t *N_FACTOR)
{
		// counters
		uint32_t i, j;		

		// flag
		uint32_t flag_keep;

		// carry and borrow
		uint32_t cry_in, cry_out;
		uint32_t brw_in, brw_out;

		// initially set N_FACTOR = 1
		for (i=0; i<num_words; i++)
				N_FACTOR[i] = i ? 0 : 1;
		
		// do the math
		for (i=0; i<2*(num_words * UINT32_BITS + UINT16_BITS); i++)
		{	
				// clear carry and borrow
				cry_in = 0, brw_in = 0;
		
				// calculate N_FACTOR = N_FACTOR << 1, MOD_FACTOR_N = N_FACTOR - N
				for (j=0; j<num_words; j++)
				{
						cry_out = N_FACTOR[j] >> (UINT32_BITS - 1);		// | N_FACTOR <<= 1
						N_FACTOR[j] <<= 1; N_FACTOR[j] |= cry_in;			// |

						_sub32(N_FACTOR[j], N[j], brw_in, &MOD_FACTOR_N[j], &brw_out);	// MOD_FACTOR_N = N_FACTOR - N
	
						// propagate carry & borrow
						cry_in = cry_out, brw_in = brw_out;
				}

				// obtain flag
				flag_keep = brw_out && !cry_out;

				// now select the right value
				for (j=0; j<num_words; j++)
						N_FACTOR[j] = flag_keep ? N_FACTOR[j] : MOD_FACTOR_N[j];
		}
}


//
// calculation of the modulus-dependent speed-up coefficient
//
void _calc_modulus_coeff(uint32_t num_words, const uint32_t *N, uint32_t *N_COEFF)
{
		// counters
		uint32_t i, j, k, jk;

		// indices
		uint32_t word_index, bit_index;

		// flag
		uint32_t flag_update;
	
		// carries
		uint32_t cry_in, cry_out;
	
		// temporary variables
		uint32_t mod_p, add_s, b_word;
	
		// initially set N_COEFF to 1
		for (i=0; i<=num_words; i++)
				N_COEFF[i] = i ? 0 : 1;

		// also set NN to ~N+1
		// note that since N must be odd, ~N is even, so adding 1 to it doesn't need
		// any carry propagation
		for (i=0; i<num_words; i++) MOD_NN[i] = ~N[i];
		MOD_NN[0] += 1;
		MOD_NN[num_words] = 0xffffffff;
        
		// do the math
		for (i=1; i<(num_words * UINT32_BITS + UINT16_BITS); i++)
		{
				word_index = i / UINT32_BITS;
			  bit_index = i & (UINT32_BITS - 1);

				// clear T
				for (j=0; j<=num_words; j++) MOD_T[j] = 0;
				
				// T = N_COEFF * NN mod 2 ** (modulus_length + 16)
				/*
			   * Note, that we only need the lower half of the product T, so in
			   * the outer loop we always scan entire N_COEFF, but the inner
				 * loop only scans entire NN during the first iteration, and then
			   * keeps skipping one more word every iteration, during the last
				 * iteration we only scan one word of NN.
			   *
			   */
				for (j=0; j<=num_words; j++)
				{   cry_in = 0;
						for (k=0; k<=(num_words-j); k++)
						{		jk = j + k;
								_mul32(N_COEFF[j], MOD_NN[k], MOD_T[jk], cry_in, &mod_p, &cry_out);
								MOD_T[jk] = mod_p;
								cry_in = cry_out;
								if (word_index == jk)
										flag_update = MOD_T[jk] & (1 << bit_index) ? 1 : 0;
						}
				}
				if (flag_update)
				{		cry_in = 0;
						for (j=0; j<=num_words; j++)
						{		b_word = (j == word_index) ? (1 << bit_index) : 0;
								_add32(b_word, N_COEFF[j], cry_in, &add_s, &cry_out);
								N_COEFF[j] = add_s;
								cry_in = cry_out;
						}
				}
		}
}


//
// low-level addition w/ carry
//
static void _add32(uint32_t a, uint32_t b, uint32_t c_in, uint32_t *s, uint32_t *c_out)
{
		uint64_t t;															// intermediate var
	
		t = (uint64_t)a + (uint64_t)b;					// obtain "wide" difference
		t += (uint64_t)(c_in & 1);							// take borrow into account
	
		*s = (uint32_t)t;												// return the lower part of result
		*c_out = (uint32_t)(t >> UINT32_BITS);	// return the higher part of result, ...
		*c_out &= (uint32_t)1;									// ...but truncate it to 1 bit
}


//
// low-level subtraction w/ borrow
//
static void _sub32(uint32_t a, uint32_t b, uint32_t b_in, uint32_t *d, uint32_t *b_out)
{
		uint64_t t;															// intermediate var
	
		t = (uint64_t)a - (uint64_t)b;					// obtain "wide" difference
		t -= (uint64_t)(b_in & 1);							// take borrow into account
	
		*d = (uint32_t)t;												// return the lower part of result
		*b_out = (uint32_t)(t >> UINT32_BITS);	// return the higher part of result, ...
		*b_out &= (uint32_t)1;									// ...but truncate it to 1 bit
}


//
// low-level multiplication w/ carry and pre-adder
//
static void _mul32(uint32_t a, uint32_t b, uint32_t t, uint32_t c_in, uint32_t *p, uint32_t *c_out)
{
		uint64_t r;															// intermediate result
	
		r = (uint64_t)a * (uint64_t)b;					// obtain wide product
		r += (uint64_t)t;												// handle pre-addition
		r += (uint64_t)c_in;										// take carry into account
	
		*p = (uint32_t)r;												// return the lower part of result
		*c_out = (uint32_t)(r >> UINT32_BITS);	// return the higher part of result, ...
}


//
// end-of-file
//