//
// simple driver to test "modexpng" core in hardware
//
//
// note that this test program needs a custom bitstream in which
// the core is located at offset 0 (i.e. without the core selector)
//
// stm32 headers
#include "stm-init.h"
#include "stm-led.h"
#include "stm-fmc.h"
// test vectors (generated by the supplied python math model)
#include "modexpng_vector_1024.h"
#include "modexpng_vector_2048.h"
#include "modexpng_vector_4096.h"
// reference code
#include "modexpng_util.h"
// locations of core registers
// registers are addressed by 32-bit word index, the byte offset is index * 4
#define CORE_REG_OFFSET(word_index) ((word_index) << 2)

#define CORE_ADDR_NAME0         CORE_REG_OFFSET(0x00)
#define CORE_ADDR_NAME1         CORE_REG_OFFSET(0x01)
#define CORE_ADDR_VERSION       CORE_REG_OFFSET(0x02)
#define CORE_ADDR_CONTROL       CORE_REG_OFFSET(0x08)
#define CORE_ADDR_STATUS        CORE_REG_OFFSET(0x09)
#define CORE_ADDR_MODE          CORE_REG_OFFSET(0x10)
#define CORE_ADDR_MODULUS_BITS  CORE_REG_OFFSET(0x11)
#define CORE_ADDR_EXPONENT_BITS CORE_REG_OFFSET(0x12)
#define CORE_ADDR_BANK_BITS     CORE_REG_OFFSET(0x13)
#define CORE_ADDR_NUM_MULTS     CORE_REG_OFFSET(0x14)

// data buffers: byte offset is group * 0x1000 + slot * 0x200
#define CORE_BANK_GROUP_STRIDE 0x1000
#define CORE_BANK_SLOT_STRIDE  0x200
#define CORE_BANK_HALF_SLOT    0x100
#define CORE_BANK_OFFSET(group, slot) \
    ((group) * CORE_BANK_GROUP_STRIDE + (slot) * CORE_BANK_SLOT_STRIDE)

// group 1
#define CORE_ADDR_BANK_M        CORE_BANK_OFFSET(1, 0)
#define CORE_ADDR_BANK_N        CORE_BANK_OFFSET(1, 1)
#define CORE_ADDR_BANK_N_FACTOR CORE_BANK_OFFSET(1, 2)
#define CORE_ADDR_BANK_N_COEFF  CORE_BANK_OFFSET(1, 3)
#define CORE_ADDR_BANK_X        CORE_BANK_OFFSET(1, 5)
#define CORE_ADDR_BANK_Y        CORE_BANK_OFFSET(1, 6)

// group 2 (DP and DQ start 0x100 bytes after P and Q respectively)
#define CORE_ADDR_BANK_D        CORE_BANK_OFFSET(2, 0)
#define CORE_ADDR_BANK_P        CORE_BANK_OFFSET(2, 1)
#define CORE_ADDR_BANK_DP       (CORE_BANK_OFFSET(2, 1) + CORE_BANK_HALF_SLOT)
#define CORE_ADDR_BANK_P_FACTOR CORE_BANK_OFFSET(2, 2)
#define CORE_ADDR_BANK_P_COEFF  CORE_BANK_OFFSET(2, 3)
#define CORE_ADDR_BANK_Q        CORE_BANK_OFFSET(2, 4)
#define CORE_ADDR_BANK_DQ       (CORE_BANK_OFFSET(2, 4) + CORE_BANK_HALF_SLOT)
#define CORE_ADDR_BANK_Q_FACTOR CORE_BANK_OFFSET(2, 5)
#define CORE_ADDR_BANK_Q_COEFF  CORE_BANK_OFFSET(2, 6)
#define CORE_ADDR_BANK_QINV     CORE_BANK_OFFSET(2, 7)

// group 3 (read back after an operation completes)
#define CORE_ADDR_BANK_S        CORE_BANK_OFFSET(3, 0)
#define CORE_ADDR_BANK_XM       CORE_BANK_OFFSET(3, 1)
#define CORE_ADDR_BANK_YM       CORE_BANK_OFFSET(3, 2)

// bit maps for the control, status and mode registers
#define CORE_CONTROL_BIT_NEXT   0x00000002
#define CORE_STATUS_BIT_VALID   0x00000002
#define CORE_MODE_USING_CRT     0x00000002
#define CORE_MODE_WITHOUT_CRT   0x00000000
//
// test vectors
//
// Each array is initialized from the matching *_INIT macro of the included
// modexpng_vector_<bits>.h header. The same set of names exists for every
// key length (1024, 2048, 4096):
//   N, P, Q             - moduli handed to the reference checks in main()
//   *_FACTOR, *_COEFF   - reference values the checks compare against
//   S                   - expected signature, verified after every operation
//   XM, YM              - expected mutated blinding factors, verified on the first run only
// X and Y are deliberately NOT const: _sign_handler() overwrites them with
// the mutated factors read back from the core on every run after the first.

// 1024-bit key
static const uint32_t M_1024[] = M_1024_INIT;
static const uint32_t N_1024[] = N_1024_INIT;
static const uint32_t N_FACTOR_1024[] = N_FACTOR_1024_INIT;
static const uint32_t N_COEFF_1024[] = N_COEFF_1024_INIT;
static uint32_t X_1024[] = X_1024_INIT;
static uint32_t Y_1024[] = Y_1024_INIT;
static const uint32_t P_1024[] = P_1024_INIT;
static const uint32_t Q_1024[] = Q_1024_INIT;
static const uint32_t P_FACTOR_1024[] = P_FACTOR_1024_INIT;
static const uint32_t Q_FACTOR_1024[] = Q_FACTOR_1024_INIT;
static const uint32_t P_COEFF_1024[] = P_COEFF_1024_INIT;
static const uint32_t Q_COEFF_1024[] = Q_COEFF_1024_INIT;
static const uint32_t D_1024[] = D_1024_INIT;
static const uint32_t DP_1024[] = DP_1024_INIT;
static const uint32_t DQ_1024[] = DQ_1024_INIT;
static const uint32_t QINV_1024[] = QINV_1024_INIT;
static const uint32_t XM_1024[] = XM_1024_INIT;
static const uint32_t YM_1024[] = YM_1024_INIT;
static const uint32_t S_1024[] = S_1024_INIT;
// 2048-bit key
static const uint32_t M_2048[] = M_2048_INIT;
static const uint32_t N_2048[] = N_2048_INIT;
static const uint32_t N_FACTOR_2048[] = N_FACTOR_2048_INIT;
static const uint32_t N_COEFF_2048[] = N_COEFF_2048_INIT;
static uint32_t X_2048[] = X_2048_INIT;
static uint32_t Y_2048[] = Y_2048_INIT;
static const uint32_t P_2048[] = P_2048_INIT;
static const uint32_t Q_2048[] = Q_2048_INIT;
static const uint32_t P_FACTOR_2048[] = P_FACTOR_2048_INIT;
static const uint32_t Q_FACTOR_2048[] = Q_FACTOR_2048_INIT;
static const uint32_t P_COEFF_2048[] = P_COEFF_2048_INIT;
static const uint32_t Q_COEFF_2048[] = Q_COEFF_2048_INIT;
static const uint32_t D_2048[] = D_2048_INIT;
static const uint32_t DP_2048[] = DP_2048_INIT;
static const uint32_t DQ_2048[] = DQ_2048_INIT;
static const uint32_t QINV_2048[] = QINV_2048_INIT;
static const uint32_t XM_2048[] = XM_2048_INIT;
static const uint32_t YM_2048[] = YM_2048_INIT;
static const uint32_t S_2048[] = S_2048_INIT;
// 4096-bit key
static const uint32_t M_4096[] = M_4096_INIT;
static const uint32_t N_4096[] = N_4096_INIT;
static const uint32_t N_FACTOR_4096[] = N_FACTOR_4096_INIT;
static const uint32_t N_COEFF_4096[] = N_COEFF_4096_INIT;
static uint32_t X_4096[] = X_4096_INIT;
static uint32_t Y_4096[] = Y_4096_INIT;
static const uint32_t P_4096[] = P_4096_INIT;
static const uint32_t Q_4096[] = Q_4096_INIT;
static const uint32_t P_FACTOR_4096[] = P_FACTOR_4096_INIT;
static const uint32_t Q_FACTOR_4096[] = Q_FACTOR_4096_INIT;
static const uint32_t P_COEFF_4096[] = P_COEFF_4096_INIT;
static const uint32_t Q_COEFF_4096[] = Q_COEFF_4096_INIT;
static const uint32_t D_4096[] = D_4096_INIT;
static const uint32_t DP_4096[] = DP_4096_INIT;
static const uint32_t DQ_4096[] = DQ_4096_INIT;
static const uint32_t QINV_4096[] = QINV_4096_INIT;
static const uint32_t XM_4096[] = XM_4096_INIT;
static const uint32_t YM_4096[] = YM_4096_INIT;
static const uint32_t S_4096[] = S_4096_INIT;
//
// buffers
//
// scratch space shared by check_montgomery_factor() and check_modulus_coeff()
// (the functions overwrite these on every call, so they are not reentrant)
// word-reversed copy of the modulus under test
static uint32_t mod_rev[BUF_NUM_WORDS];
// Montgomery factor computed by _calc_montgomery_factor()
static uint32_t mod_factor_rev[BUF_NUM_WORDS];
// coefficient computed by _calc_modulus_coeff(), one word longer than the modulus
static uint32_t mod_coeff_rev[BUF_NUM_WORDS+1];
//
// prototypes
//
// flips the yellow LED on every call
void toggle_yellow_led(void);
// both checks return 1 when the value recomputed by the reference code
// matches the supplied reference array, 0 otherwise
int check_montgomery_factor(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_factor);
int check_modulus_coeff(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_coeff);
// loads operands into the core, runs one operation and verifies the result;
// returns 1 on match, 0 on mismatch; x and y are non-const because they are
// overwritten with the values read back from the core when first_run is 0;
// normally called through the sign_without_crt() / sign_using_crt() macros
int _sign_handler(uint32_t key_length, uint32_t use_crt, uint32_t first_run,
const uint32_t *m, const uint32_t *n,
const uint32_t *n_factor, const uint32_t *n_coeff,
uint32_t *x, uint32_t *y,
const uint32_t *p, const uint32_t *q,
const uint32_t *p_factor, const uint32_t *p_coeff,
const uint32_t *q_factor, const uint32_t *q_coeff,
const uint32_t *dp, const uint32_t *dq,
const uint32_t *d,
const uint32_t *qinv,
const uint32_t *s,
const uint32_t *xm, const uint32_t *ym);
//
// easier calls
//
// Both macros forward to _sign_handler() and fill in the arguments that the
// selected mode does not use:
//   sign_without_crt: use_crt = 0, every CRT-only pointer (p, q, p_factor,
//                     p_coeff, q_factor, q_coeff, dp, dq, qinv) is NULL
//   sign_using_crt:   use_crt = 1, d is NULL
// k is the key length in bits, f is the first_run flag.
#define sign_without_crt(k,f,m,n,nf,nc,x,y,d,s,xm,ym) \
_sign_handler (k,0,f,m,n,nf,nc,x,y,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,d,NULL,s,xm,ym)
#define sign_using_crt(k,f,m,n,nf,nc,x,y,p,q,pf,pc,qf,qc,dp,dq,qinv,s,xm,ym) \
_sign_handler (k,1,f,m,n,nf,nc,x,y,p,q,pf,pc,qf,qc,dp,dq,NULL,qinv,s,xm,ym)
//
// test routine
//
int main()
{
    int ok;
    int first_run;

    // bring up the board and the FMC bus
    stm_init();
    fmc_init();

    // green LED on, all the other LEDs off
    led_on(LED_GREEN);
    led_off(LED_RED);
    led_off(LED_YELLOW);
    led_off(LED_BLUE);

    // read the identification registers to make sure ModExpNG is there
    uint32_t name_word0;
    uint32_t name_word1;
    uint32_t version_word;
    fmc_read_32(CORE_ADDR_NAME0, &name_word0);
    fmc_read_32(CORE_ADDR_NAME1, &name_word1);
    fmc_read_32(CORE_ADDR_VERSION, &version_word);

    // the two name words must spell "mode" and "xpng" in ASCII
    const int core_present = (name_word0 == 0x6D6F6465) && (name_word1 == 0x78706E67);
    if (!core_present) {
        led_off(LED_GREEN);
        led_on(LED_RED);
        while (1);
    }

    // check that the reference code reproduces the precomputed values,
    // evaluation stops at the first check that fails
    ok = check_montgomery_factor(1024, N_1024, N_FACTOR_1024)
      && check_montgomery_factor( 512, P_1024, P_FACTOR_1024)
      && check_montgomery_factor( 512, Q_1024, Q_FACTOR_1024)
      && check_montgomery_factor(2048, N_2048, N_FACTOR_2048)
      && check_montgomery_factor(1024, P_2048, P_FACTOR_2048)
      && check_montgomery_factor(1024, Q_2048, Q_FACTOR_2048)
      && check_montgomery_factor(4096, N_4096, N_FACTOR_4096)
      && check_montgomery_factor(2048, P_4096, P_FACTOR_4096)
      && check_montgomery_factor(2048, Q_4096, Q_FACTOR_4096)
      && check_modulus_coeff(1024, N_1024, N_COEFF_1024)
      && check_modulus_coeff( 512, P_1024, P_COEFF_1024)
      && check_modulus_coeff( 512, Q_1024, Q_COEFF_1024)
      && check_modulus_coeff(2048, N_2048, N_COEFF_2048)
      && check_modulus_coeff(1024, P_2048, P_COEFF_2048)
      && check_modulus_coeff(1024, Q_2048, Q_COEFF_2048)
      // the (4096, N_4096, N_COEFF_4096) check is left out: SLOW (~20 sec)
      && check_modulus_coeff(2048, P_4096, P_COEFF_4096)
      && check_modulus_coeff(2048, Q_4096, Q_COEFF_4096);

    if (!ok) {
        led_off(LED_GREEN);
        led_on(LED_RED);
        while (1);
    }

    // sign forever: three key lengths without CRT, then the same three with
    // CRT; once anything fails, "ok" stays 0 and no further operations are
    // started, but the loop keeps blinking the yellow LED
    ok = 1;
    first_run = 1;
    for (;;) {
        ok = ok
          && sign_without_crt(1024, first_run,
                              M_1024, N_1024, N_FACTOR_1024, N_COEFF_1024,
                              X_1024, Y_1024, D_1024, S_1024,
                              XM_1024, YM_1024)
          && sign_without_crt(2048, first_run,
                              M_2048, N_2048, N_FACTOR_2048, N_COEFF_2048,
                              X_2048, Y_2048, D_2048, S_2048,
                              XM_2048, YM_2048)
          && sign_without_crt(4096, first_run,
                              M_4096, N_4096, N_FACTOR_4096, N_COEFF_4096,
                              X_4096, Y_4096, D_4096, S_4096,
                              XM_4096, YM_4096)
          && sign_using_crt(1024, first_run,
                            M_1024, N_1024, N_FACTOR_1024, N_COEFF_1024,
                            X_1024, Y_1024, P_1024, Q_1024,
                            P_FACTOR_1024, P_COEFF_1024, Q_FACTOR_1024, Q_COEFF_1024,
                            DP_1024, DQ_1024, QINV_1024, S_1024,
                            XM_1024, YM_1024)
          && sign_using_crt(2048, first_run,
                            M_2048, N_2048, N_FACTOR_2048, N_COEFF_2048,
                            X_2048, Y_2048, P_2048, Q_2048,
                            P_FACTOR_2048, P_COEFF_2048, Q_FACTOR_2048, Q_COEFF_2048,
                            DP_2048, DQ_2048, QINV_2048, S_2048,
                            XM_2048, YM_2048)
          && sign_using_crt(4096, first_run,
                            M_4096, N_4096, N_FACTOR_4096, N_COEFF_4096,
                            X_4096, Y_4096, P_4096, Q_4096,
                            P_FACTOR_4096, P_COEFF_4096, Q_FACTOR_4096, Q_COEFF_4096,
                            DP_4096, DQ_4096, QINV_4096, S_4096,
                            XM_4096, YM_4096);

        if (!ok) {
            led_off(LED_GREEN);
            led_on(LED_RED);
        }

        first_run = 0;
        toggle_yellow_led();
    }
}
//
// Recompute the Montgomery factor of "mod" with the reference code and
// compare it to the precomputed "mod_factor".
//
// key_length is the modulus size in bits; both arrays hold
// key_length / UINT32_BITS words. Returns 1 on match, 0 on mismatch.
//
int check_montgomery_factor(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_factor)
{
    const uint32_t word_count = key_length / UINT32_BITS;
    uint32_t k;

    // the reference code works on arrays in the opposite word order to the
    // one produced by the C array initializers, so feed it a reversed copy
    for (k = 0; k < word_count; k++) {
        mod_rev[k] = mod[word_count - 1 - k];
    }

    // compute the Montgomery factor
    _calc_montgomery_factor(word_count, mod_rev, mod_factor_rev);

    // the result comes back in the reversed order too, so walk the
    // reference value from its far end while comparing
    for (k = 0; k < word_count; k++) {
        if (mod_factor_rev[k] != mod_factor[word_count - 1 - k]) {
            return 0;
        }
    }

    // all words matched
    return 1;
}
//
// Recompute the modulus-dependent speed-up coefficient of "mod" with the
// reference code and compare it to the precomputed "mod_coeff".
//
// key_length is the modulus size in bits; "mod" holds
// key_length / UINT32_BITS words, "mod_coeff" holds one word more.
// Returns 1 on match, 0 on mismatch.
//
int check_modulus_coeff(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_coeff)
{
    const uint32_t word_count = key_length / UINT32_BITS;
    uint32_t k;

    // the reference code works on arrays in the opposite word order to the
    // one produced by the C array initializers, so feed it a reversed copy
    for (k = 0; k < word_count; k++) {
        mod_rev[k] = mod[word_count - 1 - k];
    }

    // compute the coefficient
    _calc_modulus_coeff(word_count, mod_rev, mod_coeff_rev);

    // the coefficient is one word longer than the modulus, hence the
    // inclusive upper bound; the reference value is walked backwards from
    // index word_count down to 0
    for (k = 0; k <= word_count; k++) {
        if (mod_coeff_rev[k] != mod_coeff[word_count - k]) {
            return 0;
        }
    }

    // all words matched
    return 1;
}
//
// Load one set of operands into the core, run a single operation and verify
// what the core produced.
//
// key_length  - key size in bits, also written to the modulus/exponent
//               width registers
// use_crt     - non-zero selects CRT mode; p, q, p_factor, p_coeff, q_factor,
//               q_coeff, dp, dq and qinv are only dereferenced in that case
// first_run   - non-zero: the read-back blinding factors are compared to
//               xm/ym; zero: they are stored into x/y instead
// m, n, n_factor, n_coeff, x, y - always written to the core (n_coeff is one
//               word longer than the other operands)
// d           - only dereferenced when use_crt is zero (bank D is zeroed
//               otherwise)
// s           - expected signature, always verified
//
// All arrays are read from their last word down to index 0 while the core's
// banks are filled from offset 0 upwards.
//
// Returns 1 when everything that was verified matched and 0 at the first
// mismatching word. Note that on a mismatch with first_run == 0 the words of
// x/y processed before the mismatch have already been overwritten.
//
int _sign_handler(uint32_t key_length, uint32_t use_crt, uint32_t first_run,
                  const uint32_t *m, const uint32_t *n,
                  const uint32_t *n_factor, const uint32_t *n_coeff,
                  uint32_t *x, uint32_t *y,
                  const uint32_t *p, const uint32_t *q,
                  const uint32_t *p_factor, const uint32_t *p_coeff,
                  const uint32_t *q_factor, const uint32_t *q_coeff,
                  const uint32_t *dp, const uint32_t *dq,
                  const uint32_t *d,
                  const uint32_t *qinv,
                  const uint32_t *s,
                  const uint32_t *xm, const uint32_t *ym)
{
    uint32_t i, j, num_cyc;
    // number of 32-bit words in a full-length operand (bits / 32)
    uint32_t num_words = key_length / (8 * sizeof(uint32_t));
    uint32_t num_words_half = num_words >> 1;
    uint32_t reg_control;
    uint32_t reg_status = 0;
    uint32_t reg_mode;
    uint32_t reg_modulus_bits, reg_exponent_bits;

    // fill in all the necessary input values
    // d is only written when CRT is not enabled (we wipe it otherwise just in case)
    // note that n_coeff is one word larger than the modulus, so we need a single
    // extra write after the word-by-word loop
    for (i = 0, j = num_words - 1; i < num_words; i++, j--)
    {
        fmc_write_32(CORE_ADDR_BANK_M        + i * sizeof(uint32_t), m[j]);
        fmc_write_32(CORE_ADDR_BANK_N        + i * sizeof(uint32_t), n[j]);
        fmc_write_32(CORE_ADDR_BANK_N_FACTOR + i * sizeof(uint32_t), n_factor[j]);
        fmc_write_32(CORE_ADDR_BANK_N_COEFF  + i * sizeof(uint32_t), n_coeff[j+1]); // mind the +1
        fmc_write_32(CORE_ADDR_BANK_X        + i * sizeof(uint32_t), x[j]);
        fmc_write_32(CORE_ADDR_BANK_Y        + i * sizeof(uint32_t), y[j]);
        if (!use_crt) fmc_write_32(CORE_ADDR_BANK_D + i * sizeof(uint32_t), d[j]);
        else          fmc_write_32(CORE_ADDR_BANK_D + i * sizeof(uint32_t), 0);
    }
    // i is num_words here, so this lands right after the words written above
    fmc_write_32(CORE_ADDR_BANK_N_COEFF + i * sizeof(uint32_t), n_coeff[0]);

    // also fill in all the input values necessary for CRT mode
    // again, p_coeff and q_coeff each need one extra word after the loop
    if (use_crt)
    {
        for (i = 0, j = num_words_half - 1; i < num_words_half; i++, j--)
        {
            fmc_write_32(CORE_ADDR_BANK_P        + i * sizeof(uint32_t), p[j]);
            fmc_write_32(CORE_ADDR_BANK_Q        + i * sizeof(uint32_t), q[j]);
            fmc_write_32(CORE_ADDR_BANK_P_FACTOR + i * sizeof(uint32_t), p_factor[j]);
            fmc_write_32(CORE_ADDR_BANK_P_COEFF  + i * sizeof(uint32_t), p_coeff[j+1]); // mind the +1!
            fmc_write_32(CORE_ADDR_BANK_Q_FACTOR + i * sizeof(uint32_t), q_factor[j]);
            fmc_write_32(CORE_ADDR_BANK_Q_COEFF  + i * sizeof(uint32_t), q_coeff[j+1]); // mind the +1!
            fmc_write_32(CORE_ADDR_BANK_DP       + i * sizeof(uint32_t), dp[j]);
            fmc_write_32(CORE_ADDR_BANK_DQ       + i * sizeof(uint32_t), dq[j]);
            fmc_write_32(CORE_ADDR_BANK_QINV     + i * sizeof(uint32_t), qinv[j]);
        }
        // i is num_words_half here
        fmc_write_32(CORE_ADDR_BANK_P_COEFF + i * sizeof(uint32_t), p_coeff[0]);
        fmc_write_32(CORE_ADDR_BANK_Q_COEFF + i * sizeof(uint32_t), q_coeff[0]);
    }

    // set parameters (there's no need to divide key length by two when CRT is enabled,
    // the core takes care of that by itself automatically)
    reg_mode = use_crt ? CORE_MODE_USING_CRT : CORE_MODE_WITHOUT_CRT;
    reg_modulus_bits = key_length;
    reg_exponent_bits = key_length;
    fmc_write_32(CORE_ADDR_MODE, reg_mode);
    fmc_write_32(CORE_ADDR_MODULUS_BITS, reg_modulus_bits);
    fmc_write_32(CORE_ADDR_EXPONENT_BITS, reg_exponent_bits);

    // clear 'next' control bit, then set 'next' control bit again to trigger new operation
    reg_control = 0;
    fmc_write_32(CORE_ADDR_CONTROL, reg_control);
    reg_control = CORE_CONTROL_BIT_NEXT;
    fmc_write_32(CORE_ADDR_CONTROL, reg_control);

    // poll the status register until the 'valid' bit is set; there is no
    // timeout, so this spins forever if the core never finishes
    // num_cyc counts the polls, nothing in the code consumes it (presumably it
    // is meant to be inspected from a debugger)
    // NOTE(review): nothing here drives the blue LED; if the busy time is to
    // be measured with a scope, LED on/off calls would have to wrap this loop
    num_cyc = 0;
    do
    {
        num_cyc++;
        fmc_read_32(CORE_ADDR_STATUS, &reg_status);
    }
    while (!(reg_status & CORE_STATUS_BIT_VALID));

    // read back s, xm and ym word-by-word
    // the signature must always stay the same, so it is verified on every call
    // the mutated blinding factors are only known beforehand for the very
    // first call: on the first run they are compared to the reference values,
    // on later runs they replace x and y, so the next operation uses the
    // newly mutated factors
    uint32_t s_word, xm_word, ym_word;
    for (i = 0, j = num_words - 1; i < num_words; i++, j--)
    {
        fmc_read_32(CORE_ADDR_BANK_S  + i * sizeof(uint32_t), &s_word);
        fmc_read_32(CORE_ADDR_BANK_XM + i * sizeof(uint32_t), &xm_word);
        fmc_read_32(CORE_ADDR_BANK_YM + i * sizeof(uint32_t), &ym_word);

        if (s_word != s[j]) return 0;

        if (first_run)
        {
            if (xm_word != xm[j]) return 0;
            if (ym_word != ym[j]) return 0;
        }
        else
        {
            x[j] = xm_word;
            y[j] = ym_word;
        }
    }

    // everything went just fine
    return 1;
}
//
// toggle the yellow led to indicate that we're not stuck somewhere
//
void toggle_yellow_led(void)
{
    // remembers across calls whether the LED is currently lit (starts off)
    static int lit = 0;

    lit = (lit == 0);

    if (!lit) {
        led_off(LED_YELLOW);
        return;
    }

    led_on(LED_YELLOW);
}
//
// SysTick
//
// SysTick interrupt entry point: forwards every tick to the HAL, first
// HAL_IncTick(), then HAL_SYSTICK_IRQHandler()
void SysTick_Handler(void)
{
HAL_IncTick();
HAL_SYSTICK_IRQHandler();
}
//
// End-of-File
//