aboutsummaryrefslogtreecommitdiff
path: root/stm32/modexpng_driver_sample.c
diff options
context:
space:
mode:
Diffstat (limited to 'stm32/modexpng_driver_sample.c')
-rw-r--r--stm32/modexpng_driver_sample.c475
1 files changed, 475 insertions, 0 deletions
diff --git a/stm32/modexpng_driver_sample.c b/stm32/modexpng_driver_sample.c
new file mode 100644
index 0000000..1f0a767
--- /dev/null
+++ b/stm32/modexpng_driver_sample.c
@@ -0,0 +1,475 @@
+//
+// simple driver to test "modexpng" core in hardware
+//
+
+//
+// note that this test program needs a custom bitstream in which
+// the core is located at offset 0 (i.e. without the core selector)
+//
+
+// stm32 headers
+#include "stm-init.h"
+#include "stm-led.h"
+#include "stm-fmc.h"
+
+// test vectors (generated by the supplied python math model)
+#include "modexpng_vector_1024.h"
+#include "modexpng_vector_2048.h"
+#include "modexpng_vector_4096.h"
+
+// reference code
+#include "modexpng_util.h"
+
+// locations of core registers (byte offsets: register word index shifted left by 2, i.e. times 4)
+#define CORE_ADDR_NAME0 (0x00 << 2) // main() expects 0x6D6F6465 ("mode") here
+#define CORE_ADDR_NAME1 (0x01 << 2) // main() expects 0x78706E67 ("xpng") here
+#define CORE_ADDR_VERSION (0x02 << 2) // read by main(), the value is not checked
+#define CORE_ADDR_CONTROL (0x08 << 2) // written by _sign_handler() to trigger an operation
+#define CORE_ADDR_STATUS (0x09 << 2) // polled by _sign_handler() until the 'valid' bit is set
+#define CORE_ADDR_MODE (0x10 << 2) // receives CORE_MODE_USING_CRT or CORE_MODE_WITHOUT_CRT
+#define CORE_ADDR_MODULUS_BITS (0x11 << 2) // _sign_handler() writes key_length here
+#define CORE_ADDR_EXPONENT_BITS (0x12 << 2) // _sign_handler() writes key_length here
+#define CORE_ADDR_BANK_BITS (0x13 << 2) // not accessed in this file
+#define CORE_ADDR_NUM_MULTS (0x14 << 2) // not accessed in this file
+
+// locations of data buffers (each is N * 0x1000 plus a slot offset in steps of 0x200, or 0x100 for DP/DQ)
+#define CORE_ADDR_BANK_M (1 * 0x1000 + 0 * 0x200) // input, written by _sign_handler()
+#define CORE_ADDR_BANK_N (1 * 0x1000 + 1 * 0x200) // input, written by _sign_handler()
+#define CORE_ADDR_BANK_N_FACTOR (1 * 0x1000 + 2 * 0x200) // input, written by _sign_handler()
+#define CORE_ADDR_BANK_N_COEFF (1 * 0x1000 + 3 * 0x200) // input, num_words+1 words; with 4096-bit keys the extra word lands at 1 * 0x1000 + 4 * 0x200
+#define CORE_ADDR_BANK_X (1 * 0x1000 + 5 * 0x200) // input, blinding factor x (slot 4 has no macro of its own)
+#define CORE_ADDR_BANK_Y (1 * 0x1000 + 6 * 0x200) // input, blinding factor y
+
+#define CORE_ADDR_BANK_D (2 * 0x1000 + 0 * 0x200) // d is written only without CRT, zeroes are written otherwise
+#define CORE_ADDR_BANK_P (2 * 0x1000 + 1 * 0x200) // CRT only, num_words/2 words
+#define CORE_ADDR_BANK_DP (2 * 0x1000 + 3 * 0x100) // CRT only, starts 0x100 past CORE_ADDR_BANK_P (inside P's 0x200 slot)
+#define CORE_ADDR_BANK_P_FACTOR (2 * 0x1000 + 2 * 0x200) // CRT only
+#define CORE_ADDR_BANK_P_COEFF (2 * 0x1000 + 3 * 0x200) // CRT only, num_words/2+1 words
+#define CORE_ADDR_BANK_Q (2 * 0x1000 + 4 * 0x200) // CRT only, num_words/2 words
+#define CORE_ADDR_BANK_DQ (2 * 0x1000 + 9 * 0x100) // CRT only, starts 0x100 past CORE_ADDR_BANK_Q (inside Q's 0x200 slot)
+#define CORE_ADDR_BANK_Q_FACTOR (2 * 0x1000 + 5 * 0x200) // CRT only
+#define CORE_ADDR_BANK_Q_COEFF (2 * 0x1000 + 6 * 0x200) // CRT only, num_words/2+1 words
+#define CORE_ADDR_BANK_QINV (2 * 0x1000 + 7 * 0x200) // CRT only
+
+#define CORE_ADDR_BANK_S (3 * 0x1000 + 0 * 0x200) // output, read back and compared to the reference signature
+#define CORE_ADDR_BANK_XM (3 * 0x1000 + 1 * 0x200) // output, mutated blinding factor x
+#define CORE_ADDR_BANK_YM (3 * 0x1000 + 2 * 0x200) // output, mutated blinding factor y
+
+// bit maps
+#define CORE_CONTROL_BIT_NEXT 0x00000002 // bit 1 of CONTROL, cleared and then set to start an operation
+#define CORE_STATUS_BIT_VALID 0x00000002 // bit 1 of STATUS, _sign_handler() waits for it before reading results
+
+#define CORE_MODE_USING_CRT 0x00000002 // value written to MODE when use_crt is non-zero
+#define CORE_MODE_WITHOUT_CRT 0x00000000 // value written to MODE when use_crt is zero
+
+
+// test vectors: one set per key length; the *_INIT macros are not defined in this file (presumably in modexpng_vector_*.h)
+// words are stored most significant first, which is why the code below always walks these arrays backwards
+// only X and Y are non-const, because _sign_handler() overwrites them with the mutated factors read from the core
+static const uint32_t M_1024[] = M_1024_INIT;
+static const uint32_t N_1024[] = N_1024_INIT;
+static const uint32_t N_FACTOR_1024[] = N_FACTOR_1024_INIT; // reference for check_montgomery_factor()
+static const uint32_t N_COEFF_1024[] = N_COEFF_1024_INIT; // reference for check_modulus_coeff(), one word longer than N
+static uint32_t X_1024[] = X_1024_INIT; // updated in place by _sign_handler()
+static uint32_t Y_1024[] = Y_1024_INIT; // updated in place by _sign_handler()
+static const uint32_t P_1024[] = P_1024_INIT; // checked in main() with half the key length
+static const uint32_t Q_1024[] = Q_1024_INIT; // checked in main() with half the key length
+static const uint32_t P_FACTOR_1024[] = P_FACTOR_1024_INIT;
+static const uint32_t Q_FACTOR_1024[] = Q_FACTOR_1024_INIT;
+static const uint32_t P_COEFF_1024[] = P_COEFF_1024_INIT;
+static const uint32_t Q_COEFF_1024[] = Q_COEFF_1024_INIT;
+static const uint32_t D_1024[] = D_1024_INIT; // passed only to sign_without_crt()
+static const uint32_t DP_1024[] = DP_1024_INIT; // passed only to sign_using_crt()
+static const uint32_t DQ_1024[] = DQ_1024_INIT; // passed only to sign_using_crt()
+static const uint32_t QINV_1024[] = QINV_1024_INIT; // passed only to sign_using_crt()
+static const uint32_t XM_1024[] = XM_1024_INIT; // expected mutated X, compared during the first run only
+static const uint32_t YM_1024[] = YM_1024_INIT; // expected mutated Y, compared during the first run only
+static const uint32_t S_1024[] = S_1024_INIT; // expected signature, compared on every run
+
+static const uint32_t M_2048[] = M_2048_INIT;
+static const uint32_t N_2048[] = N_2048_INIT;
+static const uint32_t N_FACTOR_2048[] = N_FACTOR_2048_INIT;
+static const uint32_t N_COEFF_2048[] = N_COEFF_2048_INIT;
+static uint32_t X_2048[] = X_2048_INIT; // updated in place by _sign_handler()
+static uint32_t Y_2048[] = Y_2048_INIT; // updated in place by _sign_handler()
+static const uint32_t P_2048[] = P_2048_INIT;
+static const uint32_t Q_2048[] = Q_2048_INIT;
+static const uint32_t P_FACTOR_2048[] = P_FACTOR_2048_INIT;
+static const uint32_t Q_FACTOR_2048[] = Q_FACTOR_2048_INIT;
+static const uint32_t P_COEFF_2048[] = P_COEFF_2048_INIT;
+static const uint32_t Q_COEFF_2048[] = Q_COEFF_2048_INIT;
+static const uint32_t D_2048[] = D_2048_INIT;
+static const uint32_t DP_2048[] = DP_2048_INIT;
+static const uint32_t DQ_2048[] = DQ_2048_INIT;
+static const uint32_t QINV_2048[] = QINV_2048_INIT;
+static const uint32_t XM_2048[] = XM_2048_INIT;
+static const uint32_t YM_2048[] = YM_2048_INIT;
+static const uint32_t S_2048[] = S_2048_INIT;
+
+static const uint32_t M_4096[] = M_4096_INIT;
+static const uint32_t N_4096[] = N_4096_INIT;
+static const uint32_t N_FACTOR_4096[] = N_FACTOR_4096_INIT;
+static const uint32_t N_COEFF_4096[] = N_COEFF_4096_INIT; // its check_modulus_coeff() call in main() is commented out as slow
+static uint32_t X_4096[] = X_4096_INIT; // updated in place by _sign_handler()
+static uint32_t Y_4096[] = Y_4096_INIT; // updated in place by _sign_handler()
+static const uint32_t P_4096[] = P_4096_INIT;
+static const uint32_t Q_4096[] = Q_4096_INIT;
+static const uint32_t P_FACTOR_4096[] = P_FACTOR_4096_INIT;
+static const uint32_t Q_FACTOR_4096[] = Q_FACTOR_4096_INIT;
+static const uint32_t P_COEFF_4096[] = P_COEFF_4096_INIT;
+static const uint32_t Q_COEFF_4096[] = Q_COEFF_4096_INIT;
+static const uint32_t D_4096[] = D_4096_INIT;
+static const uint32_t DP_4096[] = DP_4096_INIT;
+static const uint32_t DQ_4096[] = DQ_4096_INIT;
+static const uint32_t QINV_4096[] = QINV_4096_INIT;
+static const uint32_t XM_4096[] = XM_4096_INIT;
+static const uint32_t YM_4096[] = YM_4096_INIT;
+static const uint32_t S_4096[] = S_4096_INIT;
+
+
+//
+// scratch buffers shared by check_montgomery_factor() and check_modulus_coeff() (BUF_NUM_WORDS is defined outside this file)
+//
+static uint32_t mod_rev[BUF_NUM_WORDS]; // modulus copied with its word order reversed
+static uint32_t mod_factor_rev[BUF_NUM_WORDS]; // output of _calc_montgomery_factor()
+static uint32_t mod_coeff_rev[BUF_NUM_WORDS+1]; // output of _calc_modulus_coeff(), one word longer than the modulus
+
+
+//
+// prototypes (all three int-returning functions report 1 for success and 0 for failure)
+//
+void toggle_yellow_led(void); // flips the yellow LED on every call
+
+int check_montgomery_factor(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_factor); // key_length is in bits
+int check_modulus_coeff(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_coeff); // mod_coeff holds one word more than mod
+
+int _sign_handler(uint32_t key_length, uint32_t use_crt, uint32_t first_run, // key_length is in bits, the other two are flags
+ const uint32_t *m, const uint32_t *n, // always written to the core
+ const uint32_t *n_factor, const uint32_t *n_coeff, // n_coeff holds one word more than n
+ uint32_t *x, uint32_t *y, // overwritten with the values read back when first_run is zero
+ const uint32_t *p, const uint32_t *q, // dereferenced only when use_crt is non-zero
+ const uint32_t *p_factor, const uint32_t *p_coeff, // dereferenced only when use_crt is non-zero
+ const uint32_t *q_factor, const uint32_t *q_coeff, // dereferenced only when use_crt is non-zero
+ const uint32_t *dp, const uint32_t *dq, // dereferenced only when use_crt is non-zero
+ const uint32_t *d, // dereferenced only when use_crt is zero
+ const uint32_t *qinv, // dereferenced only when use_crt is non-zero
+ const uint32_t *s, // expected signature, compared on every call
+ const uint32_t *xm, const uint32_t *ym); // expected mutated factors, compared only when first_run is non-zero
+
+// easier calls: both macros forward to _sign_handler(), fix the use_crt
+// argument (0 or 1) and pass NULL for every pointer that the selected
+// mode does not dereference
+#define sign_without_crt(k,f,m,n,nf,nc,x,y,d,s,xm,ym) \
+ _sign_handler (k,0,f,m,n,nf,nc,x,y,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,d,NULL,s,xm,ym) // use_crt = 0, only d is passed from the private-key set
+
+#define sign_using_crt(k,f,m,n,nf,nc,x,y,p,q,pf,pc,qf,qc,dp,dq,qinv,s,xm,ym) \
+ _sign_handler (k,1,f,m,n,nf,nc,x,y,p,q,pf,pc,qf,qc,dp,dq,NULL,qinv,s,xm,ym) // use_crt = 1, d is passed as NULL
+
+
+//
+// test routine
+//
+// Sequence: initialize the board and the FMC bus, make sure the core at
+// offset 0 reports the name "mode"/"xpng", run the reference code against the
+// precomputed Montgomery factors and modulus coefficients, then sign with
+// every vector set (without and with CRT) in an endless loop.
+//
+// LEDs: green is lit while everything is fine, red replaces green on any
+// failure, yellow toggles once per loop iteration. A failure before the
+// signing loop halts in a busy loop; a failure inside the loop keeps the
+// loop (and the yellow LED) running, but 'ok' stays zero, so no further
+// signing is attempted.
+//
+int main(void)
+{
+    int ok;
+    int first_run;
+
+    // initialize the board and the FMC bus
+    stm_init();
+    fmc_init();
+
+    // initial LED pattern: only green is lit
+    led_on(LED_GREEN);
+    led_off(LED_RED);
+    led_off(LED_YELLOW);
+    led_off(LED_BLUE);
+
+    // make sure, that ModExpNG is there
+    // the results of fmc_read_32() are not checked here, so the variables
+    // start from zero: a read that stores nothing then ends up as a plain
+    // name mismatch instead of a comparison against an indeterminate value
+    uint32_t core_name0 = 0;
+    uint32_t core_name1 = 0;
+    uint32_t core_version = 0;    // read for completeness, not checked
+
+    fmc_read_32(CORE_ADDR_NAME0, &core_name0);
+    fmc_read_32(CORE_ADDR_NAME1, &core_name1);
+    fmc_read_32(CORE_ADDR_VERSION, &core_version);
+
+    // "mode", "xpng"
+    if ((core_name0 != 0x6D6F6465) || (core_name1 != 0x78706E67))
+    {
+        led_off(LED_GREEN);
+        led_on(LED_RED);
+        while (1);
+    }
+
+    // check, that reference code works correctly
+    // (P and Q are half as long as N, hence the halved key lengths)
+    ok = 1;
+
+    ok = ok && check_montgomery_factor(1024, N_1024, N_FACTOR_1024);
+    ok = ok && check_montgomery_factor( 512, P_1024, P_FACTOR_1024);
+    ok = ok && check_montgomery_factor( 512, Q_1024, Q_FACTOR_1024);
+    ok = ok && check_montgomery_factor(2048, N_2048, N_FACTOR_2048);
+    ok = ok && check_montgomery_factor(1024, P_2048, P_FACTOR_2048);
+    ok = ok && check_montgomery_factor(1024, Q_2048, Q_FACTOR_2048);
+    ok = ok && check_montgomery_factor(4096, N_4096, N_FACTOR_4096);
+    ok = ok && check_montgomery_factor(2048, P_4096, P_FACTOR_4096);
+    ok = ok && check_montgomery_factor(2048, Q_4096, Q_FACTOR_4096);
+
+    ok = ok && check_modulus_coeff(1024, N_1024, N_COEFF_1024);
+    ok = ok && check_modulus_coeff( 512, P_1024, P_COEFF_1024);
+    ok = ok && check_modulus_coeff( 512, Q_1024, Q_COEFF_1024);
+    ok = ok && check_modulus_coeff(2048, N_2048, N_COEFF_2048);
+    ok = ok && check_modulus_coeff(1024, P_2048, P_COEFF_2048);
+    ok = ok && check_modulus_coeff(1024, Q_2048, Q_COEFF_2048);
+//  ok = ok && check_modulus_coeff(4096, N_4096, N_COEFF_4096); // SLOW (~20 sec)
+    ok = ok && check_modulus_coeff(2048, P_4096, P_COEFF_4096);
+    ok = ok && check_modulus_coeff(2048, Q_4096, Q_COEFF_4096);
+
+    if (!ok)
+    {
+        led_off(LED_GREEN);
+        led_on(LED_RED);
+        while (1);
+    }
+
+    // repeat forever
+    ok = 1;
+    first_run = 1;
+    while (1)
+    {
+        ok = ok && sign_without_crt(1024, first_run,
+                                    M_1024, N_1024, N_FACTOR_1024, N_COEFF_1024,
+                                    X_1024, Y_1024, D_1024, S_1024,
+                                    XM_1024, YM_1024);
+
+        ok = ok && sign_without_crt(2048, first_run,
+                                    M_2048, N_2048, N_FACTOR_2048, N_COEFF_2048,
+                                    X_2048, Y_2048, D_2048, S_2048,
+                                    XM_2048, YM_2048);
+
+        ok = ok && sign_without_crt(4096, first_run,
+                                    M_4096, N_4096, N_FACTOR_4096, N_COEFF_4096,
+                                    X_4096, Y_4096, D_4096, S_4096,
+                                    XM_4096, YM_4096);
+
+        ok = ok && sign_using_crt(1024, first_run,
+                                  M_1024, N_1024, N_FACTOR_1024, N_COEFF_1024,
+                                  X_1024, Y_1024, P_1024, Q_1024,
+                                  P_FACTOR_1024, P_COEFF_1024, Q_FACTOR_1024, Q_COEFF_1024,
+                                  DP_1024, DQ_1024, QINV_1024, S_1024,
+                                  XM_1024, YM_1024);
+
+        ok = ok && sign_using_crt(2048, first_run,
+                                  M_2048, N_2048, N_FACTOR_2048, N_COEFF_2048,
+                                  X_2048, Y_2048, P_2048, Q_2048,
+                                  P_FACTOR_2048, P_COEFF_2048, Q_FACTOR_2048, Q_COEFF_2048,
+                                  DP_2048, DQ_2048, QINV_2048, S_2048,
+                                  XM_2048, YM_2048);
+
+        ok = ok && sign_using_crt(4096, first_run,
+                                  M_4096, N_4096, N_FACTOR_4096, N_COEFF_4096,
+                                  X_4096, Y_4096, P_4096, Q_4096,
+                                  P_FACTOR_4096, P_COEFF_4096, Q_FACTOR_4096, Q_COEFF_4096,
+                                  DP_4096, DQ_4096, QINV_4096, S_4096,
+                                  XM_4096, YM_4096);
+
+        // no halt here: 'ok' is never set back to 1, so from now on every
+        // sign call above is short-circuited and the red LED stays lit
+        if (!ok)
+        {
+            led_off(LED_GREEN);
+            led_on(LED_RED);
+        }
+
+        // from the second pass on, the mutated factors are no longer compared
+        first_run = 0;
+
+        toggle_yellow_led();
+    }
+}
+
+//
+// recompute the Montgomery factor of 'mod' with the reference code and
+// compare it to the precomputed 'mod_factor'
+//
+// key_length is the size of the modulus in bits, both arrays hold
+// key_length / UINT32_BITS words, most significant word first
+//
+// returns 1 when every word matches, 0 on the first mismatch
+//
+int check_montgomery_factor(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_factor)
+{
+    const uint32_t word_count = key_length / UINT32_BITS;
+    uint32_t pos;
+
+    // the reference routine wants the least significant word in [0],
+    // while the initializers put it in [N-1], so mirror the modulus
+    // into the scratch buffer first
+    for (pos = 0; pos < word_count; pos++)
+        mod_rev[pos] = mod[word_count - 1 - pos];
+
+    // compute Montgomery factor
+    _calc_montgomery_factor(word_count, mod_rev, mod_factor_rev);
+
+    // the result is least significant word first as well, so the
+    // reference value has to be read from its far end
+    for (pos = 0; pos < word_count; pos++)
+    {
+        if (mod_factor_rev[pos] != mod_factor[word_count - 1 - pos])
+            return 0;
+    }
+
+    // all words agree
+    return 1;
+}
+
+
+//
+// recompute the modulus-dependent speed-up coefficient of 'mod' with the
+// reference code and compare it to the precomputed 'mod_coeff'
+//
+// key_length is the size of the modulus in bits; 'mod' holds
+// key_length / UINT32_BITS words and 'mod_coeff' holds one word more,
+// both most significant word first
+//
+// returns 1 when every word matches, 0 on the first mismatch
+//
+int check_modulus_coeff(uint32_t key_length, const uint32_t *mod, const uint32_t *mod_coeff)
+{
+    const uint32_t word_count = key_length / UINT32_BITS;
+    uint32_t pos;
+
+    // the reference routine wants the least significant word in [0],
+    // while the initializers put it in [N-1], so mirror the modulus
+    // into the scratch buffer first
+    for (pos = 0; pos < word_count; pos++)
+        mod_rev[pos] = mod[word_count - 1 - pos];
+
+    // compute modulus-dependent speed-up coefficient
+    _calc_modulus_coeff(word_count, mod_rev, mod_coeff_rev);
+
+    // the coefficient is one word longer than the modulus, so N+1 words
+    // are compared: result word [pos] against reference word [N - pos]
+    for (pos = 0; pos <= word_count; pos++)
+    {
+        if (mod_coeff_rev[pos] != mod_coeff[word_count - pos])
+            return 0;
+    }
+
+    // all words agree
+    return 1;
+}
+
+
+//
+// run one signing operation on the core and verify what comes back
+//
+// key_length - size of the modulus in bits
+// use_crt    - non-zero: CRT mode (p, q, p_factor, p_coeff, q_factor,
+//              q_coeff, dp, dq, qinv are used, d is not dereferenced);
+//              zero: non-CRT mode (only d is used from that group)
+// first_run  - non-zero: the mutated factors read from the core are
+//              compared to xm/ym; zero: they are stored into x/y instead
+// all arrays are most significant word first; n_coeff, p_coeff and q_coeff
+// are one word longer than n, p and q respectively
+//
+// returns 1 when the signature (and, during the first run, the mutated
+// factors) match the reference values, 0 on the first mismatch
+//
+// note, that the function waits for the core without any timeout
+//
+int _sign_handler(uint32_t key_length, uint32_t use_crt, uint32_t first_run,
+                  const uint32_t *m, const uint32_t *n,
+                  const uint32_t *n_factor, const uint32_t *n_coeff,
+                  uint32_t *x, uint32_t *y,
+                  const uint32_t *p, const uint32_t *q,
+                  const uint32_t *p_factor, const uint32_t *p_coeff,
+                  const uint32_t *q_factor, const uint32_t *q_coeff,
+                  const uint32_t *dp, const uint32_t *dq,
+                  const uint32_t *d,
+                  const uint32_t *qinv,
+                  const uint32_t *s,
+                  const uint32_t *xm, const uint32_t *ym)
+{
+    uint32_t i, j, num_cyc;
+    uint32_t num_words = (key_length / sizeof(uint32_t)) >> 3;    // bits / 4 / 8 = 32-bit words
+    uint32_t num_words_half = num_words >> 1;
+    uint32_t reg_control;
+    uint32_t reg_status = 0;    // defined value even if a status read stores nothing
+    uint32_t reg_mode;
+    uint32_t reg_modulus_bits, reg_exponent_bits;
+
+    // fill in all the necessary input values
+    // d is only written when CRT is not enabled (we wipe it otherwise just in case)
+    // note, that n_coeff is one word larger, than the modulus, so we need a single
+    // extra write after the word-by-word loop
+    // i counts core words upwards, j walks the source arrays downwards
+    for (i=0, j=num_words-1; i<num_words; i++, j--)
+    {
+        fmc_write_32(CORE_ADDR_BANK_M        + i * sizeof(uint32_t), m[j]);
+        fmc_write_32(CORE_ADDR_BANK_N        + i * sizeof(uint32_t), n[j]);
+        fmc_write_32(CORE_ADDR_BANK_N_FACTOR + i * sizeof(uint32_t), n_factor[j]);
+        fmc_write_32(CORE_ADDR_BANK_N_COEFF  + i * sizeof(uint32_t), n_coeff[j+1]);    // mind the +1
+        fmc_write_32(CORE_ADDR_BANK_X        + i * sizeof(uint32_t), x[j]);
+        fmc_write_32(CORE_ADDR_BANK_Y        + i * sizeof(uint32_t), y[j]);
+        if (!use_crt) fmc_write_32(CORE_ADDR_BANK_D + i * sizeof(uint32_t), d[j]);
+        else          fmc_write_32(CORE_ADDR_BANK_D + i * sizeof(uint32_t), 0);
+    }
+    // j has wrapped around, so j+1 is 0 by now, i is num_words
+    fmc_write_32(CORE_ADDR_BANK_N_COEFF + i * sizeof(uint32_t), n_coeff[0]);
+
+    // also fill in all the input values necessary for CRT mode
+    // again, we need to write a pair of extra words for p_coeff and q_coeff after the loop
+    if (use_crt)
+    {
+        for (i=0, j=num_words_half-1; i<num_words_half; i++, j--)
+        {
+            fmc_write_32(CORE_ADDR_BANK_P        + i * sizeof(uint32_t), p[j]);
+            fmc_write_32(CORE_ADDR_BANK_Q        + i * sizeof(uint32_t), q[j]);
+            fmc_write_32(CORE_ADDR_BANK_P_FACTOR + i * sizeof(uint32_t), p_factor[j]);
+            fmc_write_32(CORE_ADDR_BANK_P_COEFF  + i * sizeof(uint32_t), p_coeff[j+1]);    // mind the +1!
+            fmc_write_32(CORE_ADDR_BANK_Q_FACTOR + i * sizeof(uint32_t), q_factor[j]);
+            fmc_write_32(CORE_ADDR_BANK_Q_COEFF  + i * sizeof(uint32_t), q_coeff[j+1]);    // mind the +1!
+            fmc_write_32(CORE_ADDR_BANK_DP       + i * sizeof(uint32_t), dp[j]);
+            fmc_write_32(CORE_ADDR_BANK_DQ       + i * sizeof(uint32_t), dq[j]);
+            fmc_write_32(CORE_ADDR_BANK_QINV     + i * sizeof(uint32_t), qinv[j]);
+        }
+        // j+1 is 0 by now, i is num_words_half
+        fmc_write_32(CORE_ADDR_BANK_P_COEFF + i * sizeof(uint32_t), p_coeff[0]);
+        fmc_write_32(CORE_ADDR_BANK_Q_COEFF + i * sizeof(uint32_t), q_coeff[0]);
+    }
+
+    // set parameters (there's no need to divide key length by two when CRT is enabled,
+    // the core takes care of that by itself automatically)
+    reg_mode = use_crt ? CORE_MODE_USING_CRT : CORE_MODE_WITHOUT_CRT;
+    reg_modulus_bits = key_length;
+    reg_exponent_bits = key_length;
+
+    fmc_write_32(CORE_ADDR_MODE, reg_mode);
+    fmc_write_32(CORE_ADDR_MODULUS_BITS, reg_modulus_bits);
+    fmc_write_32(CORE_ADDR_EXPONENT_BITS, reg_exponent_bits);
+
+    // clear 'next' control bit, then set 'next' control bit again to trigger new operation
+    reg_control = 0;
+    fmc_write_32(CORE_ADDR_CONTROL, reg_control);
+    reg_control = CORE_CONTROL_BIT_NEXT;
+    fmc_write_32(CORE_ADDR_CONTROL, reg_control);
+
+    // wait for 'valid' status bit to be set, also turn on the blue LED while the
+    // core is busy to allow precise measurement with a scope
+    num_cyc = 0;
+    led_on(LED_BLUE);
+    do
+    {
+        num_cyc++;
+        fmc_read_32(CORE_ADDR_STATUS, &reg_status);
+    }
+    while (!(reg_status & CORE_STATUS_BIT_VALID));
+    led_off(LED_BLUE);
+
+    // the poll counter is not used any further (presumably it is only
+    // there to be inspected in a debugger)
+    (void)num_cyc;
+
+    // read back s, xm and ym word-by-word
+    // the signature should always stay the same, so it is verified on every call
+    // the mutated blinding factors are only known beforehand for the very
+    // first operation, so they are compared to the reference values only
+    // when first_run is set; on later calls they are not verified, they
+    // replace x and y instead, so the next operation signs with the
+    // mutated factors
+    uint32_t s_word = 0, xm_word = 0, ym_word = 0;
+    for (i=0, j=num_words-1; i<num_words; i++, j--)
+    {
+        fmc_read_32(CORE_ADDR_BANK_S  + i * sizeof(uint32_t), &s_word);
+        fmc_read_32(CORE_ADDR_BANK_XM + i * sizeof(uint32_t), &xm_word);
+        fmc_read_32(CORE_ADDR_BANK_YM + i * sizeof(uint32_t), &ym_word);
+
+        if (s_word != s[j]) return 0;
+
+        if (first_run)
+        {
+            if (xm_word != xm[j]) return 0;
+            if (ym_word != ym[j]) return 0;
+        }
+        else
+        {
+            x[j] = xm_word;
+            y[j] = ym_word;
+        }
+    }
+
+    // everything went just fine
+    return 1;
+}
+
+
+//
+// toggle the yellow led to indicate that we're not stuck somewhere
+//
+void toggle_yellow_led(void)
+{
+    // remembers across calls whether the LED is currently lit (starts dark)
+    static int yellow_lit = 0;
+
+    // lit -> switch it off and remember that
+    if (yellow_lit)
+    {
+        yellow_lit = 0;
+        led_off(LED_YELLOW);
+        return;
+    }
+
+    // dark -> switch it on and remember that
+    yellow_lit = 1;
+    led_on(LED_YELLOW);
+}
+
+
+//
+// SysTick handler (presumably installed as the SysTick interrupt vector): hands every tick over to the two HAL routines below
+//
+void SysTick_Handler(void)
+{
+ HAL_IncTick(); // NOTE(review): defined outside this file, presumably advances the HAL tick counter -- confirm
+ HAL_SYSTICK_IRQHandler(); // NOTE(review): defined outside this file, presumably the HAL's own SysTick processing -- confirm
+}
+
+
+//
+// End-of-File
+//