From 058c54213a307fd360df1486f5d369d04b3a84d9 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Mon, 4 Sep 2017 00:14:09 +0300 Subject: Updated STM32 demo program to show how to use the precomputation block. --- src/stm32/modexpa7_driver_sample.c | 236 ++++++++++++++++++++++++++----------- 1 file changed, 165 insertions(+), 71 deletions(-) (limited to 'src/stm32') diff --git a/src/stm32/modexpa7_driver_sample.c b/src/stm32/modexpa7_driver_sample.c index 390c949..e1de2bd 100644 --- a/src/stm32/modexpa7_driver_sample.c +++ b/src/stm32/modexpa7_driver_sample.c @@ -59,12 +59,19 @@ #define CORE_ADDR_BUFFER_BITS (0x13 << 2) #define CORE_ADDR_ARRAY_BITS (0x14 << 2) + // operand bank size +#define BANK_LENGTH 0x200 // 0x200 = 512 bytes = 4096 bits // locations of operand buffers -#define CORE_ADDR_BANK_MODULUS (0x800 + 0 * 0x200) -#define CORE_ADDR_BANK_MESSAGE (0x800 + 1 * 0x200) -#define CORE_ADDR_BANK_EXPONENT (0x800 + 2 * 0x200) -#define CORE_ADDR_BANK_RESULT (0x800 + 3 * 0x200) +#define CORE_ADDR_BANK_MODULUS (BANK_LENGTH * (8 + 0)) +#define CORE_ADDR_BANK_MESSAGE (BANK_LENGTH * (8 + 1)) +#define CORE_ADDR_BANK_EXPONENT (BANK_LENGTH * (8 + 2)) +#define CORE_ADDR_BANK_RESULT (BANK_LENGTH * (8 + 3)) + +#define CORE_ADDR_BANK_MODULUS_COEFF_OUT (BANK_LENGTH * (8 + 4)) +#define CORE_ADDR_BANK_MODULUS_COEFF_IN (BANK_LENGTH * (8 + 5)) +#define CORE_ADDR_BANK_MONTGOMERY_FACTOR_OUT (BANK_LENGTH * (8 + 6)) +#define CORE_ADDR_BANK_MONTGOMERY_FACTOR_IN (BANK_LENGTH * (8 + 7)) // bit maps #define CORE_CONTROL_BIT_INIT 0x00000001 @@ -75,6 +82,27 @@ #define CORE_MODE_BIT_CRT 0x00000002 + /* + * zero operands + */ +#define Z_384 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000} + +#define Z_192 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000} + +#define Z_512 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 
0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000} + +#define Z_256 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000} /* * test vectors @@ -83,11 +111,15 @@ static const uint32_t m_384[] = M_384; static const uint32_t n_384[] = N_384; static const uint32_t d_384[] = D_384; static const uint32_t s_384[] = S_384; +static uint32_t n_coeff_384[] = Z_384; +static uint32_t factor_384[] = Z_384; static const uint32_t m_512[] = M_512; static const uint32_t n_512[] = N_512; static const uint32_t d_512[] = D_512; static const uint32_t s_512[] = S_512; +static uint32_t n_coeff_512[] = Z_512; +static uint32_t factor_512[] = Z_512; static const uint32_t p_192[] = P_192; static const uint32_t q_192[] = Q_192; @@ -95,6 +127,10 @@ static const uint32_t dp_192[] = DP_192; static const uint32_t dq_192[] = DQ_192; static const uint32_t mp_192[] = MP_192; static const uint32_t mq_192[] = MQ_192; +static uint32_t p_coeff_192[] = Z_192; +static uint32_t q_coeff_192[] = Z_192; +static uint32_t factor_p_192[] = Z_192; +static uint32_t factor_q_192[] = Z_192; static const uint32_t p_256[] = P_256; static const uint32_t q_256[] = Q_256; @@ -102,7 +138,10 @@ static const uint32_t dp_256[] = DP_256; static const uint32_t dq_256[] = DQ_256; static const uint32_t mp_256[] = MP_256; static const uint32_t mq_256[] = MQ_256; - +static uint32_t p_coeff_256[] = Z_256; +static uint32_t q_coeff_256[] = Z_256; +static uint32_t factor_p_256[] = Z_256; +static uint32_t factor_q_256[] = Z_256; /* @@ -110,16 +149,25 @@ static const uint32_t mq_256[] = MQ_256; */ void toggle_yellow_led(void); -void setup_modexpa7( const uint32_t *n, size_t l); +void setup_modexpa7( const uint32_t *n, + uint32_t *coeff, + uint32_t *factor, + size_t l); -int test_modexpa7( const uint32_t *m, +int test_modexpa7( const uint32_t *n, + const uint32_t *m, const uint32_t *d, const uint32_t 
*s, + const uint32_t *coeff, + const uint32_t *factor, size_t l); -int test_modexpa7_crt( const uint32_t *m, +int test_modexpa7_crt( const uint32_t *n, + const uint32_t *m, const uint32_t *d, const uint32_t *s, + const uint32_t *coeff, + const uint32_t *factor, size_t l); @@ -148,10 +196,10 @@ int main() fmc_read_32(CORE_ADDR_NAME1, &core_name1); fmc_read_32(CORE_ADDR_VERSION, &core_version); - // must be "mode", "xpa7", "0.20" + // must be "mode", "xpa7", "0.25" if ( (core_name0 != 0x6D6F6465) || (core_name1 != 0x78706137) || - (core_version != 0x302E3230)) + (core_version != 0x302E3235)) { led_off(LED_GREEN); led_on(LED_RED); @@ -164,61 +212,63 @@ int main() // largest supported operand width, systolic array "power" fmc_read_32(CORE_ADDR_BUFFER_BITS, &core_buffer_bits); - fmc_read_32(CORE_ADDR_ARRAY_BITS, &core_array_bits); + fmc_read_32(CORE_ADDR_ARRAY_BITS, &core_array_bits); + + // + // do pre-computation for all the moduli and store speed-up quantities, + // note that each key requires three precomputations: one for the entire + // public key and one for each of the two corresponding private key components + // + // we set the 'init' control bit, wait for the 'ready' status bit to go high, + // then retrieve the calculated values from the corresponding "output" banks + // + // we turn off the green led and turn on the yellow led during the process to + // get an idea of how long it takes + // + + led_off(LED_GREEN); + led_on(LED_YELLOW); + + // 384-bit key and 192-bit primes + setup_modexpa7(n_384, n_coeff_384, factor_384, 384); + setup_modexpa7(p_192, p_coeff_192, factor_p_192, 192); + setup_modexpa7(q_192, q_coeff_192, factor_q_192, 192); + + // 512-bit key and 256-bit primes + setup_modexpa7(n_512, n_coeff_512, factor_512, 512); + setup_modexpa7(p_256, p_coeff_256, factor_p_256, 256); + setup_modexpa7(q_256, q_coeff_256, factor_q_256, 256); + + led_off(LED_YELLOW); + led_on(LED_GREEN); + // repeat forever while (1) - { - // New modulus requires precomputation of
modulus-dependent - // speed-up coefficient, this must be done once per new - // modulus, i.e. when we're repeatedly signing with the - // same key, we only need to do precomputation once before - // starting the very first signing operation. - + { // fresh start ok = 1; - - { - // run precomputation of modulus-dependent factor for the 384-bit modulus - setup_modexpa7(n_384, 384); - - // try signing the message from the 384-bit test vector - ok = ok && test_modexpa7(m_384, d_384, s_384, 384); - } - { - // run precomputation of modulus-dependent factor for the 512-bit modulus - setup_modexpa7(n_512, 512); - - // try signing the message from the 512-bit test vector - ok = ok && test_modexpa7(m_512, d_512, s_512, 512); - } - { - // run precomputation of modulus-dependent factor for the first 192-bit part of 384-bit modulus - setup_modexpa7(p_192, 192); - + { + // try signing the message with the 384-bit test vector + ok = ok && test_modexpa7(n_384, m_384, d_384, s_384, n_coeff_384, factor_384, 384); + // try signing 384-bit base using 192-bit exponent - ok = ok && test_modexpa7_crt(m_384, dp_192, mp_192, 192); - - // run precomputation of modulus-dependent factor for the second 192-bit part of 384-bit modulus - setup_modexpa7(q_192, 192); + ok = ok && test_modexpa7_crt(p_192, m_384, dp_192, mp_192, p_coeff_192, factor_p_192, 192); // try signing 384-bit base using 192-bit exponent - ok = ok && test_modexpa7_crt(m_384, dq_192, mq_192, 192); + ok = ok && test_modexpa7_crt(q_192, m_384, dq_192, mq_192, q_coeff_192, factor_q_192, 192); } + + { + // try signing the message with the 512-bit test vector + ok = ok && test_modexpa7(n_512, m_512, d_512, s_512, n_coeff_512, factor_512, 512); - { - // run precomputation of modulus-dependent factor for the first 256-bit part of 512-bit modulus - setup_modexpa7(p_256, 256); - // try signing 512-bit base using 256-bit exponent - ok = ok && test_modexpa7_crt(m_512, dp_256, mp_256, 256); - - // run precomputation of modulus-dependent 
factor for the second 256-bit part of 512-bit modulus - setup_modexpa7(q_256, 256); + ok = ok && test_modexpa7_crt(p_256, m_512, dp_256, mp_256, p_coeff_256, factor_p_256, 256); // try signing 512-bit base using 256-bit exponent - ok = ok && test_modexpa7_crt(m_512, dq_256, mq_256, 256); + ok = ok && test_modexpa7_crt(q_256, m_512, dq_256, mq_256, q_coeff_256, factor_q_256, 256); } // turn on the red led to indicate something went wrong @@ -234,15 +284,18 @@ int main() /* - * Load new modulus and do the necessary precomputations. + * Load new modulus and do all the necessary precomputations. */ void setup_modexpa7( const uint32_t *n, + uint32_t *coeff, + uint32_t *factor, size_t l) { size_t i, num_words; uint32_t num_bits; uint32_t reg_control, reg_status; uint32_t n_word; + uint32_t coeff_word, factor_word; uint32_t dummy_num_cyc; // determine numbers of 32-bit words @@ -250,10 +303,9 @@ void setup_modexpa7( const uint32_t *n, // set modulus width num_bits = l; - fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); + fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); - // fill modulus bank (the least significant word - // is at the lowest offset) + // fill modulus bank (the least significant word is at the lowest offset) for (i=0; i> 5; - // set exponent width + // set modulus width, exponent width num_bits = l; - fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); + fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); + fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); // disable CRT mode mode = 0; fmc_write_32(CORE_ADDR_MODE, &mode); - // fill message and exponent banks (the least significant - // word is at the lowest offset) + // fill modulus, message and exponent banks (the least significant + // word is at the lowest offset), we also need to fill "input" core + // banks with previously pre-calculated and saved modulus-dependent + // speed-up coefficient and Montgomery factor for (i=0; i> 5; - // set exponent width + // set modulus width, exponent width num_bits = l; - 
fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); + fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); + fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); // enable CRT mode mode = CORE_MODE_BIT_CRT; fmc_write_32(CORE_ADDR_MODE, &mode); - // fill exponent bank (the least significant word - // is at the lowest offset) + // fill modulus and exponent banks (the least significant word is at + // the lowest offset), we also need to fill "input" core banks with + // previously pre-calculated and saved modulus-dependent speed-up + // coefficient and Montgomery factor for (i=0; i