From 058c54213a307fd360df1486f5d369d04b3a84d9 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Mon, 4 Sep 2017 00:14:09 +0300 Subject: Updated STM32 demo program to show how to use the precomputation block. --- src/rtl/modexpa7_top.v | 40 +++++-- src/stm32/modexpa7_driver_sample.c | 236 ++++++++++++++++++++++++++----------- src/tb/tb_wrapper.v | 131 +++++++++++++++++++- 3 files changed, 322 insertions(+), 85 deletions(-) diff --git a/src/rtl/modexpa7_top.v b/src/rtl/modexpa7_top.v index 7723b88..ea3d2c2 100644 --- a/src/rtl/modexpa7_top.v +++ b/src/rtl/modexpa7_top.v @@ -109,24 +109,38 @@ module modexpa7_top # reg valid_reg = 1'b0; assign ready = ready_reg; - assign valid = valid_reg; + assign valid = valid_reg; + + reg init_trig_latch; + reg next_trig_latch; + + always @(posedge clk) + // + if (fsm_state == FSM_STATE_IDLE) + // + case ({next_trig, init_trig}) + 2'b00: {next_trig_latch, init_trig_latch} <= 2'b00; // do nothing + 2'b01: {next_trig_latch, init_trig_latch} <= 2'b01; // precalculate + 2'b10: {next_trig_latch, init_trig_latch} <= 2'b10; // exponentiate + 2'b11: {next_trig_latch, init_trig_latch} <= 2'b01; // 'init' has priority over 'next' + endcase // ready flag logic always @(posedge clk or negedge rst_n) // - if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state + if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state else case (fsm_state) - FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when operation is started - FSM_STATE_STOP: if (!ready_reg) ready_reg <= 1'b1; // set flag after operation is finished + FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when operation is started + FSM_STATE_STOP: if (init_trig_latch) ready_reg <= 1'b1; // set flag after operation is finished endcase // valid flag logic always @(posedge clk or negedge rst_n) // - if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state + if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state else case (fsm_state) - FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when operation is started - FSM_STATE_STOP: if (!valid_reg) valid_reg <= 1'b1; // set flag after operation is finished + FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when operation is started + FSM_STATE_STOP: if (next_trig_latch) valid_reg <= 1'b1; // set flag after operation is finished endcase @@ -137,14 +151,20 @@ module modexpa7_top # reg [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits_latch; // save number of words in modulus when pre-calculation has been triggered, - // i.e. user has apparently loaded a new modulus into the core + // i.e. user has apparently loaded a new modulus into the core + // + // we also need to update modulus length when user wants to exponentiate, + // because he could have done precomputation for some modulus, then used + // a different length modulus and then reverted back the original modulus + // without doing precomputation (dammit, spent whole day chasing this bug :( always @(posedge clk) // - if (fsm_next_state == FSM_STATE_PRECALC_START) + if ((fsm_next_state == FSM_STATE_PRECALC_START) || + (fsm_next_state == FSM_STATE_EXPONENT_START)) modulus_num_words_latch <= modulus_num_words; // save number of bits in exponent when exponentiation has been triggered, - // i.e. user has loaded a new message into the core and wants exponentiate + // i.e. user has loaded a new message into the core and wants to exponentiate always @(posedge clk) // if (fsm_next_state == FSM_STATE_EXPONENT_START) diff --git a/src/stm32/modexpa7_driver_sample.c b/src/stm32/modexpa7_driver_sample.c index 390c949..e1de2bd 100644 --- a/src/stm32/modexpa7_driver_sample.c +++ b/src/stm32/modexpa7_driver_sample.c @@ -59,12 +59,19 @@ #define CORE_ADDR_BUFFER_BITS (0x13 << 2) #define CORE_ADDR_ARRAY_BITS (0x14 << 2) + // operand bank size +#define BANK_LENGTH 0x200 // 0x200 = 512 bytes = 4096 bits // locations of operand buffers -#define CORE_ADDR_BANK_MODULUS (0x800 + 0 * 0x200) -#define CORE_ADDR_BANK_MESSAGE (0x800 + 1 * 0x200) -#define CORE_ADDR_BANK_EXPONENT (0x800 + 2 * 0x200) -#define CORE_ADDR_BANK_RESULT (0x800 + 3 * 0x200) +#define CORE_ADDR_BANK_MODULUS (BANK_LENGTH * (8 + 0)) +#define CORE_ADDR_BANK_MESSAGE (BANK_LENGTH * (8 + 1)) +#define CORE_ADDR_BANK_EXPONENT (BANK_LENGTH * (8 + 2)) +#define CORE_ADDR_BANK_RESULT (BANK_LENGTH * (8 + 3)) + +#define CORE_ADDR_BANK_MODULUS_COEFF_OUT (BANK_LENGTH * (8 + 4)) +#define CORE_ADDR_BANK_MODULUS_COEFF_IN (BANK_LENGTH * (8 + 5)) +#define CORE_ADDR_BANK_MONTGOMERY_FACTOR_OUT (BANK_LENGTH * (8 + 6)) +#define CORE_ADDR_BANK_MONTGOMERY_FACTOR_IN (BANK_LENGTH * (8 + 7)) // bit maps #define CORE_CONTROL_BIT_INIT 0x00000001 @@ -75,6 +82,27 @@ #define CORE_MODE_BIT_CRT 0x00000002 + /* + * zero operands + */ +#define Z_384 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000} + +#define Z_192 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000} + +#define Z_512 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000} + +#define Z_256 \ + {0x00000000, 0x00000000, 0x00000000, 0x00000000, \ + 0x00000000, 0x00000000, 0x00000000, 0x00000000} /* * test vectors @@ -83,11 +111,15 @@ static const uint32_t m_384[] = M_384; static const uint32_t n_384[] = N_384; static const uint32_t d_384[] = D_384; static const uint32_t s_384[] = S_384; +static uint32_t n_coeff_384[] = Z_384; +static uint32_t factor_384[] = Z_384; static const uint32_t m_512[] = M_512; static const uint32_t n_512[] = N_512; static const uint32_t d_512[] = D_512; static const uint32_t s_512[] = S_512; +static uint32_t n_coeff_512[] = Z_512; +static uint32_t factor_512[] = Z_512; static const uint32_t p_192[] = P_192; static const uint32_t q_192[] = Q_192; @@ -95,6 +127,10 @@ static const uint32_t dp_192[] = DP_192; static const uint32_t dq_192[] = DQ_192; static const uint32_t mp_192[] = MP_192; static const uint32_t mq_192[] = MQ_192; +static uint32_t p_coeff_192[] = Z_192; +static uint32_t q_coeff_192[] = Z_192; +static uint32_t factor_p_192[] = Z_192; +static uint32_t factor_q_192[] = Z_192; static const uint32_t p_256[] = P_256; static const uint32_t q_256[] = Q_256; @@ -102,7 +138,10 @@ static const uint32_t dp_256[] = DP_256; static const uint32_t dq_256[] = DQ_256; static const uint32_t mp_256[] = MP_256; static const uint32_t mq_256[] = MQ_256; - +static uint32_t p_coeff_256[] = Z_256; +static uint32_t q_coeff_256[] = Z_256; +static uint32_t factor_p_256[] = Z_256; +static uint32_t factor_q_256[] = Z_256; /* @@ -110,16 +149,25 @@ static const uint32_t mq_256[] = MQ_256; */ void toggle_yellow_led(void); -void setup_modexpa7( const uint32_t *n, size_t l); +void setup_modexpa7( const uint32_t *n, + uint32_t *coeff, + uint32_t *factor, + size_t l); -int test_modexpa7( const uint32_t *m, +int test_modexpa7( const uint32_t *n, + const uint32_t *m, const uint32_t *d, const uint32_t *s, + const uint32_t *coeff, + const uint32_t *factor, size_t l); -int test_modexpa7_crt( const uint32_t *m, +int test_modexpa7_crt( const uint32_t *n, + const uint32_t *m, const uint32_t *d, const uint32_t *s, + const uint32_t *coeff, + const uint32_t *factor, size_t l); @@ -148,10 +196,10 @@ int main() fmc_read_32(CORE_ADDR_NAME1, &core_name1); fmc_read_32(CORE_ADDR_VERSION, &core_version); - // must be "mode", "xpa7", "0.20" + // must be "mode", "xpa7", "0.25" if ( (core_name0 != 0x6D6F6465) || (core_name1 != 0x78706137) || - (core_version != 0x302E3230)) + (core_version != 0x302E3235)) { led_off(LED_GREEN); led_on(LED_RED); @@ -164,61 +212,63 @@ int main() // largest supported operand width, systolic array "power" fmc_read_32(CORE_ADDR_BUFFER_BITS, &core_buffer_bits); - fmc_read_32(CORE_ADDR_ARRAY_BITS, &core_array_bits); + fmc_read_32(CORE_ADDR_ARRAY_BITS, &core_array_bits); + + // + // do pre-computation for all the moduli and store speed-up quantities, + // note that each key requires three precomputations: one for the entire + // public key and two for each of the corresponding private key components + // + // we set the 'init' control bit, wait for `ready' status bit to go high, + // then retrieve the calculated values from the corresponding "output" banks + // + // we turn off the green led and turn the yellow led during the process to + // get an idea of how long it takes + // + + led_off(LED_GREEN); + led_on(LED_YELLOW); + + // 384-bit key and 192-bit primes + setup_modexpa7(n_384, n_coeff_384, factor_384, 384); + setup_modexpa7(p_192, p_coeff_192, factor_p_192, 192); + setup_modexpa7(q_192, q_coeff_192, factor_q_192, 192); + + // 512-bit key and 256-bit primes + setup_modexpa7(n_512, n_coeff_512, factor_512, 512); + setup_modexpa7(p_256, p_coeff_256, factor_p_256, 256); + setup_modexpa7(q_256, q_coeff_256, factor_q_256, 256); + + led_off(LED_YELLOW); + led_on(LED_GREEN); + // repeat forever while (1) - { - // New modulus requires precomputation of modulus-dependent - // speed-up coefficient, this must be done once per new - // modulus, i.e. when we're repeatedly signing with the - // same key, we only need to do precomputation once before - // starting the very first signing operation. - + { // fresh start ok = 1; - - { - // run precomputation of modulus-dependent factor for the 384-bit modulus - setup_modexpa7(n_384, 384); - - // try signing the message from the 384-bit test vector - ok = ok && test_modexpa7(m_384, d_384, s_384, 384); - } - { - // run precomputation of modulus-dependent factor for the 512-bit modulus - setup_modexpa7(n_512, 512); - - // try signing the message from the 512-bit test vector - ok = ok && test_modexpa7(m_512, d_512, s_512, 512); - } - { - // run precomputation of modulus-dependent factor for the first 192-bit part of 384-bit modulus - setup_modexpa7(p_192, 192); - + { + // try signing the message with the 384-bit test vector + ok = ok && test_modexpa7(n_384, m_384, d_384, s_384, n_coeff_384, factor_384, 384); + // try signing 384-bit base using 192-bit exponent - ok = ok && test_modexpa7_crt(m_384, dp_192, mp_192, 192); - - // run precomputation of modulus-dependent factor for the second 192-bit part of 384-bit modulus - setup_modexpa7(q_192, 192); + ok = ok && test_modexpa7_crt(p_192, m_384, dp_192, mp_192, p_coeff_192, factor_p_192, 192); // try signing 384-bit base using 192-bit exponent - ok = ok && test_modexpa7_crt(m_384, dq_192, mq_192, 192); + ok = ok && test_modexpa7_crt(q_192, m_384, dq_192, mq_192, q_coeff_192, factor_q_192, 192); } + + { + // try signing the message with the 512-bit test vector + ok = ok && test_modexpa7(n_512, m_512, d_512, s_512, n_coeff_512, factor_512, 512); - { - // run precomputation of modulus-dependent factor for the first 256-bit part of 512-bit modulus - setup_modexpa7(p_256, 256); - // try signing 512-bit base using 256-bit exponent - ok = ok && test_modexpa7_crt(m_512, dp_256, mp_256, 256); - - // run precomputation of modulus-dependent factor for the second 256-bit part of 512-bit modulus - setup_modexpa7(q_256, 256); + ok = ok && test_modexpa7_crt(p_256, m_512, dp_256, mp_256, p_coeff_256, factor_p_256, 256); // try signing 512-bit base using 256-bit exponent - ok = ok && test_modexpa7_crt(m_512, dq_256, mq_256, 256); + ok = ok && test_modexpa7_crt(q_256, m_512, dq_256, mq_256, q_coeff_256, factor_q_256, 256); } // turn on the red led to indicate something went wrong @@ -234,15 +284,18 @@ int main() /* - * Load new modulus and do the necessary precomputations. + * Load new modulus and do all the necessary precomputations. */ void setup_modexpa7( const uint32_t *n, + uint32_t *coeff, + uint32_t *factor, size_t l) { size_t i, num_words; uint32_t num_bits; uint32_t reg_control, reg_status; uint32_t n_word; + uint32_t coeff_word, factor_word; uint32_t dummy_num_cyc; // determine numbers of 32-bit words @@ -250,10 +303,9 @@ void setup_modexpa7( const uint32_t *n, // set modulus width num_bits = l; - fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); + fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); - // fill modulus bank (the least significant word - // is at the lowest offset) + // fill modulus bank (the least significant word is at the lowest offset) for (i=0; i> 5; - // set exponent width + // set modulus width, exponent width num_bits = l; - fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); + fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); + fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); // disable CRT mode mode = 0; fmc_write_32(CORE_ADDR_MODE, &mode); - // fill message and exponent banks (the least significant - // word is at the lowest offset) + // fill modulus, message and exponent banks (the least significant + // word is at the lowest offset), we also need to fill "input" core + // banks with previously pre-calculated and saved modulus-dependent + // speed-up coefficient and Montgomery factor for (i=0; i> 5; - // set exponent width + // set modulus width, exponent width num_bits = l; - fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); + fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits); + fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits); // enable CRT mode mode = CORE_MODE_BIT_CRT; fmc_write_32(CORE_ADDR_MODE, &mode); - // fill exponent bank (the least significant word - // is at the lowest offset) + // fill modulus and exponent banks (the least significant word is at + // the lowest offset), we also need to fill "input" core banks with + // previously pre-calculated and saved modulus-dependent speed-up + // coefficient and Montgomery factor for (i=0; i> 32; + end + // + write_reg('h08, 32'd0); // CONTROL.init = 0 + write_reg('h08, 32'd1); // CONTROL.init = 1 + // + poll = 1; + while (poll) begin + #10; + read_reg('h09, tmp); // tmp = STATUS + poll = ~tmp[0]; // poll = STATUS.ready + end + // + // fill banks + // + for (i=0; i<384/32; i=i+1) begin + read_bank(3'b100, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp); + write_bank(3'b101, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp); + read_bank(3'b110, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp); + write_bank(3'b111, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp); + end + // + shreg = M_384; + for (i=0; i<384/32; i=i+1) begin + write_bank(3'b001, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]); + shreg = shreg >> 32; + end + // + shreg = D_384; + for (i=0; i<384/32; i=i+1) begin + write_bank(3'b010, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]); + shreg = shreg >> 32; + end + // + // wipe + // + shreg = {384{1'b0}}; + for (i=0; i<384/32; i=i+1) begin + write_bank(3'b000, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]); + shreg = shreg >> 32; + end + // + write_reg('h08, 32'd0); // CONTROL.init = 0 + write_reg('h08, 32'd1); // CONTROL.init = 1 + // + poll = 1; + while (poll) begin + #10; + read_reg('h09, tmp); // tmp = STATUS + poll = ~tmp[0]; // poll = STATUS.ready + end + // + // restore + // + shreg = N_384; + for (i=0; i<384/32; i=i+1) begin + write_bank(3'b000, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]); + shreg = shreg >> 32; + end + // + // + // + write_reg('h08, 32'd0); // CONTROL.next = 0 + write_reg('h08, 32'd2); // CONTROL.next = 1 + // + poll = 1; + while (poll) begin + #10; + read_reg('h09, tmp); // tmp = STATUS + poll = ~tmp[1]; // poll = STATUS.valid + end + // + for (i=0; i<384/32; i=i+1) begin + read_bank(3'b011, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp); + shreg = {tmp, shreg[383:32]}; + end // end task read_reg; - input [USE_OPERAND_ADDR_WIDTH+1:0] addr; + input [USE_OPERAND_ADDR_WIDTH+2:0] addr; output [ 32-1:0] data; begin bus_cs = 1; @@ -89,9 +183,23 @@ module tb_wrapper; data = bus_rd_data; end endtask + + task read_bank; + input [ 2:0] bank; + input [USE_OPERAND_ADDR_WIDTH-1:0] addr; + output [ 32-1:0] data; + begin + bus_cs = 1; + bus_addr = {1'b1, bank, addr}; + #10; + bus_cs = 0; + bus_addr = 'bX; + data = bus_rd_data; + end + endtask task write_reg; - input [USE_OPERAND_ADDR_WIDTH+1:0] addr; + input [USE_OPERAND_ADDR_WIDTH+2:0] addr; input [ 32-1:0] data; begin bus_cs = 1; @@ -104,6 +212,21 @@ module tb_wrapper; bus_addr = 'bX; end endtask - + + task write_bank; + input [ 2:0] bank; + input [USE_OPERAND_ADDR_WIDTH-1:0] addr; + input [ 32-1:0] data; + begin + bus_cs = 1; + bus_we = 1; + bus_addr = {1'b1, bank, addr}; + bus_wr_data = data; + #10; + bus_cs = 0; + bus_we = 0; + bus_addr = 'bX; + end + endtask endmodule -- cgit v1.2.3