diff options
-rw-r--r-- | src/rtl/modexpa7_top.v | 120 | ||||
-rw-r--r-- | src/rtl/modexpa7_wrapper.v | 30 | ||||
-rw-r--r-- | src/stm32/modexpa7_driver_sample.c | 236 | ||||
-rw-r--r-- | src/tb/tb_wrapper.v | 121 |
4 files changed, 382 insertions, 125 deletions
diff --git a/src/rtl/modexpa7_top.v b/src/rtl/modexpa7_top.v index ad101dd..ea3d2c2 100644 --- a/src/rtl/modexpa7_top.v +++ b/src/rtl/modexpa7_top.v @@ -54,7 +54,7 @@ module modexpa7_top # input bus_cs, input bus_we, - input [OPERAND_ADDR_WIDTH+1:0] bus_addr, + input [OPERAND_ADDR_WIDTH+2:0] bus_addr, input [ 32-1:0] bus_data_wr, output [ 32-1:0] bus_data_rd ); @@ -109,24 +109,38 @@ module modexpa7_top # reg valid_reg = 1'b0; assign ready = ready_reg; - assign valid = valid_reg; + assign valid = valid_reg;
+
+ reg init_trig_latch;
+ reg next_trig_latch;
+
+ always @(posedge clk)
+ //
+ if (fsm_state == FSM_STATE_IDLE)
+ //
+ case ({next_trig, init_trig})
+ 2'b00: {next_trig_latch, init_trig_latch} <= 2'b00; // do nothing
+ 2'b01: {next_trig_latch, init_trig_latch} <= 2'b01; // precalculate
+ 2'b10: {next_trig_latch, init_trig_latch} <= 2'b10; // exponentiate
+ 2'b11: {next_trig_latch, init_trig_latch} <= 2'b01; // 'init' has priority over 'next'
+ endcase // ready flag logic always @(posedge clk or negedge rst_n) // - if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state + if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state else case (fsm_state) - FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when operation is started - FSM_STATE_STOP: if (!ready_reg) ready_reg <= 1'b1; // set flag after operation is finished + FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when operation is started + FSM_STATE_STOP: if (init_trig_latch) ready_reg <= 1'b1; // set flag after operation is finished endcase // valid flag logic always @(posedge clk or negedge rst_n) // - if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state + if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state else case (fsm_state) - FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when operation is started - FSM_STATE_STOP: if (!valid_reg) valid_reg <= 1'b1; // set flag after operation is finished + FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when operation is started + FSM_STATE_STOP: if (next_trig_latch) valid_reg <= 1'b1; // set flag after operation is finished endcase @@ -137,14 +151,20 @@ module modexpa7_top # reg [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits_latch; // save number of words in modulus when pre-calculation has been triggered, - // i.e. user has apparently loaded a new modulus into the core + // i.e. user has apparently loaded a new modulus into the core
+ //
+ // we also need to update modulus length when user wants to exponentiate,
+ // because he could have done precomputation for some modulus, then used
+ // a different length modulus and then reverted back the original modulus
+ // without doing precomputation (dammit, spent whole day chasing this bug :( always @(posedge clk) // - if (fsm_next_state == FSM_STATE_PRECALC_START) + if ((fsm_next_state == FSM_STATE_PRECALC_START) ||
+ (fsm_next_state == FSM_STATE_EXPONENT_START)) modulus_num_words_latch <= modulus_num_words; // save number of bits in exponent when exponentiation has been triggered, - // i.e. user has loaded a new message into the core and wants exponentiate + // i.e. user has loaded a new message into the core and wants to exponentiate always @(posedge clk) // if (fsm_next_state == FSM_STATE_EXPONENT_START) @@ -154,17 +174,21 @@ module modexpa7_top # /* * Split bus address into bank/word parts. */ - wire [ 2 - 1 : 0] bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+1:OPERAND_ADDR_WIDTH]; + wire [ 3 - 1 : 0] bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+2:OPERAND_ADDR_WIDTH]; wire [OPERAND_ADDR_WIDTH - 1 : 0] bus_addr_word = bus_addr[OPERAND_ADDR_WIDTH-1:0]; /* * Define bank offsets. */ - localparam [ 1: 0] BANK_MODULUS = 2'b00; // 0 - localparam [ 1: 0] BANK_MESSAGE = 2'b01; // 1 - localparam [ 1: 0] BANK_EXPONENT = 2'b10; // 2 - localparam [ 1: 0] BANK_RESULT = 2'b11; // 3 + localparam [ 2: 0] BANK_MODULUS = 3'b000; // 0 + localparam [ 2: 0] BANK_MESSAGE = 3'b001; // 1 + localparam [ 2: 0] BANK_EXPONENT = 3'b010; // 2 + localparam [ 2: 0] BANK_RESULT = 3'b011; // 3
+ localparam [ 2: 0] BANK_MODULUS_COEFF_OUT = 3'b100; // 5
+ localparam [ 2: 0] BANK_MODULUS_COEFF_IN = 3'b101; // 4
+ localparam [ 2: 0] BANK_MONTGOMERY_FACTOR_OUT = 3'b110; // 7 + localparam [ 2: 0] BANK_MONTGOMERY_FACTOR_IN = 3'b111; // 6
/* @@ -176,7 +200,7 @@ module modexpa7_top # * * Note, that the core does squaring and multiplication simultaneously, so * there are two identical systolic multipliers inside. It's better to have two - * copies of modulus to give router some freeding in placing the multipliers, + * copies of modulus to give router some freedom in placing the multipliers, * that's why there are actually two identical block memories N1 and N2 instead of N. * User reads from the first one, but writes to both of them. Note that the synthesis * tool might get too clever and find out that N1 and N2 are identical and decide @@ -250,14 +274,18 @@ module modexpa7_top # /* - * Instantiate internal memories. + * Instantiate more block memories.
+ *
+ * Fast modular exponentiation requires two pre-calculated helper quantities: Montgomery
+ * factor F and modulus-dependent speed-up coefficient N_COEFF. This core has two separate
+ * buffers for each of those quantities, during pre-computation F and N_COEFF are written to
+ * the "output" buffers, so that user can retrieve them and store along with the key for
+ * future use. During exponentiation F and N_COEFF are read from the "input" buffers and
+ * must be supplied by user along with the modulus. * - * We have two block memories: F for Montgomery factor and N_COEFF for modulus-dependent - * coefficient, they are written to during pre-calculation and read from during exponentiation. - * - * Note, that there are actually two identical block memories N_COEFF1 and N_COEFF2 instead of - * just one N_COEFF, read the explanation above. F is only used by one of the multipliers, so - * we don't need F1 and F2. + * Note, that there are actually two identical input block memories N_COEFF1 and N_COEFF2
+ * instead of just one N_COEFF, read the explanation above. F is only used by one of
+ * the multipliers, so we don't need F1 and F2. */ wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_wr; @@ -274,20 +302,38 @@ module modexpa7_top # wire core_f_wren; wire core_n_coeff_wren; +
+ wire [ 32-1:0] user_f_out_data; + wire [ 32-1:0] user_f_in_data; + wire [ 32-1:0] user_n_coeff_out_data; + wire [ 32-1:0] user_n_coeff_in_data; +
+ wire user_f_in_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MONTGOMERY_FACTOR_IN); + wire user_n_coeff_in_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MODULUS_COEFF_IN);
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_f (.clk(clk), + bram_f_out (.clk(clk), .a_addr(core_f_addr_wr), .a_out(), .a_wr(core_f_wren), .a_in(core_f_data_wr), + .b_addr(bus_addr_word), .b_out(user_f_out_data)); +
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_f_in (.clk(clk), + .a_addr(bus_addr_word), .a_out(user_f_in_data), .a_wr(user_f_in_wren), .a_in(bus_data_wr), .b_addr(core_f_addr_rd), .b_out(core_f_data_rd)); - +
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_n_coeff1 (.clk(clk), + bram_n_coeff_out (.clk(clk), .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr), + .b_addr(bus_addr_word), .b_out(user_n_coeff_out_data));
+ + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_n_coeff_in1 (.clk(clk), + .a_addr(bus_addr_word), .a_out(user_n_coeff_in_data), .a_wr(user_n_coeff_in_wren), .a_in(bus_data_wr), .b_addr(core_n_coeff1_addr_rd), .b_out(core_n_coeff1_data_rd)); bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_n_coeff2 (.clk(clk), - .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr), + bram_n_coeff_in2 (.clk(clk), + .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_coeff_in_wren), .a_in(bus_data_wr), .b_addr(core_n_coeff2_addr_rd), .b_out(core_n_coeff2_data_rd)); @@ -461,7 +507,7 @@ module modexpa7_top # */ // delay bus_addr_bank by 1 clock cycle to remember from where we've just been reading - reg [1: 0] bus_addr_bank_dly; + reg [2: 0] bus_addr_bank_dly; always @(posedge clk) if (bus_cs) bus_addr_bank_dly <= bus_addr_bank; @@ -474,12 +520,16 @@ module modexpa7_top # // case (bus_addr_bank_dly) // - BANK_MODULUS: bus_data_rd_mux = user_n_data; - BANK_MESSAGE: bus_data_rd_mux = user_m_data; - BANK_EXPONENT: bus_data_rd_mux = user_d_data; - BANK_RESULT: bus_data_rd_mux = user_r_data; + BANK_MODULUS: bus_data_rd_mux = user_n_data; + BANK_MESSAGE: bus_data_rd_mux = user_m_data; + BANK_EXPONENT: bus_data_rd_mux = user_d_data; + BANK_RESULT: bus_data_rd_mux = user_r_data;
+ //
+ BANK_MODULUS_COEFF_OUT: bus_data_rd_mux = user_n_coeff_out_data;
+ BANK_MODULUS_COEFF_IN: bus_data_rd_mux = user_n_coeff_in_data;
+ BANK_MONTGOMERY_FACTOR_OUT: bus_data_rd_mux = user_f_out_data; + BANK_MONTGOMERY_FACTOR_IN: bus_data_rd_mux = user_f_in_data; // endcase - endmodule diff --git a/src/rtl/modexpa7_wrapper.v b/src/rtl/modexpa7_wrapper.v index a4e2319..8ebc22a 100644 --- a/src/rtl/modexpa7_wrapper.v +++ b/src/rtl/modexpa7_wrapper.v @@ -42,7 +42,7 @@ module modexpa7_wrapper # input cs, input we, - input [OPERAND_ADDR_WIDTH+2:0] address, + input [OPERAND_ADDR_WIDTH+3:0] address, input [ 32-1:0] write_data, output [ 32-1:0] read_data ); @@ -54,8 +54,8 @@ module modexpa7_wrapper # localparam ADDR_MSB_REGS = 1'b0; localparam ADDR_MSB_CORE = 1'b1; - wire address_msb = address[OPERAND_ADDR_WIDTH+2]; - wire [OPERAND_ADDR_WIDTH+1:0] address_lsb = address[OPERAND_ADDR_WIDTH+1:0]; + wire address_msb = address[OPERAND_ADDR_WIDTH+3]; + wire [OPERAND_ADDR_WIDTH+2:0] address_lsb = address[OPERAND_ADDR_WIDTH+2:0]; /* @@ -68,17 +68,17 @@ module modexpa7_wrapper # /* * Registers */ - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_NAME0 = 'h00; // - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_NAME1 = 'h01; // - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_VERSION = 'h02; // - - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_CONTROL = 'h08; // {next, init} - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_STATUS = 'h09; // {valid, ready} - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODE = 'h10; // {crt, dummy} - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODULUS_BITS = 'h11; // number of bits in modulus - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_EXPONENT_BITS = 'h12; // number of bits in exponent - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_BUFFER_BITS = 'h13; // largest supported number of bits
- localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_ARRAY_BITS = 'h14; // number of bits in systolic array + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_NAME0 = 'h00; // + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_NAME1 = 'h01; // + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_VERSION = 'h02; // + + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_CONTROL = 'h08; // {next, init} + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_STATUS = 'h09; // {valid, ready} + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_MODE = 'h10; // {crt, dummy} + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_MODULUS_BITS = 'h11; // number of bits in modulus + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_EXPONENT_BITS = 'h12; // number of bits in exponent + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_BUFFER_BITS = 'h13; // largest supported number of bits
+ localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_ARRAY_BITS = 'h14; // number of bits in systolic array localparam CONTROL_INIT_BIT = 0; localparam CONTROL_NEXT_BIT = 1; @@ -91,7 +91,7 @@ module modexpa7_wrapper # localparam CORE_NAME0 = 32'h6D6F6465; // "mode" localparam CORE_NAME1 = 32'h78706137; // "xpa7" - localparam CORE_VERSION = 32'h302E3230; // "0.20" + localparam CORE_VERSION = 32'h302E3235; // "0.25" /* diff --git a/src/stm32/modexpa7_driver_sample.c b/src/stm32/modexpa7_driver_sample.c index 390c949..e1de2bd 100644 --- a/src/stm32/modexpa7_driver_sample.c +++ b/src/stm32/modexpa7_driver_sample.c @@ -59,12 +59,19 @@ #define CORE_ADDR_BUFFER_BITS (0x13 << 2)
#define CORE_ADDR_ARRAY_BITS (0x14 << 2)
+ // operand bank size
+#define BANK_LENGTH 0x200 // 0x200 = 512 bytes = 4096 bits
// locations of operand buffers
-#define CORE_ADDR_BANK_MODULUS (0x800 + 0 * 0x200)
-#define CORE_ADDR_BANK_MESSAGE (0x800 + 1 * 0x200)
-#define CORE_ADDR_BANK_EXPONENT (0x800 + 2 * 0x200)
-#define CORE_ADDR_BANK_RESULT (0x800 + 3 * 0x200)
+#define CORE_ADDR_BANK_MODULUS (BANK_LENGTH * (8 + 0))
+#define CORE_ADDR_BANK_MESSAGE (BANK_LENGTH * (8 + 1))
+#define CORE_ADDR_BANK_EXPONENT (BANK_LENGTH * (8 + 2))
+#define CORE_ADDR_BANK_RESULT (BANK_LENGTH * (8 + 3))
+
+#define CORE_ADDR_BANK_MODULUS_COEFF_OUT (BANK_LENGTH * (8 + 4))
+#define CORE_ADDR_BANK_MODULUS_COEFF_IN (BANK_LENGTH * (8 + 5))
+#define CORE_ADDR_BANK_MONTGOMERY_FACTOR_OUT (BANK_LENGTH * (8 + 6))
+#define CORE_ADDR_BANK_MONTGOMERY_FACTOR_IN (BANK_LENGTH * (8 + 7))
// bit maps
#define CORE_CONTROL_BIT_INIT 0x00000001
@@ -75,6 +82,27 @@ #define CORE_MODE_BIT_CRT 0x00000002
+ /*
+ * zero operands
+ */
+#define Z_384 \
+ {0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+
+#define Z_192 \
+ {0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000}
+
+#define Z_512 \
+ {0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+
+#define Z_256 \
+ {0x00000000, 0x00000000, 0x00000000, 0x00000000, \
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000}
/*
* test vectors
@@ -83,11 +111,15 @@ static const uint32_t m_384[] = M_384; static const uint32_t n_384[] = N_384;
static const uint32_t d_384[] = D_384;
static const uint32_t s_384[] = S_384;
+static uint32_t n_coeff_384[] = Z_384;
+static uint32_t factor_384[] = Z_384;
static const uint32_t m_512[] = M_512;
static const uint32_t n_512[] = N_512;
static const uint32_t d_512[] = D_512;
static const uint32_t s_512[] = S_512;
+static uint32_t n_coeff_512[] = Z_512;
+static uint32_t factor_512[] = Z_512;
static const uint32_t p_192[] = P_192;
static const uint32_t q_192[] = Q_192;
@@ -95,6 +127,10 @@ static const uint32_t dp_192[] = DP_192; static const uint32_t dq_192[] = DQ_192;
static const uint32_t mp_192[] = MP_192;
static const uint32_t mq_192[] = MQ_192;
+static uint32_t p_coeff_192[] = Z_192;
+static uint32_t q_coeff_192[] = Z_192;
+static uint32_t factor_p_192[] = Z_192;
+static uint32_t factor_q_192[] = Z_192;
static const uint32_t p_256[] = P_256;
static const uint32_t q_256[] = Q_256;
@@ -102,7 +138,10 @@ static const uint32_t dp_256[] = DP_256; static const uint32_t dq_256[] = DQ_256;
static const uint32_t mp_256[] = MP_256;
static const uint32_t mq_256[] = MQ_256;
-
+static uint32_t p_coeff_256[] = Z_256;
+static uint32_t q_coeff_256[] = Z_256;
+static uint32_t factor_p_256[] = Z_256;
+static uint32_t factor_q_256[] = Z_256;
/*
@@ -110,16 +149,25 @@ static const uint32_t mq_256[] = MQ_256; */
void toggle_yellow_led(void);
-void setup_modexpa7( const uint32_t *n, size_t l);
+void setup_modexpa7( const uint32_t *n,
+ uint32_t *coeff,
+ uint32_t *factor,
+ size_t l);
-int test_modexpa7( const uint32_t *m,
+int test_modexpa7( const uint32_t *n,
+ const uint32_t *m,
const uint32_t *d,
const uint32_t *s,
+ const uint32_t *coeff,
+ const uint32_t *factor,
size_t l);
-int test_modexpa7_crt( const uint32_t *m,
+int test_modexpa7_crt( const uint32_t *n,
+ const uint32_t *m,
const uint32_t *d,
const uint32_t *s,
+ const uint32_t *coeff,
+ const uint32_t *factor,
size_t l);
@@ -148,10 +196,10 @@ int main() fmc_read_32(CORE_ADDR_NAME1, &core_name1);
fmc_read_32(CORE_ADDR_VERSION, &core_version);
- // must be "mode", "xpa7", "0.20"
+ // must be "mode", "xpa7", "0.25"
if ( (core_name0 != 0x6D6F6465) ||
(core_name1 != 0x78706137) ||
- (core_version != 0x302E3230))
+ (core_version != 0x302E3235))
{
led_off(LED_GREEN);
led_on(LED_RED);
@@ -164,61 +212,63 @@ int main() // largest supported operand width, systolic array "power"
fmc_read_32(CORE_ADDR_BUFFER_BITS, &core_buffer_bits);
- fmc_read_32(CORE_ADDR_ARRAY_BITS, &core_array_bits);
+ fmc_read_32(CORE_ADDR_ARRAY_BITS, &core_array_bits);
+
+ //
+ // do pre-computation for all the moduli and store speed-up quantities,
+ // note that each key requires three precomputations: one for the entire
+ // public key and two for each of the corresponding private key components
+ //
+ // we set the 'init' control bit, wait for `ready' status bit to go high,
+ // then retrieve the calculated values from the corresponding "output" banks
+ //
+ // we turn off the green led and turn the yellow led during the process to
+ // get an idea of how long it takes
+ //
+
+ led_off(LED_GREEN);
+ led_on(LED_YELLOW);
+
+ // 384-bit key and 192-bit primes
+ setup_modexpa7(n_384, n_coeff_384, factor_384, 384);
+ setup_modexpa7(p_192, p_coeff_192, factor_p_192, 192);
+ setup_modexpa7(q_192, q_coeff_192, factor_q_192, 192);
+
+ // 512-bit key and 256-bit primes
+ setup_modexpa7(n_512, n_coeff_512, factor_512, 512);
+ setup_modexpa7(p_256, p_coeff_256, factor_p_256, 256);
+ setup_modexpa7(q_256, q_coeff_256, factor_q_256, 256);
+
+ led_off(LED_YELLOW);
+ led_on(LED_GREEN);
+
// repeat forever
while (1)
- {
- // New modulus requires precomputation of modulus-dependent
- // speed-up coefficient, this must be done once per new
- // modulus, i.e. when we're repeatedly signing with the
- // same key, we only need to do precomputation once before
- // starting the very first signing operation.
-
+ {
// fresh start
ok = 1;
-
- {
- // run precomputation of modulus-dependent factor for the 384-bit modulus
- setup_modexpa7(n_384, 384);
-
- // try signing the message from the 384-bit test vector
- ok = ok && test_modexpa7(m_384, d_384, s_384, 384);
- }
- {
- // run precomputation of modulus-dependent factor for the 512-bit modulus
- setup_modexpa7(n_512, 512);
-
- // try signing the message from the 512-bit test vector
- ok = ok && test_modexpa7(m_512, d_512, s_512, 512);
- }
- {
- // run precomputation of modulus-dependent factor for the first 192-bit part of 384-bit modulus
- setup_modexpa7(p_192, 192);
-
+ {
+ // try signing the message with the 384-bit test vector
+ ok = ok && test_modexpa7(n_384, m_384, d_384, s_384, n_coeff_384, factor_384, 384);
+
// try signing 384-bit base using 192-bit exponent
- ok = ok && test_modexpa7_crt(m_384, dp_192, mp_192, 192);
-
- // run precomputation of modulus-dependent factor for the second 192-bit part of 384-bit modulus
- setup_modexpa7(q_192, 192);
+ ok = ok && test_modexpa7_crt(p_192, m_384, dp_192, mp_192, p_coeff_192, factor_p_192, 192);
// try signing 384-bit base using 192-bit exponent
- ok = ok && test_modexpa7_crt(m_384, dq_192, mq_192, 192);
+ ok = ok && test_modexpa7_crt(q_192, m_384, dq_192, mq_192, q_coeff_192, factor_q_192, 192);
}
+
+ {
+ // try signing the message with the 512-bit test vector
+ ok = ok && test_modexpa7(n_512, m_512, d_512, s_512, n_coeff_512, factor_512, 512);
- {
- // run precomputation of modulus-dependent factor for the first 256-bit part of 512-bit modulus
- setup_modexpa7(p_256, 256);
-
// try signing 512-bit base using 256-bit exponent
- ok = ok && test_modexpa7_crt(m_512, dp_256, mp_256, 256);
-
- // run precomputation of modulus-dependent factor for the second 256-bit part of 512-bit modulus
- setup_modexpa7(q_256, 256);
+ ok = ok && test_modexpa7_crt(p_256, m_512, dp_256, mp_256, p_coeff_256, factor_p_256, 256);
// try signing 512-bit base using 256-bit exponent
- ok = ok && test_modexpa7_crt(m_512, dq_256, mq_256, 256);
+ ok = ok && test_modexpa7_crt(q_256, m_512, dq_256, mq_256, q_coeff_256, factor_q_256, 256);
}
// turn on the red led to indicate something went wrong
@@ -234,15 +284,18 @@ int main() /*
- * Load new modulus and do the necessary precomputations.
+ * Load new modulus and do all the necessary precomputations.
*/
void setup_modexpa7( const uint32_t *n,
+ uint32_t *coeff,
+ uint32_t *factor,
size_t l)
{
size_t i, num_words;
uint32_t num_bits;
uint32_t reg_control, reg_status;
uint32_t n_word;
+ uint32_t coeff_word, factor_word;
uint32_t dummy_num_cyc;
// determine numbers of 32-bit words
@@ -250,10 +303,9 @@ void setup_modexpa7( const uint32_t *n, // set modulus width
num_bits = l;
- fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits);
+ fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits);
- // fill modulus bank (the least significant word
- // is at the lowest offset)
+ // fill modulus bank (the least significant word is at the lowest offset)
for (i=0; i<num_words; i++)
{ n_word = n[i];
fmc_write_32(CORE_ADDR_BANK_MODULUS + ((num_words - (i + 1)) * sizeof(uint32_t)), &n_word);
@@ -273,42 +325,70 @@ void setup_modexpa7( const uint32_t *n, fmc_read_32(CORE_ADDR_STATUS, ®_status);
}
while (!(reg_status & CORE_STATUS_BIT_READY));
+
+ // retrieve the modulus-dependent coefficient and Montgomery factor
+ // from the corresponding core "output" banks and store them for later use
+ for (i=0; i<num_words; i++)
+ {
+ fmc_read_32(CORE_ADDR_BANK_MODULUS_COEFF_OUT + i * sizeof(uint32_t), &coeff_word);
+ coeff[i] = coeff_word;
+
+ fmc_read_32(CORE_ADDR_BANK_MONTGOMERY_FACTOR_OUT + i * sizeof(uint32_t), &factor_word);
+ factor[i] = factor_word;
+ }
}
//
// Sign the message and compare it against the correct reference value.
//
-int test_modexpa7( const uint32_t *m,
+int test_modexpa7( const uint32_t *n,
+ const uint32_t *m,
const uint32_t *d,
const uint32_t *s,
+ const uint32_t *coeff,
+ const uint32_t *factor,
size_t l)
{
size_t i, num_words;
uint32_t num_bits;
uint32_t reg_control, reg_status;
- uint32_t m_word, d_word, s_word;
+ uint32_t n_word, m_word, d_word, s_word;
+ uint32_t coeff_word, factor_word;
uint32_t dummy_num_cyc;
uint32_t mode;
// determine numbers of 32-bit words
num_words = l >> 5;
- // set exponent width
+ // set modulus width, exponent width
num_bits = l;
- fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits);
+ fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits);
+ fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits);
// disable CRT mode
mode = 0;
fmc_write_32(CORE_ADDR_MODE, &mode);
- // fill message and exponent banks (the least significant
- // word is at the lowest offset)
+ // fill modulus, message and exponent banks (the least significant
+ // word is at the lowest offset), we also need to fill "input" core
+ // banks with previously pre-calculated and saved modulus-dependent
+ // speed-up coefficient and Montgomery factor
for (i=0; i<num_words; i++)
- { m_word = m[i];
+ {
+ n_word = n[i];
+ m_word = m[i];
d_word = d[i];
+
+ fmc_write_32(CORE_ADDR_BANK_MODULUS + ((num_words - (i + 1)) * sizeof(uint32_t)), &n_word);
fmc_write_32(CORE_ADDR_BANK_MESSAGE + ((num_words - (i + 1)) * sizeof(uint32_t)), &m_word);
fmc_write_32(CORE_ADDR_BANK_EXPONENT + ((num_words - (i + 1)) * sizeof(uint32_t)), &d_word);
+
+ coeff_word = coeff[i];
+ factor_word = factor[i];
+
+ fmc_write_32(CORE_ADDR_BANK_MODULUS_COEFF_IN + i * sizeof(uint32_t), &coeff_word);
+ fmc_write_32(CORE_ADDR_BANK_MONTGOMERY_FACTOR_IN + i * sizeof(uint32_t), &factor_word);
}
// clear 'next' control bit, then set 'next' control bit again
@@ -331,8 +411,7 @@ int test_modexpa7( const uint32_t *m, {
fmc_read_32(CORE_ADDR_BANK_RESULT + (i * sizeof(uint32_t)), &s_word);
- if (s_word != s[num_words - (i + 1)])
- return 0;
+ if (s_word != s[num_words - (i + 1)]) return 0;
}
// everything went just fine
@@ -340,34 +419,49 @@ int test_modexpa7( const uint32_t *m, }
-int test_modexpa7_crt( const uint32_t *m,
+int test_modexpa7_crt( const uint32_t *n,
+ const uint32_t *m,
const uint32_t *d,
const uint32_t *s,
+ const uint32_t *coeff,
+ const uint32_t *factor,
size_t l)
{
size_t i, num_words;
uint32_t num_bits;
uint32_t reg_control, reg_status;
- uint32_t m_word, d_word, s_word;
+ uint32_t n_word, m_word, d_word, s_word;
+ uint32_t coeff_word, factor_word;
uint32_t dummy_num_cyc;
uint32_t mode;
// determine numbers of 32-bit words
num_words = l >> 5;
- // set exponent width
+ // set modulus width, exponent width
num_bits = l;
- fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits);
+ fmc_write_32(CORE_ADDR_MODULUS_BITS, &num_bits);
+ fmc_write_32(CORE_ADDR_EXPONENT_BITS, &num_bits);
// enable CRT mode
mode = CORE_MODE_BIT_CRT;
fmc_write_32(CORE_ADDR_MODE, &mode);
- // fill exponent bank (the least significant word
- // is at the lowest offset)
+ // fill modulus and exponent banks (the least significant word is at
+ // the lowest offset), we also need to fill "input" core banks with
+ // previously pre-calculated and saved modulus-dependent speed-up
+ // coefficient and Montgomery factor
for (i=0; i<num_words; i++)
- { d_word = d[i];
+ { n_word = n[i];
+ d_word = d[i];
+ fmc_write_32(CORE_ADDR_BANK_MODULUS + ((num_words - (i + 1)) * sizeof(uint32_t)), &n_word);
fmc_write_32(CORE_ADDR_BANK_EXPONENT + ((num_words - (i + 1)) * sizeof(uint32_t)), &d_word);
+
+ coeff_word = coeff[i];
+ factor_word = factor[i];
+
+ fmc_write_32(CORE_ADDR_BANK_MODULUS_COEFF_IN + i * sizeof(uint32_t), &coeff_word);
+ fmc_write_32(CORE_ADDR_BANK_MONTGOMERY_FACTOR_IN + i * sizeof(uint32_t), &factor_word);
}
// fill message bank (the least significant word
diff --git a/src/tb/tb_wrapper.v b/src/tb/tb_wrapper.v index fae0934..c7619f0 100644 --- a/src/tb/tb_wrapper.v +++ b/src/tb/tb_wrapper.v @@ -2,6 +2,13 @@ module tb_wrapper;
+
+ //
+ // Test Vectors
+ //
+ `include "modexp_fpga_model_vectors.v";
+
+
/*
* Settings
*/
@@ -25,7 +32,7 @@ module tb_wrapper; */
reg bus_cs;
reg bus_we;
- reg [USE_OPERAND_ADDR_WIDTH+2:0] bus_addr;
+ reg [USE_OPERAND_ADDR_WIDTH+3:0] bus_addr;
reg [ 32-1:0] bus_wr_data;
wire [ 32-1:0] bus_rd_data;
@@ -47,7 +54,10 @@ module tb_wrapper; .read_data (bus_rd_data)
);
+ integer i;
reg [31: 0] tmp;
+ reg [383:0] shreg;
+ reg poll;
initial begin
//
rst_n = 0;
@@ -61,6 +71,8 @@ module tb_wrapper; //
rst_n = 1;
//
+ // read common registers to make sure core header reads out ok
+ //
read_reg('h00, tmp); // NAME0
read_reg('h01, tmp); // NAME1
read_reg('h02, tmp); // VERSION
@@ -74,11 +86,82 @@ module tb_wrapper; write_reg('h11, 32'd384); // MODULUS_BITS
read_reg ('h11, tmp);
//
+ write_reg('h10, 32'd0); // MODE
+ read_reg ('h10, tmp);
+ //
+ // fill in 384-bit modulus
+ //
+ shreg = N_384;
+ for (i=0; i<384/32; i=i+1) begin
+ write_bank(3'b000, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]);
+ shreg = shreg >> 32;
+ end
+ //
+ // start precomputation
+ //
+ write_reg('h08, 32'd0); // CONTROL.init = 0
+ write_reg('h08, 32'd1); // CONTROL.init = 1
+ //
+ // wait for precomputation to complete
+ //
+ poll = 1;
+ while (poll) begin
+ #10;
+ read_reg('h09, tmp); // tmp = STATUS
+ poll = ~tmp[0]; // poll = STATUS.ready
+ end
+ //
+ // move modulus-dependent coefficient and Montgomery factor
+ // from "output" to "input" banks
+ //
+ for (i=0; i<384/32; i=i+1) begin
+ read_bank (3'b100, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp);
+ write_bank(3'b101, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp);
+ read_bank (3'b110, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp);
+ write_bank(3'b111, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp);
+ end
+ //
+ // fill in 384-bit message
+ //
+ shreg = M_384;
+ for (i=0; i<384/32; i=i+1) begin
+ write_bank(3'b001, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]);
+ shreg = shreg >> 32;
+ end
+ //
+ // fill in 384-bit exponent
+ //
+ shreg = D_384;
+ for (i=0; i<384/32; i=i+1) begin
+ write_bank(3'b010, i[USE_OPERAND_ADDR_WIDTH-1:0], shreg[31:0]);
+ shreg = shreg >> 32;
+ end
+ //
+ // start exponentiation
+ //
+ write_reg('h08, 32'd0); // CONTROL.next = 0
+ write_reg('h08, 32'd2); // CONTROL.next = 1
+ //
+ // wait for exponentiation to complete
+ //
+ poll = 1;
+ while (poll) begin
+ #10;
+ read_reg('h09, tmp); // tmp = STATUS
+ poll = ~tmp[1]; // poll = STATUS.valid
+ end
+ //
+ // read result
+ //
+ for (i=0; i<384/32; i=i+1) begin
+ read_bank(3'b011, i[USE_OPERAND_ADDR_WIDTH-1:0], tmp);
+ shreg = {tmp, shreg[383:32]};
+ end
//
end
task read_reg;
- input [USE_OPERAND_ADDR_WIDTH+1:0] addr;
+ input [USE_OPERAND_ADDR_WIDTH+2:0] addr;
output [ 32-1:0] data;
begin
bus_cs = 1;
@@ -89,9 +172,23 @@ module tb_wrapper; data = bus_rd_data;
end
endtask
+
+ task read_bank;
+ input [ 2:0] bank;
+ input [USE_OPERAND_ADDR_WIDTH-1:0] addr;
+ output [ 32-1:0] data;
+ begin
+ bus_cs = 1;
+ bus_addr = {1'b1, bank, addr};
+ #10;
+ bus_cs = 0;
+ bus_addr = 'bX;
+ data = bus_rd_data;
+ end
+ endtask
task write_reg;
- input [USE_OPERAND_ADDR_WIDTH+1:0] addr;
+ input [USE_OPERAND_ADDR_WIDTH+2:0] addr;
input [ 32-1:0] data;
begin
bus_cs = 1;
@@ -104,6 +201,22 @@ module tb_wrapper; bus_addr = 'bX;
end
endtask
-
+
+ task write_bank;
+ input [ 2:0] bank;
+ input [USE_OPERAND_ADDR_WIDTH-1:0] addr;
+ input [ 32-1:0] data;
+ begin
+ bus_cs = 1;
+ bus_we = 1;
+ bus_addr = {1'b1, bank, addr};
+ bus_wr_data = data;
+ #10;
+ bus_cs = 0;
+ bus_we = 0;
+ bus_addr = 'bX;
+ end
+ endtask
+
endmodule
|