From 516ca870fd3ad2a87e0ac56f0d453667e021c52d Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Sun, 11 Apr 2021 17:21:36 +0300 Subject: * Microcode layer redesigned to take advantage of the Montgomery ladder architecture. Instead of R and S there are now two working ("cycle") registers R0 and R1. After every cycle R0+R1 is placed in register S ("sum"), and either 2*R0 or 2*R1 (depending on the current multiplier bit) is placed in register T. Then the working variables are updated; the final result ends up in R0. * Due to the change of working registers, modular inversion routines were updated accordingly. * Added optional debugging output control --- ecdsa_fpga_microcode.cpp | 49 ++++++++++++------------ ecdsa_fpga_microcode.h | 96 ++++++++++++++++++++++-------------------------- 2 files changed, 68 insertions(+), 77 deletions(-) diff --git a/ecdsa_fpga_microcode.cpp b/ecdsa_fpga_microcode.cpp index f02dc8a..2171ac2 100644 --- a/ecdsa_fpga_microcode.cpp +++ b/ecdsa_fpga_microcode.cpp @@ -59,10 +59,8 @@ FPGA_BUFFER BUF_HI[ECDSA_UOP_OPERAND_COUNT]; //------------------------------------------------------------------------------ // Global Flags //------------------------------------------------------------------------------ -bool uop_flagz_sz; -bool uop_flagz_rz; -bool uop_flagz_e; -bool uop_flagz_f; +bool uop_flagz_r0z; +bool uop_flagz_r1z; //------------------------------------------------------------------------------ @@ -96,17 +94,11 @@ void uop_cmpz(UOP_BANK src, int s_op) switch (s_op) { - case CYCLE_SZ: - uop_flagz_sz = flagz; + case CYCLE_R0Z: + uop_flagz_r0z = flagz; break; - case CYCLE_RZ: - uop_flagz_rz = flagz; - break; - case CYCLE_E: - uop_flagz_e = flagz; - break; - case CYCLE_F: - uop_flagz_f = flagz; + case CYCLE_R1Z: + uop_flagz_r1z = flagz; break; } } @@ -141,6 +133,13 @@ void uop_calc(UOP_MATH math, if (math == ADD) fpga_modular_add(s_ptr1, s_ptr2, d_ptr); if (math == SUB) fpga_modular_sub(s_ptr1, s_ptr2, d_ptr); if (math == MUL) fpga_modular_mul(s_ptr1,
s_ptr2, d_ptr); + +#ifdef DUMP_UOP_OUTPUTS + if (math == ADD) dump_uop_output("ADD", d_ptr); + if (math == SUB) dump_uop_output("SUB", d_ptr); + if (math == MUL) dump_uop_output("MUL", d_ptr); +#endif + } @@ -201,16 +200,16 @@ void fpga_modular_inv23_p256_microcode() // first obtain intermediate helper quantities (X#) // mirror X1 to HI bank (don't waste time copying to X1, just use RZ) - uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_RZ); + uop_move(BANK_LO, CYCLE_R0Z, BANK_HI, CYCLE_R0Z); // compute X2 and mirror to the other bank - uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ, BANK_HI, INVERT_R1); - uop_calc(MUL, BANK_HI, CYCLE_RZ, INVERT_R1, BANK_LO, INVERT_X2); + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, INVERT_R1); + uop_calc(MUL, BANK_HI, CYCLE_R0Z, INVERT_R1, BANK_LO, INVERT_X2); uop_move(BANK_LO, INVERT_X2, BANK_HI, INVERT_X2); // compute X3 and mirror to the other bank uop_calc(MUL, BANK_LO, INVERT_X2, INVERT_X2, BANK_HI, INVERT_R1); - uop_calc(MUL, BANK_HI, INVERT_R1, CYCLE_RZ, BANK_LO, INVERT_X3); + uop_calc(MUL, BANK_HI, INVERT_R1, CYCLE_R0Z, BANK_LO, INVERT_X3); uop_move(BANK_LO, INVERT_X3, BANK_HI, INVERT_X3); // compute X6 (stored in the lower bank) @@ -257,7 +256,7 @@ void fpga_modular_inv23_p256_microcode() uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1); uop_repeat(); - uop_calc(MUL, BANK_LO, INVERT_R2, CYCLE_RZ, BANK_HI, INVERT_R1); + uop_calc(MUL, BANK_LO, INVERT_R2, CYCLE_R0Z, BANK_HI, INVERT_R1); uop_cycle(128); uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2); @@ -287,7 +286,7 @@ void fpga_modular_inv23_p256_microcode() // A3 ends up in the upper bank by itself uop_calc(MUL, BANK_HI, INVERT_A2, INVERT_A2, BANK_LO, INVERT_R1); - uop_calc(MUL, BANK_LO, INVERT_R1, CYCLE_RZ, BANK_HI, INVERT_A3); + uop_calc(MUL, BANK_LO, INVERT_R1, CYCLE_R0Z, BANK_HI, INVERT_A3); /* END_MICROCODE */ } @@ -322,16 +321,16 @@ void fpga_modular_inv23_p384_microcode() // first obtain intermediate helper quantities 
(X#) // mirror X1 to HI bank (don't waste time copying to X1, just use RZ) - uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_RZ); + uop_move(BANK_LO, CYCLE_R0Z, BANK_HI, CYCLE_R0Z); // compute X2 and mirror to the other bank - uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ, BANK_HI, INVERT_R1); - uop_calc(MUL, BANK_HI, CYCLE_RZ, INVERT_R1, BANK_LO, INVERT_X2); + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, INVERT_R1); + uop_calc(MUL, BANK_HI, CYCLE_R0Z, INVERT_R1, BANK_LO, INVERT_X2); uop_move(BANK_LO, INVERT_X2, BANK_HI, INVERT_X2); // compute X3 and mirror to the other bank uop_calc(MUL, BANK_LO, INVERT_X2, INVERT_X2, BANK_HI, INVERT_R1); - uop_calc(MUL, BANK_HI, INVERT_R1, CYCLE_RZ, BANK_LO, INVERT_X3); + uop_calc(MUL, BANK_HI, INVERT_R1, CYCLE_R0Z, BANK_LO, INVERT_X3); uop_move(BANK_LO, INVERT_X3, BANK_HI, INVERT_X3); // compute X6 (stored in the lower bank) @@ -421,7 +420,7 @@ void fpga_modular_inv23_p384_microcode() // A3 ends up in the upper bank by itself uop_calc(MUL, BANK_HI, INVERT_A2, INVERT_A2, BANK_LO, INVERT_R1); - uop_calc(MUL, BANK_LO, INVERT_R1, CYCLE_RZ, BANK_HI, INVERT_A3); + uop_calc(MUL, BANK_LO, INVERT_R1, CYCLE_R0Z, BANK_HI, INVERT_A3); /* END_MICROCODE */ } diff --git a/ecdsa_fpga_microcode.h b/ecdsa_fpga_microcode.h index f551d96..32e061e 100644 --- a/ecdsa_fpga_microcode.h +++ b/ecdsa_fpga_microcode.h @@ -57,56 +57,50 @@ enum UOP_OPERAND CONST_ZERO, // 0 CONST_ONE, // 1 CONST_DELTA, // 2 - + CONST_GX, // 3 CONST_GY, // 4 - - CONST_HX, // 5 - CONST_HY, // 6 - - CYCLE_RX, // 7 - CYCLE_RY, // 8 - CYCLE_RZ, // 9 - - CYCLE_SX, // 10 - CYCLE_SY, // 11 - CYCLE_SZ, // 12 - - CYCLE_A, // 13 - CYCLE_A2, // 14 - CYCLE_B, // 15 - CYCLE_C, // 16 - CYCLE_C2, // 17 - CYCLE_C2_2, // 18 - CYCLE_D, // 19 - CYCLE_E, // 20 - CYCLE_F, // 21 - CYCLE_G, // 22 - CYCLE_H, // 23 - CYCLE_J, // 24 - - CYCLE_Z2, // 25 - - CYCLE_T1, // 26 - CYCLE_T2, // 27 - CYCLE_T3, // 28 - CYCLE_T4, // 29 - - INVERT_R1, // 30 - INVERT_R2, // 31 - - INVERT_X2, // 32 - INVERT_X3, // 
33 - INVERT_X6, // 34 - INVERT_X12, // 35 - INVERT_X15, // 36 - INVERT_X30, // 37 - INVERT_X32, // 38 - INVERT_X60, // 39 - INVERT_X120, // 40 - - INVERT_A2, // 41 - INVERT_A3, // 42 + + CYCLE_R0X, // 5 + CYCLE_R0Y, // 6 + CYCLE_R0Z, // 7 + + CYCLE_R1X, // 8 + CYCLE_R1Y, // 9 + CYCLE_R1Z, // 10 + + CYCLE_SX, // 11 + CYCLE_SY, // 12 + CYCLE_SZ, // 13 + + CYCLE_TX, // 14 + CYCLE_TY, // 15 + CYCLE_TZ, // 16 + + CYCLE_T1, // 17 + CYCLE_T2, // 18 + CYCLE_T3, // 19 + CYCLE_T4, // 20 + CYCLE_T5, // 21 + CYCLE_T6, // 22 + CYCLE_T7, // 23 + CYCLE_T8, // 24 + + INVERT_R1, // 25 + INVERT_R2, // 26 + + INVERT_X2, // 27 + INVERT_X3, // 28 + INVERT_X6, // 29 + INVERT_X12, // 30 + INVERT_X15, // 31 + INVERT_X30, // 32 + INVERT_X32, // 33 + INVERT_X60, // 34 + INVERT_X120, // 35 + + INVERT_A2, // 36 + INVERT_A3, // 37 ECDSA_UOP_OPERAND_COUNT }; @@ -129,10 +123,8 @@ extern FPGA_BUFFER BUF_HI[ECDSA_UOP_OPERAND_COUNT]; //------------------------------------------------------------------------------ // Global Flags //------------------------------------------------------------------------------ -extern bool uop_flagz_sz; -extern bool uop_flagz_rz; -extern bool uop_flagz_e; -extern bool uop_flagz_f; +extern bool uop_flagz_r0z; +extern bool uop_flagz_r1z; //------------------------------------------------------------------------------ -- cgit v1.2.3