From 1e16303d718986e0e991444a7cdcab3c5c89b1f4 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Sun, 11 Apr 2021 17:42:52 +0300 Subject: Updated curve math layer to do multiplication using the Montgomery ladder method. Also added optional debugging output to help debug microcoded versions of double and add routines. --- ecdsa_fpga_curve_microcode.cpp | 547 ++++++++++++++++++++--------------------- 1 file changed, 271 insertions(+), 276 deletions(-) (limited to 'ecdsa_fpga_curve_microcode.cpp') diff --git a/ecdsa_fpga_curve_microcode.cpp b/ecdsa_fpga_curve_microcode.cpp index 553498c..128e087 100644 --- a/ecdsa_fpga_curve_microcode.cpp +++ b/ecdsa_fpga_curve_microcode.cpp @@ -6,7 +6,7 @@ // // Authors: Pavel Shatov // -// Copyright (c) 2018 NORDUnet A/S +// Copyright (c) 2018, 2021 NORDUnet A/S // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: @@ -51,205 +51,186 @@ //------------------------------------------------------------------------------ // -// Doubles the point stored in CYCLE_R* and stores the result in CYCLE_S*. +// Doubles the point stored in CYCLE_R0* and stores the result in CYCLE_T*. // //------------------------------------------------------------------------------ -void fpga_curve_double_jacobian_microcode() +void fpga_curve_double_jacobian_microcode_r0() //------------------------------------------------------------------------------ { - // fpga_modular_mul(RZ, RZ, RZ2 ); // 2. RZ2 = RZ * RZ - // fpga_modular_sub(RX, RZ2, T1 ); // 3. T1 = RX - RZ2 - // fpga_modular_add(RX, RZ2, T2 ); // 4. T2 = RX + RZ2 - // fpga_modular_mul(T1, T2, T3 ); // 5. T3 = T1 * T2 - // fpga_modular_add(T3, T3, T4 ); // 6a. T4 = T3 + T3 - // fpga_modular_add(T3, T4, A ); // 6b. A = T3 + T4 - // fpga_modular_add(RY, RY, B ); // 7. B = RY + RY - // fpga_modular_mul(B, RZ, SZ ); // 8. SZ = B * RZ [output] - // fpga_modular_mul(B, B, C ); // 9. C = B * B - // fpga_modular_mul(C, RX, D ); // 10. D = C * RX - // fpga_modular_mul(C, C, C2 ); // 11. C2 = C * C - // fpga_modular_mul(C2, DELTA, C2_2); // 12. C2_2 = C / 2 - // fpga_modular_mul(A, A, A2 ); // 13. A2 = A * A - // fpga_modular_add(D, D, T1 ); // 14. T1 = D + D - // fpga_modular_sub(A2, T1, SX ); // 15. SX = A2 - T1 [output] - // fpga_modular_sub(D, SX, T1 ); // 16. T1 = D - SX - // fpga_modular_mul(A , T1, T2 ); // 17. T2 = A * T1 - // fpga_modular_sub(T2, C2_2, SY ); // 18. SY = T2 - C2_2 [output] + /* BEGIN_MICROCODE: CYCLE_DOUBLE_R0 */ + + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, CYCLE_T1); + uop_calc(SUB, BANK_HI, CYCLE_R0X, CYCLE_T1, BANK_LO, CYCLE_T2); + uop_calc(ADD, BANK_HI, CYCLE_R0X, CYCLE_T1, BANK_LO, CYCLE_T3); + uop_calc(MUL, BANK_LO, CYCLE_T3, CYCLE_T2, BANK_HI, CYCLE_T4); + uop_calc(ADD, BANK_HI, CYCLE_T4, CYCLE_T4, BANK_LO, CYCLE_T1); + + uop_move( BANK_HI, CYCLE_T4, BANK_LO, CYCLE_T4); + + uop_calc(ADD, BANK_LO, CYCLE_T1, CYCLE_T4, BANK_HI, CYCLE_T2); + uop_calc(ADD, BANK_HI, CYCLE_R0Y, CYCLE_R0Y, BANK_LO, CYCLE_TY); + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_TY, BANK_HI, CYCLE_TZ); + uop_calc(MUL, BANK_LO, CYCLE_TY, CYCLE_TY, BANK_HI, CYCLE_T1); + uop_calc(MUL, BANK_HI, CYCLE_R0X, CYCLE_T1, BANK_LO, CYCLE_T3); + uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T1, BANK_LO, CYCLE_T4); + uop_calc(MUL, BANK_LO, CYCLE_T4, CONST_DELTA, BANK_HI, CYCLE_T5); + uop_calc(MUL, BANK_HI, CYCLE_T2, CYCLE_T2, BANK_LO, CYCLE_T4); + uop_calc(ADD, BANK_LO, CYCLE_T3, CYCLE_T3, BANK_HI, CYCLE_T1); + + uop_move( BANK_LO, CYCLE_T4, BANK_HI, CYCLE_T4); + + uop_calc(SUB, BANK_HI, CYCLE_T4, CYCLE_T1, BANK_LO, CYCLE_TX); + uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_TX, BANK_HI, CYCLE_T1); + uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T2, BANK_LO, CYCLE_T3); + + uop_move( BANK_LO, CYCLE_T3, BANK_HI, CYCLE_T3); + + uop_calc(SUB, BANK_HI, CYCLE_T3, CYCLE_T5, BANK_LO, CYCLE_TY); - /* BEGIN_MICROCODE: CYCLE_DOUBLE */ - - FPGA_BUFFER TEMP; - - uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ, BANK_HI, CYCLE_Z2); - uop_stor(BANK_HI, CYCLE_Z2, &TEMP); print_fpga_buffer("CYCLE_Z2 = ", &TEMP); - - uop_calc(SUB, BANK_HI, CYCLE_RX, CYCLE_Z2, BANK_LO, CYCLE_T1); - uop_stor(BANK_LO, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP); - - uop_calc(ADD, BANK_HI, CYCLE_RX, CYCLE_Z2, BANK_LO, CYCLE_T2); - uop_stor(BANK_LO, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP); + /* END_MICROCODE */ +} - uop_calc(MUL, BANK_LO, CYCLE_T1, CYCLE_T2, BANK_HI, CYCLE_T3); - uop_stor(BANK_HI, CYCLE_T3, &TEMP); print_fpga_buffer("CYCLE_T3 = ", &TEMP); - uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T3, BANK_LO, CYCLE_T4); - uop_stor(BANK_LO, CYCLE_T4, &TEMP); print_fpga_buffer("CYCLE_T4 = ", &TEMP); +//------------------------------------------------------------------------------ +// +// Doubles the point stored in CYCLE_R1* and stores the result in CYCLE_T*. +// +//------------------------------------------------------------------------------ +void fpga_curve_double_jacobian_microcode_r1() +//------------------------------------------------------------------------------ +{ + /* BEGIN_MICROCODE: CYCLE_DOUBLE_R1 */ + + uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_R1Z, BANK_HI, CYCLE_T1); + uop_calc(SUB, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T2); + uop_calc(ADD, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T3); + uop_calc(MUL, BANK_LO, CYCLE_T3, CYCLE_T2, BANK_HI, CYCLE_T4); + uop_calc(ADD, BANK_HI, CYCLE_T4, CYCLE_T4, BANK_LO, CYCLE_T1); + + uop_move( BANK_HI, CYCLE_T4, BANK_LO, CYCLE_T4); + + uop_calc(ADD, BANK_LO, CYCLE_T1, CYCLE_T4, BANK_HI, CYCLE_T2); + uop_calc(ADD, BANK_HI, CYCLE_R1Y, CYCLE_R1Y, BANK_LO, CYCLE_TY); + uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_TY, BANK_HI, CYCLE_TZ); + uop_calc(MUL, BANK_LO, CYCLE_TY, CYCLE_TY, BANK_HI, CYCLE_T1); + uop_calc(MUL, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T3); + uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T1, BANK_LO, CYCLE_T4); + uop_calc(MUL, BANK_LO, CYCLE_T4, CONST_DELTA, BANK_HI, CYCLE_T5); + uop_calc(MUL, BANK_HI, CYCLE_T2, CYCLE_T2, BANK_LO, CYCLE_T4); + uop_calc(ADD, BANK_LO, CYCLE_T3, CYCLE_T3, BANK_HI, CYCLE_T1); + + uop_move( BANK_LO, CYCLE_T4, BANK_HI, CYCLE_T4); + + uop_calc(SUB, BANK_HI, CYCLE_T4, CYCLE_T1, BANK_LO, CYCLE_TX); + uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_TX, BANK_HI, CYCLE_T1); + uop_calc(MUL, BANK_HI, CYCLE_T1, CYCLE_T2, BANK_LO, CYCLE_T3); + + uop_move( BANK_LO, CYCLE_T3, BANK_HI, CYCLE_T3); + + uop_calc(SUB, BANK_HI, CYCLE_T3, CYCLE_T5, BANK_LO, CYCLE_TY); - uop_move( BANK_LO, CYCLE_T4, BANK_HI, CYCLE_T4); + /* END_MICROCODE */ +} - uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T4, BANK_LO, CYCLE_A); - uop_stor(BANK_LO, CYCLE_A, &TEMP); print_fpga_buffer("CYCLE_A = ", &TEMP); - uop_calc(ADD, BANK_HI, CYCLE_RY, CYCLE_RY, BANK_LO, CYCLE_B); - uop_stor(BANK_LO, CYCLE_B, &TEMP); print_fpga_buffer("CYCLE_B = ", &TEMP); +//------------------------------------------------------------------------------ +// +// Adds the points stored in CYCLE_R0|1 and stores the result in CYCLE_S. +// +//------------------------------------------------------------------------------ +void fpga_curve_add_jacobian_microcode_2() +{ - uop_calc(MUL, BANK_LO, CYCLE_B, CYCLE_RZ, BANK_HI, CYCLE_SZ); - uop_stor(BANK_HI, CYCLE_SZ, &TEMP); print_fpga_buffer("CYCLE_SZ = ", &TEMP); + /* BEGIN_MICROCODE: CYCLE_ADD */ - uop_calc(MUL, BANK_LO, CYCLE_B, CYCLE_B, BANK_HI, CYCLE_C); - uop_stor(BANK_HI, CYCLE_C, &TEMP); print_fpga_buffer("CYCLE_C = ", &TEMP); + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R0Z, BANK_HI, CYCLE_T1); + uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_R1Z, BANK_HI, CYCLE_T2); + + uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1); + uop_move( BANK_HI, CYCLE_T2, BANK_LO, CYCLE_T2); - uop_calc(MUL, BANK_HI, CYCLE_C, CYCLE_RX, BANK_LO, CYCLE_D); - uop_stor(BANK_LO, CYCLE_D, &TEMP); print_fpga_buffer("CYCLE_D = ", &TEMP); + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_T1, BANK_HI, CYCLE_T3); + uop_calc(MUL, BANK_LO, CYCLE_R1Z, CYCLE_T2, BANK_HI, CYCLE_T4); - uop_calc(MUL, BANK_HI, CYCLE_C, CYCLE_C, BANK_LO, CYCLE_C2); - uop_stor(BANK_LO, CYCLE_C2, &TEMP); print_fpga_buffer("CYCLE_C2 = ", &TEMP); + uop_calc(MUL, BANK_HI, CYCLE_R0X, CYCLE_T2, BANK_LO, CYCLE_T5); + uop_calc(MUL, BANK_HI, CYCLE_R1X, CYCLE_T1, BANK_LO, CYCLE_T2); - uop_calc(MUL, BANK_LO, CYCLE_C2, CONST_DELTA, BANK_HI, CYCLE_C2_2); - uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer("CYCLE_C2_2 = ", &TEMP); + uop_calc(MUL, BANK_HI, CYCLE_R0Y, CYCLE_T4, BANK_LO, CYCLE_T6); + uop_calc(MUL, BANK_HI, CYCLE_R1Y, CYCLE_T3, BANK_LO, CYCLE_T4); - uop_calc(MUL, BANK_LO, CYCLE_A, CYCLE_A, BANK_HI, CYCLE_A2); - uop_stor(BANK_HI, CYCLE_A2, &TEMP); print_fpga_buffer("CYCLE_A2 = ", &TEMP); + uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T5, BANK_HI, CYCLE_T7); + uop_calc(SUB, BANK_LO, CYCLE_T4, CYCLE_T6, BANK_HI, CYCLE_T8); - uop_calc(ADD, BANK_LO, CYCLE_D, CYCLE_D, BANK_HI, CYCLE_T1); - uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP); + uop_calc(MUL, BANK_LO, CYCLE_R0Z, CYCLE_R1Z, BANK_HI, CYCLE_T1); - uop_calc(SUB, BANK_HI, CYCLE_A2, CYCLE_T1, BANK_LO, CYCLE_SX); - uop_stor(BANK_LO, CYCLE_SX, &TEMP); print_fpga_buffer("CYCLE_SX = ", &TEMP); + uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1); + uop_move( BANK_HI, CYCLE_T7, BANK_LO, CYCLE_T7); - uop_calc(SUB, BANK_LO, CYCLE_D, CYCLE_SX, BANK_HI, CYCLE_T1); - uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP); + uop_calc(MUL, BANK_LO, CYCLE_T7, CYCLE_T1, BANK_HI, CYCLE_SZ); - uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1); + uop_calc(MUL, BANK_HI, CYCLE_T8, CYCLE_T8, BANK_LO, CYCLE_T2); + uop_calc(MUL, BANK_LO, CYCLE_T7, CYCLE_T7, BANK_HI, CYCLE_T3); + uop_calc(MUL, BANK_HI, CYCLE_T7, CYCLE_T3, BANK_LO, CYCLE_T4); - uop_calc(MUL, BANK_LO, CYCLE_A, CYCLE_T1, BANK_HI, CYCLE_T2); - uop_stor(BANK_HI, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP); + uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T4, BANK_HI, CYCLE_T1); - uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_C2_2, BANK_LO, CYCLE_SY); - uop_stor(BANK_LO, CYCLE_SY, &TEMP); print_fpga_buffer("CYCLE_SY = ", &TEMP); + uop_move( BANK_LO, CYCLE_T5, BANK_HI, CYCLE_T5); - /* END_MICROCODE */ -} + uop_calc(MUL, BANK_HI, CYCLE_T5, CYCLE_T3, BANK_LO, CYCLE_T2); + uop_calc(ADD, BANK_LO, CYCLE_T2, CYCLE_T2, BANK_HI, CYCLE_T3); + uop_calc(SUB, BANK_HI, CYCLE_T1, CYCLE_T3, BANK_LO, CYCLE_SX); + uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_SX, BANK_HI, CYCLE_T1); -//------------------------------------------------------------------------------ -// -// Adds the base point G to the point stored in CYCLE_S* and stores the result -// again in CYCLE_R*. -// -//------------------------------------------------------------------------------ -void fpga_curve_add_jacobian_microcode() -{ - //fpga_modular_mul(SZ, SZ, A) ; // 3. A = SZ * SZ - //fpga_modular_mul(A, SZ, B ); // 4. B = A * SZ - //fpga_modular_mul(A, &ECDSA_GX, C ); // 5. C = A * GX - //fpga_modular_mul(B, &ECDSA_GY, D ); // 6. D = B * GY - //fpga_modular_sub(C, SX, E ); // 7. E = C - SX - //fpga_modular_sub(D, SY, F ); // 8. F = D - SY - //fpga_modular_mul(E, SZ, RZ); // 10. RZ = E * SZ [output] - //fpga_modular_mul(E, E, G ); // 11. G = E * E - //fpga_modular_mul(E, G, H ); // 12. H = E * G - //fpga_modular_mul(G, SX, J ); // 13. J = G * SX - //fpga_modular_add(J, J, T1); // 14. T1 = J + J - //fpga_modular_mul(F, F, T2); // 15. T2 = F * F - //fpga_modular_sub(T2, T1, T3); // 16. T3 = T2 - T1 - //fpga_modular_sub(T3, H, RX); // 17. RX = T3 - H [output] - //fpga_modular_sub(J, RX, T1); // 18. T1 = J - RX - //fpga_modular_mul(F, T1, T2); // 19. T2 = F * T1 - //fpga_modular_mul(H, SY, T3); // 20. T3 = H * SY - //fpga_modular_sub(T2, T3, RY); // 21. RY = T2 - T3 [output] + uop_move( BANK_HI, CYCLE_T8, BANK_LO, CYCLE_T8); + uop_move( BANK_HI, CYCLE_T1, BANK_LO, CYCLE_T1); - /* BEGIN_MICROCODE: CYCLE_ADD */ + uop_calc(MUL, BANK_LO, CYCLE_T1, CYCLE_T8, BANK_HI, CYCLE_T2); + uop_calc(MUL, BANK_LO, CYCLE_T6, CYCLE_T4, BANK_HI, CYCLE_T3); + uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_T3, BANK_LO, CYCLE_SY); - uop_cmpz( BANK_HI, CYCLE_SZ); - uop_move( BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_SZ); - uop_calc(MUL, BANK_LO, CYCLE_SZ, CYCLE_SZ, BANK_HI, CYCLE_A); - uop_calc(MUL, BANK_HI, CYCLE_A, CYCLE_SZ, BANK_LO, CYCLE_B); - uop_move( BANK_LO, CYCLE_B, BANK_HI, CYCLE_B); - uop_calc(MUL, BANK_HI, CYCLE_A, CONST_GX, BANK_LO, CYCLE_C); - uop_calc(MUL, BANK_HI, CYCLE_B, CONST_GY, BANK_LO, CYCLE_D); - uop_calc(SUB, BANK_LO, CYCLE_C, CYCLE_SX, BANK_HI, CYCLE_E); - uop_calc(SUB, BANK_LO, CYCLE_D, CYCLE_SY, BANK_HI, CYCLE_F); - uop_cmpz( BANK_HI, CYCLE_E); - uop_cmpz( BANK_HI, CYCLE_F); - uop_calc(MUL, BANK_HI, CYCLE_E, CYCLE_SZ, BANK_LO, CYCLE_RZ); - uop_calc(MUL, BANK_HI, CYCLE_E, CYCLE_E, BANK_LO, CYCLE_G); - uop_move( BANK_LO, CYCLE_G, BANK_HI, CYCLE_G); - uop_calc(MUL, BANK_HI, CYCLE_E, CYCLE_G, BANK_LO, CYCLE_H); - uop_calc(MUL, BANK_LO, CYCLE_G, CYCLE_SX, BANK_HI, CYCLE_J); - uop_calc(ADD, BANK_HI, CYCLE_J, CYCLE_J, BANK_LO, CYCLE_T1); - uop_calc(MUL, BANK_HI, CYCLE_F, CYCLE_F, BANK_LO, CYCLE_T2); - uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T1, BANK_HI, CYCLE_T3); - uop_move( BANK_HI, CYCLE_T3, BANK_LO, CYCLE_T3); - uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_H, BANK_HI, CYCLE_RX); - uop_calc(SUB, BANK_HI, CYCLE_J, CYCLE_RX, BANK_LO, CYCLE_T1); - uop_move( BANK_HI, CYCLE_F, BANK_LO, CYCLE_F); - uop_calc(MUL, BANK_LO, CYCLE_F, CYCLE_T1, BANK_HI, CYCLE_T2); - uop_calc(MUL, BANK_LO, CYCLE_H, CYCLE_SY, BANK_HI, CYCLE_T3); - uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_T3, BANK_LO, CYCLE_RY); - uop_move( BANK_LO, CYCLE_RY, BANK_HI, CYCLE_RY); + uop_cmpz(BANK_LO, CYCLE_R0Z); + uop_cmpz(BANK_LO, CYCLE_R1Z); /* END_MICROCODE */ // // handle special corner cases // - if (uop_flagz_sz) + + if (uop_flagz_r0z && !uop_flagz_r1z) { - /* BEGIN_MICROCODE: CYCLE_ADD_AT_INFINITY */ + /* BEGIN_MICROCODE: CYCLE_ADD_R0_AT_INFINITY */ - uop_move(BANK_LO, CONST_GX, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CONST_GY, BANK_HI, CYCLE_RY); - uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ); + uop_move(BANK_HI, CYCLE_R1X, BANK_LO, CYCLE_SX); + uop_move(BANK_HI, CYCLE_R1Y, BANK_LO, CYCLE_SY); + uop_move(BANK_LO, CYCLE_R1Z, BANK_HI, CYCLE_SZ); /* END_MICROCODE */ + + return; } - else + + if (!uop_flagz_r0z && uop_flagz_r1z) { - if (uop_flagz_e) - { - if (uop_flagz_f) - { - /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X_SAME_Y */ + /* BEGIN_MICROCODE: CYCLE_ADD_R1_AT_INFINITY */ - uop_move(BANK_LO, CONST_HX, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CONST_HY, BANK_HI, CYCLE_RY); - uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ); + uop_move(BANK_HI, CYCLE_R0X, BANK_LO, CYCLE_SX); + uop_move(BANK_HI, CYCLE_R0Y, BANK_LO, CYCLE_SY); + uop_move(BANK_LO, CYCLE_R0Z, BANK_HI, CYCLE_SZ); - /* END_MICROCODE */ - } - else - { - /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X */ + /* END_MICROCODE */ - uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RY); - uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ); + return; + } - /* END_MICROCODE */ - } - } - else - { - /* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */ + /* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */ - uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_T1); - uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_T2); - uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_T3); + uop_move(BANK_LO, CONST_GX, BANK_HI, CYCLE_SX); + uop_move(BANK_LO, CONST_GY, BANK_HI, CYCLE_SY); + uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_SZ); - /* END_MICROCODE */ - } - } + /* END_MICROCODE */ } @@ -262,6 +243,13 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER FPGA_WORD k_word; bool k_bit; +#ifdef DUMP_CYCLE_STATES + FPGA_BUFFER r0x, r0y, r0z; + FPGA_BUFFER r1x, r1y, r1z; + FPGA_BUFFER sx, sy, sz; + FPGA_BUFFER tx, ty, tz; +#endif + // initialize internal banks fpga_multiword_copy(&ECDSA_ZERO, &BUF_LO[CONST_ZERO]); fpga_multiword_copy(&ECDSA_ZERO, &BUF_HI[CONST_ZERO]); @@ -278,61 +266,110 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER fpga_multiword_copy(&ECDSA_GY, &BUF_LO[CONST_GY]); fpga_multiword_copy(&ECDSA_GY, &BUF_HI[CONST_GY]); - fpga_multiword_copy(&ECDSA_HX, &BUF_LO[CONST_HX]); - fpga_multiword_copy(&ECDSA_HX, &BUF_HI[CONST_HX]); - - fpga_multiword_copy(&ECDSA_HY, &BUF_LO[CONST_HY]); - fpga_multiword_copy(&ECDSA_HY, &BUF_HI[CONST_HY]); - /* BEGIN_MICROCODE: PREPARE */ - // set initial value of R to point at infinity - uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_RY); - uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ); + // set initial value of R0 to point at infinity + // set initial value of R1 to the base point + + uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_R0X); + uop_move(BANK_LO, CONST_ONE, BANK_HI, CYCLE_R0Y); + uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_R0Z); + + uop_move(BANK_LO, CONST_GX, BANK_HI, CYCLE_R1X); + uop_move(BANK_LO, CONST_GY, BANK_HI, CYCLE_R1Y); + uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_R1Z); /* END_MICROCODE */ + /* process bits of k left-to-right */ for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--) for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--) { k_word = k->words[word_count-1]; - k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0; + k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0; - // Banks of working cycle operands - // ------------------------------- - // RX: HI - // RY: HI - // RZ: LO +#ifdef DUMP_CYCLE_STATES + dump_cycle_header(word_count, bit_count, k_bit); +#endif - // calculate S = 2 * R - fpga_curve_double_jacobian_microcode(); + // + // calculate S = R0 + R1 + // // Banks of working cycle operands // ------------------------------- + // R0|1X: HI + // R0|1Y: HI + // R0|1Z: LO + // SX: LO // SY: LO // SZ: HI - // always calculate R = S * G for constant-time operation - fpga_curve_add_jacobian_microcode(); + fpga_curve_add_jacobian_microcode_2(); + + // + // calculate T = 2 * R0 or T = 2 * R1 + // // Banks of working cycle operands // ------------------------------- - // RX: HI - // RY: HI - // RZ: LO + // R0|1X: HI + // R0|1Y: HI + // R0|1Z: LO + + // TX: LO + // TY: LO + // TZ: HI + if (!k_bit) + fpga_curve_double_jacobian_microcode_r0(); + else + fpga_curve_double_jacobian_microcode_r1(); + + // + // dump cycle state + // +#ifdef DUMP_CYCLE_STATES + uop_stor(BANK_HI, CYCLE_R0X, &r0x); + uop_stor(BANK_HI, CYCLE_R0Y, &r0y); + uop_stor(BANK_LO, CYCLE_R0Z, &r0z); + + uop_stor(BANK_HI, CYCLE_R1X, &r1x); + uop_stor(BANK_HI, CYCLE_R1Y, &r1y); + uop_stor(BANK_LO, CYCLE_R1Z, &r1z); + + uop_stor(BANK_LO, CYCLE_SX, &sx); + uop_stor(BANK_LO, CYCLE_SY, &sy); + uop_stor(BANK_HI, CYCLE_SZ, &sz); + + uop_stor(BANK_LO, CYCLE_TX, &tx); + uop_stor(BANK_LO, CYCLE_TY, &ty); + uop_stor(BANK_HI, CYCLE_TZ, &tz); + + dump_cycle_state(&r0x, &r0y, &r0z, &r1x, &r1y, &r1z, + &sx, &sy, &sz, &tx, &ty, &tz); +#endif + + // + // update working variables + // if (!k_bit) { /* BEGIN_MICROCODE: CYCLE_K0 */ - // revert to the value of S before addition if the current bit of k is not set - uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY); - uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_RZ); + // R0 = 2 * R0 (double) + // R1 = R0 + R1 (add) + + uop_move(BANK_LO, CYCLE_TX, BANK_HI, CYCLE_R0X); + uop_move(BANK_LO, CYCLE_TY, BANK_HI, CYCLE_R0Y); + uop_move(BANK_HI, CYCLE_TZ, BANK_LO, CYCLE_R0Z); + + uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R1X); + uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R1Y); + uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_R1Z); /* END_MICROCODE */ } @@ -340,74 +377,20 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER { /* BEGIN_MICROCODE: CYCLE_K1 */ - // do dummy overwrite for constant-time operation - uop_move(BANK_HI, CYCLE_RX, BANK_LO, CYCLE_SX); - uop_move(BANK_HI, CYCLE_RY, BANK_LO, CYCLE_SY); - uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_SZ); + // R0 = R0 + R1 (add) + // R1 = 2 * R1 (double) + + uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R0X); + uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R0Y); + uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_R0Z); + + uop_move(BANK_LO, CYCLE_TX, BANK_HI, CYCLE_R1X); + uop_move(BANK_LO, CYCLE_TY, BANK_HI, CYCLE_R1Y); + uop_move(BANK_HI, CYCLE_TZ, BANK_LO, CYCLE_R1Z); /* END_MICROCODE */ } - FPGA_BUFFER TEMP; - - //printf("wc = %d, bc = %d\n", word_count-1, bit_count-1); - - uop_stor(BANK_LO, CYCLE_RX, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RX = ", &TEMP); - uop_stor(BANK_LO, CYCLE_RY, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RY = ", &TEMP); - uop_stor(BANK_LO, CYCLE_RZ, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RZ = ", &TEMP); - - uop_stor(BANK_LO, CYCLE_SX, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SX = ", &TEMP); - uop_stor(BANK_LO, CYCLE_SY, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SY = ", &TEMP); - uop_stor(BANK_LO, CYCLE_SZ, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SZ = ", &TEMP); - - uop_stor(BANK_LO, CYCLE_A, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A = ", &TEMP); - uop_stor(BANK_LO, CYCLE_A2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A2 = ", &TEMP); - uop_stor(BANK_LO, CYCLE_B, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_B = ", &TEMP); - uop_stor(BANK_LO, CYCLE_C, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C = ", &TEMP); - uop_stor(BANK_LO, CYCLE_C2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2 = ", &TEMP); - uop_stor(BANK_LO, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2_2 = ", &TEMP); - uop_stor(BANK_LO, CYCLE_D, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_D = ", &TEMP); - uop_stor(BANK_LO, CYCLE_E, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_E = ", &TEMP); - uop_stor(BANK_LO, CYCLE_F, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_F = ", &TEMP); - uop_stor(BANK_LO, CYCLE_G, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_G = ", &TEMP); - uop_stor(BANK_LO, CYCLE_H, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_H = ", &TEMP); - uop_stor(BANK_LO, CYCLE_J, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_J = ", &TEMP); - - uop_stor(BANK_LO, CYCLE_Z2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_Z2 = ", &TEMP); - - uop_stor(BANK_LO, CYCLE_T1, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T1 = ", &TEMP); - uop_stor(BANK_LO, CYCLE_T2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T2 = ", &TEMP); - uop_stor(BANK_LO, CYCLE_T3, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T3 = ", &TEMP); - uop_stor(BANK_LO, CYCLE_T4, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T4 = ", &TEMP); - - uop_stor(BANK_HI, CYCLE_RX, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RX = ", &TEMP); - uop_stor(BANK_HI, CYCLE_RY, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RY = ", &TEMP); - uop_stor(BANK_HI, CYCLE_RZ, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RZ = ", &TEMP); - - uop_stor(BANK_HI, CYCLE_SX, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SX = ", &TEMP); - uop_stor(BANK_HI, CYCLE_SY, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SY = ", &TEMP); - uop_stor(BANK_HI, CYCLE_SZ, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SZ = ", &TEMP); - - uop_stor(BANK_HI, CYCLE_A, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A = ", &TEMP); - uop_stor(BANK_HI, CYCLE_A2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A2 = ", &TEMP); - uop_stor(BANK_HI, CYCLE_B, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_B = ", &TEMP); - uop_stor(BANK_HI, CYCLE_C, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C = ", &TEMP); - uop_stor(BANK_HI, CYCLE_C2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2 = ", &TEMP); - uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2_2 = ", &TEMP); - uop_stor(BANK_HI, CYCLE_D, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_D = ", &TEMP); - uop_stor(BANK_HI, CYCLE_E, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_E = ", &TEMP); - uop_stor(BANK_HI, CYCLE_F, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_F = ", &TEMP); - uop_stor(BANK_HI, CYCLE_G, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_G = ", &TEMP); - uop_stor(BANK_HI, CYCLE_H, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_H = ", &TEMP); - uop_stor(BANK_HI, CYCLE_J, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_J = ", &TEMP); - - uop_stor(BANK_HI, CYCLE_Z2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_Z2 = ", &TEMP); - - uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T1 = ", &TEMP); - uop_stor(BANK_HI, CYCLE_T2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T2 = ", &TEMP); - uop_stor(BANK_HI, CYCLE_T3, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T3 = ", &TEMP); - uop_stor(BANK_HI, CYCLE_T4, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T4 = ", &TEMP); - } // now convert to affine coordinates @@ -415,18 +398,18 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER /* BEGIN_MICROCODE: CONVERT */ - uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_RX, BANK_LO, CYCLE_SX); - uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_RY, BANK_LO, CYCLE_SY); - uop_cmpz(BANK_LO, CYCLE_RZ); + uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_R0X, BANK_LO, CYCLE_SX); + uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_R0Y, BANK_LO, CYCLE_SY); + uop_cmpz(BANK_LO, CYCLE_R0Z); /* END_MICROCODE */ - if (uop_flagz_rz) + if (uop_flagz_r0z) { /* BEGIN_MICROCODE: CONVERT_AT_INFINITY */ - uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RY); + uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_R0X); + uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_R0Y); /* END_MICROCODE */ } @@ -434,15 +417,15 @@ void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER { /* BEGIN_MICROCODE: CONVERT_REGULAR */ - uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX); - uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY); + uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_R0X); + uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_R0Y); /* END_MICROCODE */ } // return - uop_stor(BANK_HI, CYCLE_RX, qx); - uop_stor(BANK_HI, CYCLE_RY, qy); + uop_stor(BANK_HI, CYCLE_R0X, qx); + uop_stor(BANK_HI, CYCLE_R0Y, qy); } #endif USE_MICROCODE @@ -456,36 +439,48 @@ void fpga_curve_double_jacobian_microcode_wrapper(const FPGA_BUFFER *rx, FPGA_BUFFER *sz) //------------------------------------------------------------------------------ { - uop_load(rx, BANK_HI, CYCLE_RX); - uop_load(ry, BANK_HI, CYCLE_RY); - uop_load(rz, BANK_LO, CYCLE_RZ); + // + // we have two pieces of microcode to double either R0 or R1 (this + // depends on the current multiplier bit), here we can just always + // use the one meant for R0 - fpga_curve_double_jacobian_microcode(); + uop_load(rx, BANK_HI, CYCLE_R0X); + uop_load(ry, BANK_HI, CYCLE_R0Y); + uop_load(rz, BANK_LO, CYCLE_R0Z); - uop_stor(BANK_LO, CYCLE_SX, sx); - uop_stor(BANK_LO, CYCLE_SY, sy); - uop_stor(BANK_HI, CYCLE_SZ, sz); + fpga_curve_double_jacobian_microcode_r0(); + + uop_stor(BANK_LO, CYCLE_TX, sx); + uop_stor(BANK_LO, CYCLE_TY, sy); + uop_stor(BANK_HI, CYCLE_TZ, sz); } //------------------------------------------------------------------------------ -void fpga_curve_add_jacobian_microcode_wrapper(const FPGA_BUFFER *sx, - const FPGA_BUFFER *sy, - const FPGA_BUFFER *sz, - FPGA_BUFFER *rx, - FPGA_BUFFER *ry, - FPGA_BUFFER *rz) +void fpga_curve_add_jacobian_microcode_2_wrapper(const FPGA_BUFFER *px, + const FPGA_BUFFER *py, + const FPGA_BUFFER *pz, + const FPGA_BUFFER *qx, + const FPGA_BUFFER *qy, + const FPGA_BUFFER *qz, + FPGA_BUFFER *rx, + FPGA_BUFFER *ry, + FPGA_BUFFER *rz) //------------------------------------------------------------------------------ { - uop_load(sx, BANK_LO, CYCLE_SX); - uop_load(sy, BANK_LO, CYCLE_SY); - uop_load(sz, BANK_HI, CYCLE_SZ); + uop_load(px, BANK_HI, CYCLE_R0X); + uop_load(py, BANK_HI, CYCLE_R0Y); + uop_load(pz, BANK_LO, CYCLE_R0Z); + + uop_load(qx, BANK_HI, CYCLE_R1X); + uop_load(qy, BANK_HI, CYCLE_R1Y); + uop_load(qz, BANK_LO, CYCLE_R1Z); - fpga_curve_add_jacobian_microcode(); + fpga_curve_add_jacobian_microcode_2(); - uop_stor(BANK_HI, CYCLE_RX, rx); - uop_stor(BANK_HI, CYCLE_RY, ry); - uop_stor(BANK_LO, CYCLE_RZ, rz); + uop_stor(BANK_HI, CYCLE_SX, rx); + uop_stor(BANK_HI, CYCLE_SY, ry); + uop_stor(BANK_LO, CYCLE_SZ, rz); } -- cgit v1.2.3