* New hardware architecture

* Randomized test vector
author: Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> 2018-12-19 16:03:08 +0300
committer: Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru> 2018-12-19 16:03:08 +0300
commit: 1f8d13bf8d2e813f0c5da653c4abffb7a817db9a (patch)
tree: 7b6290a838f460a9d104f28a32de08be8bcf8605
parent: cae8718217846cfaefcbfecd55f9a117731a8d99 (diff)
27 files changed, 4808 insertions, 2297 deletions
diff --git a/ecdsa_fpga_curve.h b/ecdsa_fpga_curve.h
new file mode 100644
index 0000000..00448eb
--- /dev/null
+++ b/ecdsa_fpga_curve.h
@@ -0,0 +1,203 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_curve.h
+// ----------------------------------------------
+// Elliptic curve arithmetic procedures for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// ECDSA Parameters (P-256)
+//------------------------------------------------------------------------------
+
+/* Base Point */
+#define ECDSA_P256_GX_INIT \
+    {0x6b17d1f2, 0xe12c4247, 0xf8bce6e5, 0x63a440f2, \
+     0x77037d81, 0x2deb33a0, 0xf4a13945, 0xd898c296}
+
+#define ECDSA_P256_GY_INIT \
+    {0x4fe342e2, 0xfe1a7f9b, 0x8ee7eb4a, 0x7c0f9e16, \
+     0x2bce3357, 0x6b315ece, 0xcbb64068, 0x37bf51f5}
+
+/* Doubled Base Point */
+#define ECDSA_P256_HX_INIT \
+    {0x7cf27b18, 0x8d034f7e, 0x8a523803, 0x04b51ac3, \
+     0xc08969e2, 0x77f21b35, 0xa60b48fc, 0x47669978}
+
+#define ECDSA_P256_HY_INIT \
+    {0x07775510, 0xdb8ed040, 0x293d9ac6, 0x9f7430db, \
+     0xba7dade6, 0x3ce98229, 0x9e04b79d, 0x227873d1}
+
+/* Order of the Base Point */
+#define ECDSA_P256_N_INIT \
+    {0xffffffff, 0x00000000, 0xffffffff, 0xffffffff, \
+     0xbce6faad, 0xa7179e84, 0xf3b9cac2, 0xfc632551}
+
+
+//------------------------------------------------------------------------------
+// ECDSA Parameters (P-384)
+//------------------------------------------------------------------------------
+
+/* Base Point */
+#define ECDSA_P384_GX_INIT \
+    {0xaa87ca22, 0xbe8b0537, 0x8eb1c71e, 0xf320ad74, \
+     0x6e1d3b62, 0x8ba79b98, 0x59f741e0, 0x82542a38, \
+     0x5502f25d, 0xbf55296c, 0x3a545e38, 0x72760ab7}
+
+#define ECDSA_P384_GY_INIT \
+    {0x3617de4a, 0x96262c6f, 0x5d9e98bf, 0x9292dc29, \
+     0xf8f41dbd, 0x289a147c, 0xe9da3113, 0xb5f0b8c0, \
+     0x0a60b1ce, 0x1d7e819d, 0x7a431d7c, 0x90ea0e5f}
+
+/* Doubled Base Point */
+#define ECDSA_P384_HX_INIT \
+    {0x08d99905, 0x7ba3d2d9, 0x69260045, 0xc55b97f0, \
+     0x89025959, 0xa6f434d6, 0x51d207d1, 0x9fb96e9e, \
+     0x4fe0e86e, 0xbe0e64f8, 0x5b96a9c7, 0x5295df61}
+
+#define ECDSA_P384_HY_INIT \
+    {0x8e80f1fa, 0x5b1b3ced, 0xb7bfe8df, 0xfd6dba74, \
+     0xb275d875, 0xbc6cc43e, 0x904e505f, 0x256ab425, \
+     0x5ffd43e9, 0x4d39e22d, 0x61501e70, 0x0a940e80}
+
+/* Order of the Base Point */
+#define ECDSA_P384_N_INIT \
+    {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+     0xffffffff, 0xffffffff, 0xc7634d81, 0xf4372ddf, \
+     0x581a0db2, 0x48b0a77a, 0xecec196a, 0xccc52973}
+
+//------------------------------------------------------------------------------
+// ECDSA Parameters Switch
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define ECDSA_GX_INIT       ECDSA_P256_GX_INIT
+#define ECDSA_GY_INIT       ECDSA_P256_GY_INIT
+#define ECDSA_HX_INIT       ECDSA_P256_HX_INIT
+#define ECDSA_HY_INIT       ECDSA_P256_HY_INIT
+
+#define ECDSA_N_INIT        ECDSA_P256_N_INIT
+
+#elif USE_CURVE == 2
+
+#define ECDSA_GX_INIT       ECDSA_P384_GX_INIT
+#define ECDSA_GY_INIT       ECDSA_P384_GY_INIT
+#define ECDSA_HX_INIT       ECDSA_P384_HX_INIT
+#define ECDSA_HY_INIT       ECDSA_P384_HY_INIT
+
+#define ECDSA_N_INIT        ECDSA_P384_N_INIT
+
+#else
+
+BAD_CURVE
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ECDSA_GX, ECDSA_GY;
+extern FPGA_BUFFER ECDSA_HX, ECDSA_HY;
+extern FPGA_BUFFER ECDSA_N;
+
+
+//------------------------------------------------------------------------------
+// Switch
+//------------------------------------------------------------------------------
+#ifdef USE_MICROCODE
+
+#define fpga_curve_base_scalar_multiply fpga_curve_base_scalar_multiply_microcode
+#define fpga_curve_add_jacobian         fpga_curve_add_jacobian_microcode_wrapper
+#define fpga_curve_double_jacobian      fpga_curve_double_jacobian_microcode_wrapper
+
+#else
+
+#define fpga_curve_base_scalar_multiply fpga_curve_base_scalar_multiply_abstract
+#define fpga_curve_add_jacobian         fpga_curve_add_jacobian_abstract
+#define fpga_curve_double_jacobian      fpga_curve_double_jacobian_abstract
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_curve_init ();
+
+void fpga_curve_base_scalar_multiply_abstract  (const FPGA_BUFFER *k,
+                                                      FPGA_BUFFER *qx,
+                                                      FPGA_BUFFER *qy);
+
+void fpga_curve_base_scalar_multiply_microcode (const FPGA_BUFFER *k,
+                                                      FPGA_BUFFER *qx,
+                                                      FPGA_BUFFER *qy);
+
+void fpga_curve_add_jacobian_abstract    (const FPGA_BUFFER *px,
+                                          const FPGA_BUFFER *py,
+                                          const FPGA_BUFFER *pz,
+                                                FPGA_BUFFER *rx,
+                                                FPGA_BUFFER *ry,
+                                                FPGA_BUFFER *rz);
+
+void fpga_curve_double_jacobian_abstract (const FPGA_BUFFER *px,
+                                          const FPGA_BUFFER *py,
+                                          const FPGA_BUFFER *pz,
+                                                FPGA_BUFFER *rx,
+                                                FPGA_BUFFER *ry,
+                                                FPGA_BUFFER *rz);
+
+void fpga_curve_add_jacobian_microcode    ();
+
+void fpga_curve_double_jacobian_microcode ();
+
+void fpga_curve_add_jacobian_microcode_wrapper    (const FPGA_BUFFER *px,
+                                                   const FPGA_BUFFER *py,
+                                                   const FPGA_BUFFER *pz,
+                                                         FPGA_BUFFER *rx,
+                                                         FPGA_BUFFER *ry,
+                                                         FPGA_BUFFER *rz);
+
+
+void fpga_curve_double_jacobian_microcode_wrapper (const FPGA_BUFFER *px,
+                                                   const FPGA_BUFFER *py,
+                                                   const FPGA_BUFFER *pz,
+                                                         FPGA_BUFFER *rx,
+                                                         FPGA_BUFFER *ry,
+                                                         FPGA_BUFFER *rz);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_curve_abstract.cpp b/ecdsa_fpga_curve_abstract.cpp
new file mode 100644
index 0000000..5510ac1
--- /dev/null
+++ b/ecdsa_fpga_curve_abstract.cpp
@@ -0,0 +1,313 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_curve_abstract.cpp
+// ----------------------------------------------
+// Elliptic curve arithmetic procedures for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ECDSA_GX, ECDSA_GY;
+FPGA_BUFFER ECDSA_HX, ECDSA_HY;
+FPGA_BUFFER ECDSA_N;
+
+
+//------------------------------------------------------------------------------
+void fpga_curve_init()
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+    FPGA_BUFFER tmp_gx = ECDSA_GX_INIT, tmp_gy = ECDSA_GY_INIT;
+    FPGA_BUFFER tmp_hx = ECDSA_HX_INIT, tmp_hy = ECDSA_HY_INIT;
+    FPGA_BUFFER tmp_n = ECDSA_N_INIT;
+
+        /* fill buffers for large multi-word integers */
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {   ECDSA_GX.words[w] = tmp_gx.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ECDSA_GY.words[w] = tmp_gy.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ECDSA_HX.words[w] = tmp_hx.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ECDSA_HY.words[w] = tmp_hy.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ECDSA_N.words[w] = tmp_n.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve base point scalar multiplication routine.
+//
+// Q(qx,qy) = k * G(px,py)
+//
+// Note, that Q is supposed to be in affine coordinates. Multiplication is done
+// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// WARNING: Though this procedure always does the addition step, it only
+// updates the result when current bit of k is set. It does not take any
+// active measures to keep run-time constant. The main purpose of this model
+// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near
+// production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_base_scalar_multiply_abstract(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+{
+    int word_count, bit_count;  // counters
+
+    FPGA_BUFFER rx, ry, rz;     // intermediate result
+    FPGA_BUFFER tx, ty, tz;     // temporary variable
+
+        /* set initial value of R to point at infinity */
+    fpga_multiword_copy(&ECDSA_ONE,  &rx);
+    fpga_multiword_copy(&ECDSA_ONE,  &ry);
+    fpga_multiword_copy(&ECDSA_ZERO, &rz);
+
+        /* process bits of k left-to-right */
+    for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
+        for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
+        {
+                /* calculate T = 2 * R */
+            fpga_curve_double_jacobian_abstract(&rx, &ry, &rz, &tx, &ty, &tz);
+
+                /* always calculate R = T + P for constant-time */
+            fpga_curve_add_jacobian_abstract(&tx, &ty, &tz, &rx, &ry, &rz);
+
+                /* revert to the value of T before addition if the current bit of k is not set */
+            if (!((k->words[word_count-1] >> (bit_count-1)) & 1))
+            {   fpga_multiword_copy(&tx, &rx);
+                fpga_multiword_copy(&ty, &ry);
+                fpga_multiword_copy(&tz, &rz);
+            }
+
+        }
+
+    FPGA_BUFFER a2, a3; 
+
+	fpga_modular_inv23(&rz, &a2, &a3);
+
+    fpga_modular_mul(&rx, &a2, qx);      // qx = px * (pz^-1)^2 (mod q)
+    fpga_modular_mul(&ry, &a3, qy);      // qy = py * (pz^-1)^3 (mod q)
+
+        // check, that rz is non-zero (not point at infinity)
+    bool rz_is_zero = fpga_multiword_is_zero(&rz);
+
+        // handle special case (result is point at infinity)
+    if (rz_is_zero)
+    {   fpga_multiword_copy(&ECDSA_ZERO, qx);
+        fpga_multiword_copy(&ECDSA_ZERO, qy);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point doubling routine.
+//
+// R(rx,ry,rz) = 2 * P(px,py,pz)
+//
+// Note, that P(px,py,pz) is supposed to be in projective Jacobian coordinates,
+// R will be in projective Jacobian coordinates.
+//
+// This routine implements algorithm 3.21 from "Guide to Elliptic Curve
+// Cryptography", the only difference is that step 6. does T1 = T2 + T2 and
+// then T2 = T2 + T1 instead of T2 = 3 * T2, because our addition is much
+// faster, than multiplication.
+//
+// Note, that this routine also handles one special case, namely when P is at
+// infinity.
+//
+// Instead of actual modular division, multiplication by pre-computed constant
+// (2^-1 mod q) is done.
+//
+// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
+// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
+// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
+// then fpga_modular_mul(pz, &t1, &t1) instead.
+//
+// WARNING: Though this procedure always does doubling steps, it does not take
+// any active measures to keep run-time constant. The main purpose of this
+// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere
+// near production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian_abstract(const FPGA_BUFFER *px,
+                                         const FPGA_BUFFER *py,
+                                         const FPGA_BUFFER *pz,
+                                               FPGA_BUFFER *rx,
+                                               FPGA_BUFFER *ry,
+                                               FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+    FPGA_BUFFER t1, t2, t3; // temporary variables
+
+        // check, whether P is at infinity
+    bool pz_is_zero = fpga_multiword_is_zero(pz);
+
+    /*  2. */ fpga_multiword_copy(pz,  &t1);
+              fpga_modular_mul(pz,  &t1,          &t1);
+    /*  3. */ fpga_modular_sub(px,  &t1,          &t2);
+    /*  4. */ fpga_modular_add(px,  &t1,          &t1);
+    /*  5. */ fpga_modular_mul(&t1, &t2,          &t2);
+    /*  6. */ fpga_modular_add(&t2, &t2,          &t1);
+    /*     */ fpga_modular_add(&t1, &t2,          &t2);
+    /*  7. */ fpga_modular_add(py,  py,           ry);
+    /*  8. */ fpga_modular_mul(pz,  ry,           rz);
+    /*  9. */ fpga_multiword_copy(ry,  &t1);
+              fpga_multiword_copy(ry,  &t3);
+              fpga_modular_mul(&t1, &t3,          ry);
+    /* 10. */ fpga_modular_mul(px,  ry,           &t3);
+    /* 11. */ fpga_multiword_copy(ry,  &t1);
+              fpga_modular_mul(ry,  &t1,          &t1);
+    /* 12. */ fpga_modular_mul(&t1, &ECDSA_DELTA, ry);
+    /* 13. */ fpga_multiword_copy(&t2, &t1);
+              fpga_modular_mul(&t1, &t2,          rx);
+    /* 14. */ fpga_modular_add(&t3, &t3,          &t1);
+    /* 15. */ fpga_modular_sub(rx,  &t1,          rx);
+    /* 16. */ fpga_modular_sub(&t3, rx,           &t1); 
+    /* 17. */ fpga_modular_mul(&t1, &t2,          &t1);
+    /* 18. */ fpga_modular_sub(&t1, ry,           ry);  
+
+        // handle special case (input point is at infinity)
+    if (pz_is_zero)
+    {   fpga_multiword_copy(&ECDSA_ONE,  rx);
+        fpga_multiword_copy(&ECDSA_ONE,  ry);
+        fpga_multiword_copy(&ECDSA_ZERO, rz);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point addition routine.
+//
+// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy)
+//
+// Note, that P(px, py, pz) is supposed to be in projective Jacobian
+// coordinates, while Q(qx,qy) is supposed to be in affine coordinates,
+// R(rx, ry, rz) will be in projective Jacobian coordinates. Moreover, in this
+// particular implementation Q is always the base point G.
+//
+// This routine implements algorithm 3.22 from "Guide to Elliptic Curve
+// Cryptography". Differences from the original algorithm:
+// 
+// 1) Step 1. is omitted, because point Q is always the base point, which is
+//    not at infinity by definition.
+//
+// 2) Step 9.1 just returns the pre-computed double of the base point instead
+// of actually doubling it.
+//
+// Note, that this routine also handles three special cases:
+//
+// 1) P is at infinity
+// 2) P == Q
+// 3) P == -Q
+//
+// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
+// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
+// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
+// then fpga_modular_mul(pz, &t1, &t1) instead.
+//
+// WARNING: This procedure does not take any active measures to keep run-time
+// constant. The main purpose of this model is to help debug Verilog code for
+// FPGA, so *DO NOT* use is anywhere near production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian_abstract(const FPGA_BUFFER *px,
+                                      const FPGA_BUFFER *py,
+                                      const FPGA_BUFFER *pz,
+                                            FPGA_BUFFER *rx,
+                                            FPGA_BUFFER *ry,
+                                            FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+    FPGA_BUFFER t1, t2, t3, t4;     // temporary variables
+    
+    bool pz_is_zero = fpga_multiword_is_zero(pz);       // Step 2.
+
+    /*  3. */ fpga_multiword_copy(pz,  &t1);
+              fpga_modular_mul(pz,  &t1,        &t1);
+    /*  4. */ fpga_modular_mul(pz,  &t1,        &t2);
+    /*  5. */ fpga_modular_mul(&t1, &ECDSA_GX, &t1);
+    /*  6. */ fpga_modular_mul(&t2, &ECDSA_GY, &t2);
+    /*  7. */ fpga_modular_sub(&t1, px,         &t1);
+    /*  8. */ fpga_modular_sub(&t2, py,         &t2);
+
+    bool t1_is_zero = fpga_multiword_is_zero(&t1);      // | Step 9.
+    bool t2_is_zero = fpga_multiword_is_zero(&t2);      // |
+
+    /* 10. */ fpga_modular_mul(pz,  &t1,        rz);
+    /* 11. */ fpga_multiword_copy(&t1, &t3);
+              fpga_modular_mul(&t1, &t3,        &t3);
+    /* 12. */ fpga_modular_mul(&t1, &t3,        &t4);
+    /* 13. */ fpga_modular_mul(px,  &t3,        &t3);
+    /* 14. */ fpga_modular_add(&t3, &t3,        &t1);
+    /* 15. */ fpga_multiword_copy(&t2, rx);
+              fpga_modular_mul(rx,  &t2,        rx);
+    /* 16. */ fpga_modular_sub(rx,  &t1,        rx);
+    /* 17. */ fpga_modular_sub(rx,  &t4,        rx);
+    /* 18. */ fpga_modular_sub(&t3, rx,         &t3);
+    /* 19. */ fpga_modular_mul(&t2, &t3,        &t3);
+    /* 20. */ fpga_modular_mul(py,  &t4,        &t4);
+    /* 21. */ fpga_modular_sub(&t3, &t4,        ry);
+
+        //
+        // final selection
+        //
+    if (pz_is_zero) // P at infinity ?
+    {   fpga_multiword_copy(&ECDSA_GX, rx);
+        fpga_multiword_copy(&ECDSA_GY, ry);
+        fpga_multiword_copy(&ECDSA_ONE, rz);
+    }
+    else if (t1_is_zero) // same x for P and Q ?
+    {
+        fpga_multiword_copy(t2_is_zero ? &ECDSA_HX : &ECDSA_ONE,  rx);  // | same y ? (P==Q => R=2*G) : (P==-Q => R=O)
+        fpga_multiword_copy(t2_is_zero ? &ECDSA_HY : &ECDSA_ONE,  ry);  // |
+        fpga_multiword_copy(t2_is_zero ? &ECDSA_ONE : &ECDSA_ZERO, rz); // |
+    }
+}
+
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_curve_microcode.cpp b/ecdsa_fpga_curve_microcode.cpp
new file mode 100644
index 0000000..553498c
--- /dev/null
+++ b/ecdsa_fpga_curve_microcode.cpp
@@ -0,0 +1,494 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_curve_microcode.cpp
+// ----------------------------------------------
+// Elliptic curve arithmetic procedures for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Required for Microcode Routines
+//------------------------------------------------------------------------------
+#define USE_MICROCODE
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+//
+// Doubles the point stored in CYCLE_R* and stores the result in CYCLE_S*.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian_microcode()
+//------------------------------------------------------------------------------
+{
+    // fpga_modular_mul(RZ, RZ,    RZ2 ); //  2.  RZ2  = RZ * RZ
+    // fpga_modular_sub(RX, RZ2,   T1  ); //  3.  T1   = RX - RZ2
+    // fpga_modular_add(RX, RZ2,   T2  ); //  4.  T2   = RX + RZ2
+    // fpga_modular_mul(T1, T2,    T3  ); //  5.  T3   = T1 * T2
+    // fpga_modular_add(T3, T3,    T4  ); //  6a. T4   = T3 + T3
+    // fpga_modular_add(T3, T4,    A   ); //  6b. A    = T3 + T4
+    // fpga_modular_add(RY, RY,    B   ); //  7.  B    = RY + RY
+    // fpga_modular_mul(B,  RZ,    SZ  ); //  8.  SZ   = B  * RZ [output]
+    // fpga_modular_mul(B,  B,     C   ); //  9.  C    = B  * B
+    // fpga_modular_mul(C,  RX,    D   ); // 10.  D    = C  * RX
+    // fpga_modular_mul(C,  C,     C2  ); // 11.  C2   = C  * C
+    // fpga_modular_mul(C2, DELTA, C2_2); // 12.  C2_2 = C  / 2
+    // fpga_modular_mul(A,  A,     A2  ); // 13.  A2   = A  * A
+    // fpga_modular_add(D,  D,     T1  ); // 14.  T1   = D  + D
+    // fpga_modular_sub(A2, T1,    SX  ); // 15.  SX   = A2 - T1 [output]
+    // fpga_modular_sub(D,  SX,    T1  ); // 16.  T1   = D  - SX
+    // fpga_modular_mul(A , T1,    T2  ); // 17.  T2   = A  * T1
+    // fpga_modular_sub(T2, C2_2,  SY  ); // 18.  SY   = T2 - C2_2 [output]
+
+    /* BEGIN_MICROCODE: CYCLE_DOUBLE */
+
+    FPGA_BUFFER TEMP;
+
+    uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ,    BANK_HI,  CYCLE_Z2);
+    uop_stor(BANK_HI, CYCLE_Z2, &TEMP); print_fpga_buffer("CYCLE_Z2 = ", &TEMP);
+
+    uop_calc(SUB, BANK_HI, CYCLE_RX, CYCLE_Z2,    BANK_LO,  CYCLE_T1);
+    uop_stor(BANK_LO, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+
+    uop_calc(ADD, BANK_HI, CYCLE_RX, CYCLE_Z2,    BANK_LO,  CYCLE_T2);
+    uop_stor(BANK_LO, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP);
+
+    uop_calc(MUL, BANK_LO, CYCLE_T1, CYCLE_T2,    BANK_HI,  CYCLE_T3);
+    uop_stor(BANK_HI, CYCLE_T3, &TEMP); print_fpga_buffer("CYCLE_T3 = ", &TEMP);
+
+    uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T3,    BANK_LO,  CYCLE_T4);
+    uop_stor(BANK_LO, CYCLE_T4, &TEMP); print_fpga_buffer("CYCLE_T4 = ", &TEMP);
+
+    uop_move(     BANK_LO, CYCLE_T4, BANK_HI,     CYCLE_T4);
+
+    uop_calc(ADD, BANK_HI, CYCLE_T3, CYCLE_T4,    BANK_LO,  CYCLE_A);
+    uop_stor(BANK_LO, CYCLE_A, &TEMP); print_fpga_buffer("CYCLE_A = ", &TEMP);
+
+    uop_calc(ADD, BANK_HI, CYCLE_RY, CYCLE_RY,    BANK_LO,  CYCLE_B);
+    uop_stor(BANK_LO, CYCLE_B, &TEMP); print_fpga_buffer("CYCLE_B = ", &TEMP);
+
+    uop_calc(MUL, BANK_LO, CYCLE_B,  CYCLE_RZ,    BANK_HI,  CYCLE_SZ);
+    uop_stor(BANK_HI, CYCLE_SZ, &TEMP); print_fpga_buffer("CYCLE_SZ = ", &TEMP);
+
+    uop_calc(MUL, BANK_LO, CYCLE_B,  CYCLE_B,     BANK_HI,  CYCLE_C);
+    uop_stor(BANK_HI, CYCLE_C, &TEMP); print_fpga_buffer("CYCLE_C = ", &TEMP);
+
+    uop_calc(MUL, BANK_HI, CYCLE_C,  CYCLE_RX,    BANK_LO,  CYCLE_D);
+    uop_stor(BANK_LO, CYCLE_D, &TEMP); print_fpga_buffer("CYCLE_D = ", &TEMP);
+
+    uop_calc(MUL, BANK_HI, CYCLE_C,  CYCLE_C,     BANK_LO,  CYCLE_C2);
+    uop_stor(BANK_LO, CYCLE_C2, &TEMP); print_fpga_buffer("CYCLE_C2 = ", &TEMP);
+
+    uop_calc(MUL, BANK_LO, CYCLE_C2, CONST_DELTA, BANK_HI,  CYCLE_C2_2);
+    uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer("CYCLE_C2_2 = ", &TEMP);
+
+    uop_calc(MUL, BANK_LO, CYCLE_A,  CYCLE_A,     BANK_HI,  CYCLE_A2);
+    uop_stor(BANK_HI, CYCLE_A2, &TEMP); print_fpga_buffer("CYCLE_A2 = ", &TEMP);
+
+    uop_calc(ADD, BANK_LO, CYCLE_D,  CYCLE_D,     BANK_HI,  CYCLE_T1);
+    uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+
+    uop_calc(SUB, BANK_HI, CYCLE_A2, CYCLE_T1,    BANK_LO,  CYCLE_SX);
+    uop_stor(BANK_LO, CYCLE_SX, &TEMP); print_fpga_buffer("CYCLE_SX = ", &TEMP);
+
+    uop_calc(SUB, BANK_LO, CYCLE_D,  CYCLE_SX,    BANK_HI,  CYCLE_T1);
+    uop_stor(BANK_HI, CYCLE_T1, &TEMP); print_fpga_buffer("CYCLE_T1 = ", &TEMP);
+
+    uop_move(     BANK_HI, CYCLE_T1, BANK_LO,     CYCLE_T1);
+
+    uop_calc(MUL, BANK_LO, CYCLE_A,  CYCLE_T1,    BANK_HI,  CYCLE_T2);
+    uop_stor(BANK_HI, CYCLE_T2, &TEMP); print_fpga_buffer("CYCLE_T2 = ", &TEMP);
+
+    uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_C2_2,  BANK_LO,  CYCLE_SY);
+    uop_stor(BANK_LO, CYCLE_SY, &TEMP); print_fpga_buffer("CYCLE_SY = ", &TEMP);
+
+    /* END_MICROCODE */
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Adds the base point G to the point stored in CYCLE_S* and stores the result
+// again in CYCLE_R*.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian_microcode()
+{
+    //fpga_modular_mul(SZ, SZ,        A) ; //  3. A  = SZ * SZ
+    //fpga_modular_mul(A,  SZ,        B ); //  4. B  = A  * SZ
+    //fpga_modular_mul(A,  &ECDSA_GX, C ); //  5. C  = A  * GX
+    //fpga_modular_mul(B,  &ECDSA_GY, D ); //  6. D  = B  * GY
+    //fpga_modular_sub(C,  SX,        E ); //  7. E  = C  - SX
+    //fpga_modular_sub(D,  SY,        F ); //  8. F  = D  - SY
+    //fpga_modular_mul(E,  SZ,        RZ); // 10. RZ = E  * SZ [output]
+    //fpga_modular_mul(E,  E,         G ); // 11. G  = E  * E
+    //fpga_modular_mul(E,  G,         H ); // 12. H  = E  * G
+    //fpga_modular_mul(G,  SX,        J ); // 13. J  = G  * SX
+    //fpga_modular_add(J,  J,         T1); // 14. T1 = J  + J
+    //fpga_modular_mul(F,  F,         T2); // 15. T2 = F  * F
+    //fpga_modular_sub(T2, T1,        T3); // 16. T3 = T2 - T1
+    //fpga_modular_sub(T3, H,         RX); // 17. RX = T3 - H [output]
+    //fpga_modular_sub(J,  RX,        T1); // 18. T1 = J  - RX
+    //fpga_modular_mul(F,  T1,        T2); // 19. T2 = F  * T1
+    //fpga_modular_mul(H,  SY,        T3); // 20. T3 = H  * SY
+    //fpga_modular_sub(T2, T3,        RY); // 21. RY = T2 - T3 [output]
+
+    /* BEGIN_MICROCODE: CYCLE_ADD */
+
+    uop_cmpz(     BANK_HI, CYCLE_SZ);
+    uop_move(     BANK_HI, CYCLE_SZ, BANK_LO,  CYCLE_SZ);
+    uop_calc(MUL, BANK_LO, CYCLE_SZ, CYCLE_SZ, BANK_HI,  CYCLE_A);
+    uop_calc(MUL, BANK_HI, CYCLE_A,  CYCLE_SZ, BANK_LO,  CYCLE_B);
+    uop_move(     BANK_LO, CYCLE_B,  BANK_HI,  CYCLE_B);
+    uop_calc(MUL, BANK_HI, CYCLE_A,  CONST_GX, BANK_LO,  CYCLE_C);
+    uop_calc(MUL, BANK_HI, CYCLE_B,  CONST_GY, BANK_LO,  CYCLE_D);
+    uop_calc(SUB, BANK_LO, CYCLE_C,  CYCLE_SX, BANK_HI,  CYCLE_E);
+    uop_calc(SUB, BANK_LO, CYCLE_D,  CYCLE_SY, BANK_HI,  CYCLE_F);
+    uop_cmpz(     BANK_HI, CYCLE_E);
+    uop_cmpz(     BANK_HI, CYCLE_F);
+    uop_calc(MUL, BANK_HI, CYCLE_E,  CYCLE_SZ, BANK_LO,  CYCLE_RZ);
+    uop_calc(MUL, BANK_HI, CYCLE_E,  CYCLE_E,  BANK_LO,  CYCLE_G);
+    uop_move(     BANK_LO, CYCLE_G,  BANK_HI,  CYCLE_G);
+    uop_calc(MUL, BANK_HI, CYCLE_E,  CYCLE_G,  BANK_LO,  CYCLE_H);
+    uop_calc(MUL, BANK_LO, CYCLE_G,  CYCLE_SX, BANK_HI,  CYCLE_J);
+    uop_calc(ADD, BANK_HI, CYCLE_J,  CYCLE_J,  BANK_LO,  CYCLE_T1);
+    uop_calc(MUL, BANK_HI, CYCLE_F,  CYCLE_F,  BANK_LO,  CYCLE_T2);
+    uop_calc(SUB, BANK_LO, CYCLE_T2, CYCLE_T1, BANK_HI,  CYCLE_T3);
+    uop_move(     BANK_HI, CYCLE_T3, BANK_LO,  CYCLE_T3);
+    uop_calc(SUB, BANK_LO, CYCLE_T3, CYCLE_H,  BANK_HI,  CYCLE_RX);
+    uop_calc(SUB, BANK_HI, CYCLE_J,  CYCLE_RX, BANK_LO,  CYCLE_T1);
+    uop_move(     BANK_HI, CYCLE_F,  BANK_LO,  CYCLE_F);
+    uop_calc(MUL, BANK_LO, CYCLE_F,  CYCLE_T1, BANK_HI,  CYCLE_T2);
+    uop_calc(MUL, BANK_LO, CYCLE_H,  CYCLE_SY, BANK_HI,  CYCLE_T3);
+    uop_calc(SUB, BANK_HI, CYCLE_T2, CYCLE_T3, BANK_LO,  CYCLE_RY);
+    uop_move(     BANK_LO, CYCLE_RY, BANK_HI,  CYCLE_RY);
+
+    /* END_MICROCODE */
+
+    //
+    // handle special corner cases
+    //
+    if (uop_flagz_sz)
+    {
+        /* BEGIN_MICROCODE: CYCLE_ADD_AT_INFINITY */
+
+        uop_move(BANK_LO, CONST_GX,  BANK_HI, CYCLE_RX);
+        uop_move(BANK_LO, CONST_GY,  BANK_HI, CYCLE_RY);
+        uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ);
+
+        /* END_MICROCODE */
+    }
+    else
+    {
+        if (uop_flagz_e)
+        {
+            if (uop_flagz_f)
+            {
+                /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X_SAME_Y */
+
+                uop_move(BANK_LO, CONST_HX,  BANK_HI, CYCLE_RX);
+                uop_move(BANK_LO, CONST_HY,  BANK_HI, CYCLE_RY);
+                uop_move(BANK_HI, CONST_ONE, BANK_LO, CYCLE_RZ);
+
+                /* END_MICROCODE */
+            }
+            else
+            {
+                /* BEGIN_MICROCODE: CYCLE_ADD_SAME_X */
+
+                uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RX);
+                uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RY);
+                uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ);
+
+                /* END_MICROCODE */
+            }
+        }
+        else
+        {
+            /* BEGIN_MICROCODE: CYCLE_ADD_REGULAR */
+
+            uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_T1);
+            uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_T2);
+            uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_T3);
+
+            /* END_MICROCODE */
+        }
+    }
+}
+
+
+#ifdef USE_MICROCODE
+//------------------------------------------------------------------------------
+void fpga_curve_base_scalar_multiply_microcode(const FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+{
+	int word_count, bit_count;  // counters
+	FPGA_WORD k_word;
+	bool k_bit;
+
+    // initialize internal banks
+    fpga_multiword_copy(&ECDSA_ZERO,  &BUF_LO[CONST_ZERO]);
+    fpga_multiword_copy(&ECDSA_ZERO,  &BUF_HI[CONST_ZERO]);
+
+    fpga_multiword_copy(&ECDSA_ONE,   &BUF_LO[CONST_ONE]);
+    fpga_multiword_copy(&ECDSA_ONE,   &BUF_HI[CONST_ONE]);
+
+    fpga_multiword_copy(&ECDSA_DELTA, &BUF_LO[CONST_DELTA]);
+    fpga_multiword_copy(&ECDSA_DELTA, &BUF_HI[CONST_DELTA]);
+
+    fpga_multiword_copy(&ECDSA_GX, &BUF_LO[CONST_GX]);
+    fpga_multiword_copy(&ECDSA_GX, &BUF_HI[CONST_GX]);
+
+    fpga_multiword_copy(&ECDSA_GY, &BUF_LO[CONST_GY]);
+    fpga_multiword_copy(&ECDSA_GY, &BUF_HI[CONST_GY]);
+
+    fpga_multiword_copy(&ECDSA_HX, &BUF_LO[CONST_HX]);
+    fpga_multiword_copy(&ECDSA_HX, &BUF_HI[CONST_HX]);
+
+    fpga_multiword_copy(&ECDSA_HY, &BUF_LO[CONST_HY]);
+    fpga_multiword_copy(&ECDSA_HY, &BUF_HI[CONST_HY]);
+
+    /* BEGIN_MICROCODE: PREPARE */
+    
+    // set initial value of R to point at infinity
+	uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RX);
+	uop_move(BANK_LO, CONST_ONE,  BANK_HI, CYCLE_RY);
+	uop_move(BANK_HI, CONST_ZERO, BANK_LO, CYCLE_RZ);
+
+    /* END_MICROCODE */
+
+        /* process bits of k left-to-right */
+    for (word_count=FPGA_OPERAND_NUM_WORDS; word_count>0; word_count--)
+        for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
+        {
+			k_word = k->words[word_count-1];
+			k_bit = (k_word & (FPGA_WORD)(1 << (bit_count-1))) > 0;            
+
+            // Banks of working cycle operands
+            // -------------------------------
+            // RX: HI
+            // RY: HI
+            // RZ: LO
+
+            // calculate S = 2 * R
+            fpga_curve_double_jacobian_microcode();
+
+            // Banks of working cycle operands
+            // -------------------------------
+            // SX: LO
+            // SY: LO
+            // SZ: HI
+
+            // always calculate R = S * G for constant-time operation
+            fpga_curve_add_jacobian_microcode();
+
+            // Banks of working cycle operands
+            // -------------------------------
+            // RX: HI
+            // RY: HI
+            // RZ: LO
+
+            
+			if (!k_bit)
+            {
+                /* BEGIN_MICROCODE: CYCLE_K0 */
+
+                // revert to the value of S before addition if the current bit of k is not set
+                uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX);
+                uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY);
+                uop_move(BANK_HI, CYCLE_SZ, BANK_LO, CYCLE_RZ);
+
+                /* END_MICROCODE */
+            }
+            else
+            {
+                /* BEGIN_MICROCODE: CYCLE_K1 */
+
+                // do dummy overwrite for constant-time operation
+                uop_move(BANK_HI, CYCLE_RX, BANK_LO, CYCLE_SX);
+                uop_move(BANK_HI, CYCLE_RY, BANK_LO, CYCLE_SY);
+                uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_SZ);
+
+                /* END_MICROCODE */
+            }
+
+            FPGA_BUFFER TEMP;
+
+            //printf("wc = %d, bc = %d\n", word_count-1, bit_count-1);
+
+            uop_stor(BANK_LO, CYCLE_RX,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RX   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_RY,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RY   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_RZ,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_RZ   = ", &TEMP);
+
+            uop_stor(BANK_LO, CYCLE_SX,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SX   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_SY,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SY   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_SZ,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_SZ   = ", &TEMP);
+
+            uop_stor(BANK_LO, CYCLE_A,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_A2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_A2   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_B,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_B    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_C,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_C2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_C2_2 = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_D,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_D    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_E,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_E    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_F,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_F    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_G,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_G    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_H,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_H    = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_J,    &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_J    = ", &TEMP);
+
+            uop_stor(BANK_LO, CYCLE_Z2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_Z2   = ", &TEMP);
+
+            uop_stor(BANK_LO, CYCLE_T1,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T1   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_T2,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T2   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_T3,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T3   = ", &TEMP);
+            uop_stor(BANK_LO, CYCLE_T4,   &TEMP); print_fpga_buffer_nodelim("LO:CYCLE_T4   = ", &TEMP);
+
+            uop_stor(BANK_HI, CYCLE_RX,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RX   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_RY,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RY   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_RZ,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_RZ   = ", &TEMP);
+
+            uop_stor(BANK_HI, CYCLE_SX,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SX   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_SY,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SY   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_SZ,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_SZ   = ", &TEMP);
+
+            uop_stor(BANK_HI, CYCLE_A,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_A2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_A2   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_B,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_B    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_C,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_C2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_C2_2, &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_C2_2 = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_D,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_D    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_E,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_E    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_F,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_F    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_G,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_G    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_H,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_H    = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_J,    &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_J    = ", &TEMP);
+
+            uop_stor(BANK_HI, CYCLE_Z2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_Z2   = ", &TEMP);
+
+            uop_stor(BANK_HI, CYCLE_T1,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T1   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_T2,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T2   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_T3,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T3   = ", &TEMP);
+            uop_stor(BANK_HI, CYCLE_T4,   &TEMP); print_fpga_buffer_nodelim("HI:CYCLE_T4   = ", &TEMP);
+
+        }
+
+    // now convert to affine coordinates
+	fpga_modular_inv23_microcode();
+
+    /* BEGIN_MICROCODE: CONVERT */
+
+    uop_calc(MUL, BANK_HI, INVERT_A2, CYCLE_RX, BANK_LO, CYCLE_SX);
+    uop_calc(MUL, BANK_HI, INVERT_A3, CYCLE_RY, BANK_LO, CYCLE_SY);
+    uop_cmpz(BANK_LO, CYCLE_RZ);
+
+    /* END_MICROCODE */
+
+    if (uop_flagz_rz)
+    {   
+        /* BEGIN_MICROCODE: CONVERT_AT_INFINITY */
+
+        uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RX);
+        uop_move(BANK_LO, CONST_ZERO, BANK_HI, CYCLE_RY);
+
+        /* END_MICROCODE */
+    }
+	else
+	{
+        /* BEGIN_MICROCODE: CONVERT_REGULAR */
+
+        uop_move(BANK_LO, CYCLE_SX, BANK_HI, CYCLE_RX);
+        uop_move(BANK_LO, CYCLE_SY, BANK_HI, CYCLE_RY);
+
+        /* END_MICROCODE */
+	}
+
+    // return
+    uop_stor(BANK_HI, CYCLE_RX, qx);
+    uop_stor(BANK_HI, CYCLE_RY, qy);
+}
+#endif USE_MICROCODE
+
+
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian_microcode_wrapper(const FPGA_BUFFER *rx,
+                                                  const FPGA_BUFFER *ry,
+                                                  const FPGA_BUFFER *rz,
+                                                        FPGA_BUFFER *sx,
+                                                        FPGA_BUFFER *sy,
+                                                        FPGA_BUFFER *sz)
+//------------------------------------------------------------------------------
+{
+    uop_load(rx, BANK_HI, CYCLE_RX);
+    uop_load(ry, BANK_HI, CYCLE_RY);
+    uop_load(rz, BANK_LO, CYCLE_RZ);
+
+    fpga_curve_double_jacobian_microcode();
+
+    uop_stor(BANK_LO, CYCLE_SX, sx);
+    uop_stor(BANK_LO, CYCLE_SY, sy);
+    uop_stor(BANK_HI, CYCLE_SZ, sz);
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian_microcode_wrapper(const FPGA_BUFFER *sx,
+                                               const FPGA_BUFFER *sy,
+                                               const FPGA_BUFFER *sz,
+                                                     FPGA_BUFFER *rx,
+                                                     FPGA_BUFFER *ry,
+                                                     FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+    uop_load(sx, BANK_LO, CYCLE_SX);
+    uop_load(sy, BANK_LO, CYCLE_SY);
+    uop_load(sz, BANK_HI, CYCLE_SZ);
+
+    fpga_curve_add_jacobian_microcode();
+
+    uop_stor(BANK_HI, CYCLE_RX, rx);
+    uop_stor(BANK_HI, CYCLE_RY, ry);
+    uop_stor(BANK_LO, CYCLE_RZ, rz);
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_lowlevel.cpp b/ecdsa_fpga_lowlevel.cpp
index 511856a..c7ab322 100644
--- a/fpga_lowlevel.cpp
+++ b/ecdsa_fpga_lowlevel.cpp
@@ -1,137 +1,135 @@
-//------------------------------------------------------------------------------
-//
-// fpga_lowlevel.cpp
-// -----------------------------------
-// Models of Low-level FPGA primitives
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Headers
-//------------------------------------------------------------------------------
-#include <stdint.h>
-#include "ecdsa_model.h"
-#include "fpga_lowlevel.h"
-
-
-//------------------------------------------------------------------------------
-//
-// Low-level 32-bit adder with carry input and carry output.
-//
-// Carries are 1 bit wide.
-//
-// {c_out, s} = x + y + c_in
-//
-//------------------------------------------------------------------------------
-void fpga_lowlevel_add32(FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out)
-//------------------------------------------------------------------------------
-{
-		// calculate sum
-	FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x + (FPGA_WORD_EXTENDED)y;
-
-		// process carry input
-	if (c_in) r++;
-
-		// store sum
-	*s = (FPGA_WORD)r;
-
-		// shift sum to obtain carry flag
-	r >>= FPGA_WORD_WIDTH;
-
-		// store carry
-	*c_out = (r != 0);
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Low-level 32-bit subtractor with borrow input and borrow output.
-//
-// Borrows are 1 bit wide.
-//
-// {b_out, d} = x - y - b_in
-//
-void fpga_lowlevel_sub32(FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out)
-//------------------------------------------------------------------------------
-{
-		// calculate difference
-	FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x - (FPGA_WORD_EXTENDED)y;
-
-		// process borrow input
-	if (b_in) r--;
-
-		// store difference
-	*d = (FPGA_WORD)r;
-
-		// shift difference to obtain borrow flag
-	r >>= FPGA_WORD_WIDTH;
-
-		// store borrow
-	*b_out = (r != 0);
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Low-level 16x16-bit multiplier.
-//
-// Inputs are 16-bit wide, outputs are 32-bit wide.
-//
-// p = x * y
-//
-void fpga_lowlevel_mul16(FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p)
-//------------------------------------------------------------------------------
-{
-		// multiply and store result
-	*p = (FPGA_WORD)x * (FPGA_WORD)y;
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Low-level 48-bit adder without carries.
-//
-// s = (x + y)[47:0]
-//
-void fpga_lowlevel_add48(FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s)
-//------------------------------------------------------------------------------
-{
-		// add and store result truncated to 48 bits
-	*s = (x + y) & FPGA_MASK_ADDER48;
-}
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+//
+// fpga_ecdsa_lowlevel.cpp
+// -----------------------------------
+// Models of low-level FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 32-bit adder with carry input and carry output.
+//
+// Carries are 1 bit wide.
+//
+// {c_out, s} = x + y + c_in
+//
+//------------------------------------------------------------------------------
+void fpga_lowlevel_add32(FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out)
+//------------------------------------------------------------------------------
+{
+        // calculate sum
+    FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x + (FPGA_WORD_EXTENDED)y;
+
+        // process carry input
+    if (c_in) r++;
+
+        // store sum
+    *s = (FPGA_WORD)r;
+
+        // shift sum to obtain carry flag
+    r >>= FPGA_WORD_WIDTH;
+
+        // store carry
+    *c_out = (r != 0);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 32-bit subtractor with borrow input and borrow output.
+//
+// Borrows are 1 bit wide.
+//
+// {b_out, d} = x - y - b_in
+//
+void fpga_lowlevel_sub32(FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out)
+//------------------------------------------------------------------------------
+{
+        // calculate difference
+    FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x - (FPGA_WORD_EXTENDED)y;
+
+        // process borrow input
+    if (b_in) r--;
+
+        // store difference
+    *d = (FPGA_WORD)r;
+
+        // shift difference to obtain borrow flag
+    r >>= FPGA_WORD_WIDTH;
+
+        // store borrow
+    *b_out = (r != 0);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 16x16-bit multiplier.
+//
+// Inputs are 16-bit wide, output is 32-bit wide.
+//
+// p = x * y
+//
+void fpga_lowlevel_mul16(FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p)
+//------------------------------------------------------------------------------
+{
+        // multiply and store result
+    *p = (FPGA_WORD)x * (FPGA_WORD)y;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 47-bit adder without carries.
+//
+// s = (x + y)[46:0]
+//
+void fpga_lowlevel_add47(FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s)
+//------------------------------------------------------------------------------
+{
+        // add and store result truncated to 47 bits
+    *s = (x + y) & FPGA_MASK_ADDER47;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_lowlevel.h b/ecdsa_fpga_lowlevel.h
index eeb5c4f..8d0dccd 100644
--- a/fpga_lowlevel.h
+++ b/ecdsa_fpga_lowlevel.h
@@ -1,80 +1,79 @@
-//------------------------------------------------------------------------------
-//
-// fpga_lowlevel.h
-// -----------------------------------
-// Models of Low-level FPGA primitives
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// FPGA Pipeline Settings
-//------------------------------------------------------------------------------
-#define FPGA_WORD_WIDTH		(32)
-#define OPERAND_NUM_WORDS	(OPERAND_WIDTH / FPGA_WORD_WIDTH)
-
-
-//------------------------------------------------------------------------------
-// Word Types (Normal, Extended, Reduced)
-//------------------------------------------------------------------------------
-typedef uint32_t FPGA_WORD;
-typedef uint64_t FPGA_WORD_EXTENDED;
-typedef uint16_t FPGA_WORD_REDUCED;
-
-
-//------------------------------------------------------------------------------
-// Extended Adder Mask
-//------------------------------------------------------------------------------
-#define FPGA_MASK_ADDER48	((FPGA_WORD_EXTENDED)0x0000FFFFFFFFFFFF)
-
-
-//------------------------------------------------------------------------------
-struct FPGA_BUFFER
-//------------------------------------------------------------------------------
-{
-	FPGA_WORD	words[OPERAND_NUM_WORDS];
-};
-
-
-//------------------------------------------------------------------------------
-// Prototypes
-//------------------------------------------------------------------------------
-void	fpga_lowlevel_add32		(FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out);
-void	fpga_lowlevel_sub32		(FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out);
-void	fpga_lowlevel_mul16		(FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p);
-void	fpga_lowlevel_add48		(FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s);
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+//
+// fpga_ecdsa_lowlevel.h
+// -----------------------------------
+// Models of low-level FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+
+
+//------------------------------------------------------------------------------
+// FPGA Pipeline Settings
+//------------------------------------------------------------------------------
+#define FPGA_WORD_WIDTH (32)
+
+
+//------------------------------------------------------------------------------
+// Word Types (Normal 32-bit, Extended 47-bit, Reduced 16-bit)
+//------------------------------------------------------------------------------
+typedef uint32_t FPGA_WORD;
+typedef uint64_t FPGA_WORD_EXTENDED;
+typedef uint16_t FPGA_WORD_REDUCED;
+
+
+//------------------------------------------------------------------------------
+// Wide 47-bit Adder Mask
+//------------------------------------------------------------------------------
+#define FPGA_MASK_ADDER47   ((FPGA_WORD_EXTENDED)0x00007FFFFFFFFFFF)
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_lowlevel_add32 (FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out);
+void fpga_lowlevel_sub32 (FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out);
+
+void fpga_lowlevel_mul16 (FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p);
+
+void fpga_lowlevel_add47 (FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_microcode.cpp b/ecdsa_fpga_microcode.cpp
new file mode 100644
index 0000000..f02dc8a
--- /dev/null
+++ b/ecdsa_fpga_microcode.cpp
@@ -0,0 +1,432 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_microcode.cpp
+// --------------------------------
+// Microcode Architecture for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Required for Microcode Routines
+//------------------------------------------------------------------------------
+#define USE_MICROCODE
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+// Global Buffers
+//------------------------------------------------------------------------------
+FPGA_BUFFER BUF_LO[ECDSA_UOP_OPERAND_COUNT];
+FPGA_BUFFER BUF_HI[ECDSA_UOP_OPERAND_COUNT];
+
+
+//------------------------------------------------------------------------------
+// Global Flags
+//------------------------------------------------------------------------------
+bool uop_flagz_sz;
+bool uop_flagz_rz;
+bool uop_flagz_e;
+bool uop_flagz_f;
+
+
+//------------------------------------------------------------------------------
+void uop_move(UOP_BANK src, int s_op, UOP_BANK dst, int d_op)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER *s_ptr = NULL;
+	FPGA_BUFFER *d_ptr = NULL;
+
+	if (src == BANK_LO) s_ptr = &BUF_LO[s_op];
+	if (src == BANK_HI) s_ptr = &BUF_HI[s_op];
+	if (dst == BANK_LO) d_ptr = &BUF_LO[d_op];
+	if (dst == BANK_HI) d_ptr = &BUF_HI[d_op];
+
+	fpga_multiword_copy(s_ptr, d_ptr);
+}
+
+
+//------------------------------------------------------------------------------
+void uop_cmpz(UOP_BANK src, int s_op)
+//------------------------------------------------------------------------------
+{
+    bool flagz;
+
+    FPGA_BUFFER *s_ptr = NULL;
+
+	if (src == BANK_LO)	s_ptr = &BUF_LO[s_op];
+	if (src == BANK_HI) s_ptr = &BUF_HI[s_op];
+
+    flagz = fpga_multiword_is_zero(s_ptr);
+
+    switch (s_op)
+    {
+        case CYCLE_SZ:
+            uop_flagz_sz = flagz;
+            break;
+        case CYCLE_RZ:
+            uop_flagz_rz = flagz;
+            break;
+        case CYCLE_E:
+            uop_flagz_e = flagz;
+            break;
+        case CYCLE_F:
+            uop_flagz_f = flagz;
+            break;
+    }
+}
+
+
+//------------------------------------------------------------------------------
+void uop_calc(UOP_MATH math,
+              UOP_BANK src, int s_op1, int s_op2,
+              UOP_BANK dst, int d_op)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER *s_ptr1 = NULL;
+	FPGA_BUFFER *s_ptr2 = NULL;
+	FPGA_BUFFER *d_ptr = NULL;
+	FPGA_BUFFER *n_ptr = NULL;
+
+	if (src == BANK_LO)
+	{	s_ptr1 = &BUF_LO[s_op1];
+		s_ptr2 = &BUF_LO[s_op2];
+	}
+	if (src == BANK_HI)
+	{	s_ptr1 = &BUF_HI[s_op1];
+		s_ptr2 = &BUF_HI[s_op2];
+	}
+	if (dst == BANK_LO)
+	{	d_ptr = &BUF_LO[d_op];
+	}
+	if (dst == BANK_HI)
+	{	d_ptr = &BUF_HI[d_op];
+	}
+
+	if (math == ADD) fpga_modular_add(s_ptr1, s_ptr2, d_ptr);
+	if (math == SUB) fpga_modular_sub(s_ptr1, s_ptr2, d_ptr);
+	if (math == MUL) fpga_modular_mul(s_ptr1, s_ptr2, d_ptr);
+}
+
+
+//------------------------------------------------------------------------------
+void uop_load(const FPGA_BUFFER *mem, UOP_BANK dst, int d_op)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER *d_ptr = NULL;
+	if (dst == BANK_LO) d_ptr = &BUF_LO[d_op];
+	if (dst == BANK_HI) d_ptr = &BUF_HI[d_op];
+
+	fpga_multiword_copy(mem, d_ptr);
+}
+
+
+//------------------------------------------------------------------------------
+void uop_stor(UOP_BANK src, int s_op, FPGA_BUFFER *mem)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER *s_ptr = NULL;
+	if (src == BANK_LO)
+	{	s_ptr = &BUF_LO[s_op];
+	}
+	if (src == BANK_HI)
+	{	s_ptr = &BUF_HI[s_op];
+	}
+
+	fpga_multiword_copy(s_ptr, mem);
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_modular_inv23_p256_microcode()
+//------------------------------------------------------------------------------
+//
+// This computes A2 = RZ^-2 and A3 = RZ^-3.
+//
+// RZ is read from the lower bank, A2 and A3 are written to the upper bank.
+//
+//------------------------------------------------------------------------------
+{
+	uop_loop;
+
+    //
+    // operand placement map:
+    //
+    // X1  - LO,HI (RZ)
+    // X2  - LO,HI
+    // X3  - LO,HI
+    // X6  - LO
+    // X12 - HI
+    // X15 - LO,HI
+    // X30 - HI
+    // X32 - LO,HI
+
+    /* BEGIN_MICROCODE: INVERT_P256 */
+
+    // first obtain intermediate helper quantities (X#)
+
+	// mirror X1 to HI bank (don't waste time copying to X1, just use RZ)
+	uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_RZ);
+
+	// compute X2 and mirror to the other bank
+	uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ,  BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, CYCLE_RZ, INVERT_R1, BANK_LO, INVERT_X2);
+	uop_move(BANK_LO, INVERT_X2, BANK_HI, INVERT_X2);
+
+	// compute X3 and mirror to the other bank
+	uop_calc(MUL, BANK_LO, INVERT_X2, INVERT_X2, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, CYCLE_RZ,  BANK_LO, INVERT_X3);
+	uop_move(BANK_LO, INVERT_X3, BANK_HI, INVERT_X3);
+
+	// compute X6 (stored in the lower bank)
+	uop_calc(MUL, BANK_LO, INVERT_X3, INVERT_X3, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_X3, BANK_LO, INVERT_X6);
+
+	// compute X12 (stored in the upper bank)
+	uop_calc(MUL, BANK_LO, INVERT_X6, INVERT_X6, BANK_HI, INVERT_R1);
+	uop_cycle(5);
+		uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_repeat();
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_X6, BANK_HI, INVERT_X12);
+	
+	// compute X15 and mirror to the other bank
+	uop_calc(MUL, BANK_HI, INVERT_X12, INVERT_X12, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+	uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_X3, BANK_HI, INVERT_X15);
+	uop_move(BANK_HI, INVERT_X15, BANK_LO, INVERT_X15);
+
+	// compute X30 (stored in the upper bank)
+	uop_calc(MUL, BANK_HI, INVERT_X15, INVERT_X15, BANK_LO, INVERT_R1);
+	uop_cycle(14);
+		uop_calc_if_even(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_repeat();
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_X15, BANK_HI, INVERT_X30);
+
+	// compute X32 and mirror to the other bank
+	uop_calc(MUL, BANK_HI, INVERT_X30, INVERT_X30, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+	uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_X2, BANK_LO, INVERT_X32);
+	uop_move(BANK_LO, INVERT_X32, BANK_HI, INVERT_X32);
+
+	// now compute the final results
+
+	uop_calc(MUL, BANK_LO, INVERT_X32, INVERT_X32, BANK_HI, INVERT_R1);
+
+	uop_cycle(31);
+		uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_repeat();
+
+	uop_calc(MUL, BANK_LO, INVERT_R2, CYCLE_RZ, BANK_HI, INVERT_R1);
+
+	uop_cycle(128);
+		uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_repeat();
+
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_X32, BANK_LO, INVERT_R2);
+
+	uop_cycle(32);
+		uop_calc_if_even(MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+		uop_calc_if_odd (MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+	uop_repeat();
+
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_X32, BANK_HI, INVERT_R1);
+
+	uop_cycle(30);
+		uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_repeat();
+
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_X30, BANK_LO, INVERT_R2);
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+
+    // move A2 into the upper bank
+	uop_move(BANK_LO, INVERT_R2, BANK_HI, INVERT_A2);
+
+    // A3 ends up in the upper bank by itself
+	uop_calc(MUL, BANK_HI, INVERT_A2, INVERT_A2, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, CYCLE_RZ, BANK_HI, INVERT_A3);
+
+    /* END_MICROCODE */
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_modular_inv23_p384_microcode()
+//------------------------------------------------------------------------------
+//
+// This computes A2 = RZ^-2 and A3 = RZ^-3.
+//
+// RZ is read from the lower bank, A2 and A3 are written to the upper bank.
+//
+//------------------------------------------------------------------------------
+{
+	uop_loop;
+
+    //
+    // operand placement map:
+    //
+    // X1  - LO,HI (RZ)
+    // X2  - LO,HI
+    // X3  - LO,HI
+    // X6  - LO
+    // X12 - HI
+    // X15 - LO,HI
+    // X30 - HI
+    // X32 - LO,HI
+
+    /* BEGIN_MICROCODE: INVERT_P384 */
+
+    // first obtain intermediate helper quantities (X#)
+
+	// mirror X1 to HI bank (don't waste time copying to X1, just use RZ)
+	uop_move(BANK_LO, CYCLE_RZ, BANK_HI, CYCLE_RZ);
+
+	// compute X2 and mirror to the other bank
+	uop_calc(MUL, BANK_LO, CYCLE_RZ, CYCLE_RZ,  BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, CYCLE_RZ, INVERT_R1, BANK_LO, INVERT_X2);
+	uop_move(BANK_LO, INVERT_X2, BANK_HI, INVERT_X2);
+
+	// compute X3 and mirror to the other bank
+	uop_calc(MUL, BANK_LO, INVERT_X2, INVERT_X2, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, CYCLE_RZ,  BANK_LO, INVERT_X3);
+	uop_move(BANK_LO, INVERT_X3, BANK_HI, INVERT_X3);
+
+	// compute X6 (stored in the lower bank)
+	uop_calc(MUL, BANK_LO, INVERT_X3, INVERT_X3, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_calc(MUL, BANK_HI, INVERT_R1, INVERT_X3, BANK_LO, INVERT_X6);
+
+	// compute X12 (stored in the upper bank)
+	uop_calc(MUL, BANK_LO, INVERT_X6, INVERT_X6, BANK_HI, INVERT_R1);
+	uop_cycle(5);
+		uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_repeat();
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_X6, BANK_HI, INVERT_X12);
+	
+	// compute X15 and mirror to the other bank
+	uop_calc(MUL, BANK_HI, INVERT_X12, INVERT_X12, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+	uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_X3, BANK_HI, INVERT_X15);
+	uop_move(BANK_HI, INVERT_X15, BANK_LO, INVERT_X15);
+
+	// compute X30 (stored in the upper bank)
+	uop_calc(MUL, BANK_HI, INVERT_X15, INVERT_X15, BANK_LO, INVERT_R1);
+	uop_cycle(14);
+		uop_calc_if_even(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_repeat();
+	uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_X15, BANK_HI, INVERT_X30);
+
+    // compute X60 (stored in the lower bank)
+	uop_calc(MUL, BANK_HI, INVERT_X30, INVERT_X30, BANK_LO, INVERT_R1);
+	uop_cycle(29);
+		uop_calc_if_even(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_repeat();
+	uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_X30, BANK_LO, INVERT_X60);
+
+    // compute X120 (stored in the upper bank)
+	uop_calc(MUL, BANK_LO, INVERT_X60, INVERT_X60, BANK_HI, INVERT_R1);
+	uop_cycle(59);
+		uop_calc_if_even(MUL, BANK_HI, INVERT_R1, INVERT_R1, BANK_LO, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_LO, INVERT_R2, INVERT_R2, BANK_HI, INVERT_R1);
+	uop_repeat();
+	uop_calc(MUL, BANK_LO, INVERT_R2, INVERT_X60, BANK_HI, INVERT_X120);
+
+	// now compute the final results
+
+	uop_calc(MUL, BANK_HI, INVERT_X120, INVERT_X120, BANK_LO, INVERT_R1);
+
+	uop_cycle(119);
+		uop_calc_if_even(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_repeat();
+
+    uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_X120, BANK_LO, INVERT_R1);
+
+	uop_cycle(15);
+		uop_calc_if_even(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+		uop_calc_if_odd (MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_repeat();
+
+    uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_X15, BANK_LO, INVERT_R1);
+
+    uop_cycle(31);
+	    uop_calc_if_even(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+        uop_calc_if_odd (MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+	uop_repeat();
+
+    uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_X30, BANK_LO, INVERT_R1);
+    uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+    uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+    uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_X2, BANK_HI, INVERT_R2);
+
+    uop_cycle(94);
+	    uop_calc_if_even(MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+        uop_calc_if_odd (MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+	uop_repeat();
+
+    uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_X30, BANK_LO, INVERT_R1);
+    uop_calc(MUL, BANK_LO, INVERT_R1, INVERT_R1, BANK_HI, INVERT_R2);
+    uop_calc(MUL, BANK_HI, INVERT_R2, INVERT_R2, BANK_LO, INVERT_R1);
+
+    // move A2 into the upper bank
+	uop_move(BANK_LO, INVERT_R1, BANK_HI, INVERT_A2);
+
+    // A3 ends up in the upper bank by itself
+	uop_calc(MUL, BANK_HI, INVERT_A2, INVERT_A2, BANK_LO, INVERT_R1);
+	uop_calc(MUL, BANK_LO, INVERT_R1, CYCLE_RZ, BANK_HI, INVERT_A3);
+
+    /* END_MICROCODE */
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_microcode.h b/ecdsa_fpga_microcode.h
new file mode 100644
index 0000000..f551d96
--- /dev/null
+++ b/ecdsa_fpga_microcode.h
@@ -0,0 +1,164 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_microcode.h
+// --------------------------------
+// Microcode Architecture for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdlib.h>	// NULL
+
+
+//------------------------------------------------------------------------------
+enum UOP_BANK
+//------------------------------------------------------------------------------
+{
+	BANK_LO, BANK_HI
+};
+
+//--------------------------
+enum UOP_OPERAND
+//--------------------------
+{
+	CONST_ZERO,     //  0
+	CONST_ONE,      //  1
+    CONST_DELTA,    //  2
+
+	CONST_GX,       //  3
+	CONST_GY,       //  4
+    
+	CONST_HX,       //  5
+	CONST_HY,       //  6
+
+	CYCLE_RX,       //  7
+	CYCLE_RY,       //  8
+	CYCLE_RZ,       //  9
+
+	CYCLE_SX,       // 10
+	CYCLE_SY,       // 11
+	CYCLE_SZ,       // 12
+
+    CYCLE_A,        // 13
+    CYCLE_A2,       // 14
+    CYCLE_B,        // 15
+    CYCLE_C,        // 16
+    CYCLE_C2,       // 17
+    CYCLE_C2_2,     // 18
+    CYCLE_D,        // 19
+    CYCLE_E,        // 20
+    CYCLE_F,        // 21
+    CYCLE_G,        // 22
+    CYCLE_H,        // 23
+    CYCLE_J,        // 24
+
+    CYCLE_Z2,       // 25
+
+    CYCLE_T1,       // 26
+    CYCLE_T2,       // 27
+    CYCLE_T3,       // 28
+    CYCLE_T4,       // 29
+
+	INVERT_R1,      // 30
+    INVERT_R2,      // 31
+
+	INVERT_X2,      // 32
+	INVERT_X3,      // 33
+	INVERT_X6,      // 34
+	INVERT_X12,     // 35
+	INVERT_X15,     // 36
+	INVERT_X30,     // 37
+    INVERT_X32,     // 38
+    INVERT_X60,     // 39
+	INVERT_X120,    // 40
+
+	INVERT_A2,      // 41
+    INVERT_A3,      // 42
+
+	ECDSA_UOP_OPERAND_COUNT
+};
+
+//------------------------------------------------------------------------------
+enum UOP_MATH
+//------------------------------------------------------------------------------
+{
+	ADD, SUB, MUL
+};
+
+
+//------------------------------------------------------------------------------
+// Global Storage Buffers
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER BUF_LO[ECDSA_UOP_OPERAND_COUNT];
+extern FPGA_BUFFER BUF_HI[ECDSA_UOP_OPERAND_COUNT];
+
+
+//------------------------------------------------------------------------------
+// Global Flags
+//------------------------------------------------------------------------------
+extern bool uop_flagz_sz;
+extern bool uop_flagz_rz;
+extern bool uop_flagz_e;
+extern bool uop_flagz_f;
+
+
+//------------------------------------------------------------------------------
+// Loop Macros
+//------------------------------------------------------------------------------
+#define uop_loop                int uop_cnt
+#define uop_cycle(iters);		for (uop_cnt=0; uop_cnt<iters; uop_cnt++) {
+#define uop_repeat();			}
+#define uop_calc_if_even(...)	if (!(uop_cnt % 2)) uop_calc(__VA_ARGS__)
+#define uop_calc_if_odd(...)	else uop_calc(__VA_ARGS__)
+
+
+//------------------------------------------------------------------------------
+// Prototypes (Micro-Operations)
+//------------------------------------------------------------------------------
+void uop_move(enum UOP_BANK src, int s_op1, UOP_BANK dst, int d_op1);
+void uop_cmpz(UOP_BANK src, int s_op);
+
+void uop_calc (UOP_MATH math,
+               UOP_BANK src, int s_op1, int s_op2,
+               UOP_BANK dst, int d_op);
+
+void uop_load(const FPGA_BUFFER *mem, UOP_BANK dst, int d_op);
+void uop_stor(UOP_BANK src, int s_op, FPGA_BUFFER *mem);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_model.cpp b/ecdsa_fpga_model.cpp
new file mode 100644
index 0000000..182c2ab
--- /dev/null
+++ b/ecdsa_fpga_model.cpp
@@ -0,0 +1,539 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_model.cpp
+// --------------------------------------------
+// Base point scalar multiplier model for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdlib.h>
+
+
+//------------------------------------------------------------------------------
+// Microcode Switch
+//------------------------------------------------------------------------------
+#define USE_MICROCODE
+
+
+//------------------------------------------------------------------------------
+// More Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_model_init ();
+
+bool test_base_point_multiplier (const FPGA_BUFFER *k,
+                                 const FPGA_BUFFER *qx,
+                                 const FPGA_BUFFER *qy);
+
+bool abuse_internal_point_adder   ();
+bool abuse_internal_point_doubler ();
+
+
+//------------------------------------------------------------------------------
+// Locals
+//------------------------------------------------------------------------------
+static FPGA_BUFFER ecdsa_d_nsa;
+static FPGA_BUFFER ecdsa_qx_nsa;
+static FPGA_BUFFER ecdsa_qy_nsa;
+
+static FPGA_BUFFER ecdsa_k_nsa;
+static FPGA_BUFFER ecdsa_rx_nsa;
+static FPGA_BUFFER ecdsa_ry_nsa;
+
+static FPGA_BUFFER ecdsa_d_random;
+static FPGA_BUFFER ecdsa_qx_random;
+static FPGA_BUFFER ecdsa_qy_random;
+
+
+//------------------------------------------------------------------------------
+int main()
+//------------------------------------------------------------------------------
+{
+    bool    ok;     // flag
+
+        // initialize buffers
+    fpga_multiword_init();
+    fpga_modular_init();
+    fpga_curve_init();
+    fpga_model_init();
+
+	printf("ECDSA_UOP_OPERAND_COUNT == %d\n\n", ECDSA_UOP_OPERAND_COUNT);
+
+		// test base point multiplier: Q = d * G
+    printf("Trying to derive public key from private key (NSA test vector)...\n\n");
+    ok = test_base_point_multiplier(&ecdsa_d_nsa, &ecdsa_qx_nsa, &ecdsa_qy_nsa);
+    if (!ok) return EXIT_FAILURE;
+
+        // test base point multiplier: R = k * G
+    printf("Trying to sign something (NSA test vector)...\n\n");
+    ok = test_base_point_multiplier(&ecdsa_k_nsa, &ecdsa_rx_nsa, &ecdsa_ry_nsa);
+    if (!ok) return EXIT_FAILURE;
+
+		// test base point multiplier: Q = d * G
+    printf("Trying to derive public key from private key (random test vector)...\n\n");
+    ok = test_base_point_multiplier(&ecdsa_d_random, &ecdsa_qx_random, &ecdsa_qy_random);
+    if (!ok) return EXIT_FAILURE;
+
+        // test base point multiplier: O = n * G
+    printf("Trying to multiply the base point by its order...\n\n");
+    ok = test_base_point_multiplier(&ECDSA_N, &ECDSA_ZERO, &ECDSA_ZERO);
+    if (!ok) return EXIT_FAILURE;
+
+        // now run some intricate tests...
+
+        /* Excerpt from the commit message e718fdfae6443466e566ed6ce1515cdecc215ac0:
+
+        The model does multiplication using the double-and-add algorithm. When adding
+        two points P and Q on curves P-256 and P-384, four special cases must be
+        considered. One of them is P = Q, in that situation the explicit addition
+        formulae don't work and either 2*P or 2*Q must be returned from the addition
+        routine. In this model Q is always the base point G, so when P = G, then 2*G
+        must be returned. Since G is fixed, this model stores precomputed point H = 2*G
+        and returns it when adding G + G for true constant-time operation.
+    
+        During multiplication the bits of k are scanned left-to-right, so doubling is
+        done before addition. This way the only situation when both inputs to the
+        addition routine are equal to G is when after doubling the result is G. This in
+        its turn is only possible when k = n + 2 (where n is the order of the base
+        point G). ECDSA requires integer k to be [1, n-1], one of the side effects
+        is that the model has a code path that will never be used under normal
+        operation. This code path can be verified by first multiplying by k = 2
+        (special handling for P = G not triggered), then multiplying by k = n + 2
+        (special handling for P = G triggered). Both multiplications should produce
+        the same output. In the former case the output will be calculated on-the-fly,
+        in the latter case the pre-computed coordinates of H will be used.
+         */
+
+
+		// test base point multiplier: H = 2 * G
+    FPGA_BUFFER two;
+    fpga_modular_add(&ECDSA_ONE, &ECDSA_ONE, &two);
+
+    printf("Trying to double the base point...\n\n");
+    ok = test_base_point_multiplier(&two, &ECDSA_HX, &ECDSA_HY);
+    if (!ok) return EXIT_FAILURE;
+
+		// test base point multiplier: G = (n + 1) * G
+    FPGA_BUFFER n1;
+    fpga_modular_add(&ECDSA_N, &ECDSA_ONE, &n1);	// n1 = n + 1
+
+    printf("Trying to multiply the base point by its order plus one...\n\n");
+    ok = test_base_point_multiplier(&n1, &ECDSA_GX, &ECDSA_GY);
+    if (!ok) return EXIT_FAILURE;
+
+        // test base point multiplier: H = (n + 2) * G
+    FPGA_BUFFER n2;
+    fpga_modular_add(&ECDSA_N, &two, &n2);	// n2 = n + two
+
+    printf("Trying to multiply the base point by its order plus two...\n\n");
+    ok = test_base_point_multiplier(&n2, &ECDSA_HX, &ECDSA_HY);
+    if (!ok) return EXIT_FAILURE;
+
+        // ..and some more tests
+
+        // try to abuse internal point doubler
+    ok = abuse_internal_point_doubler();
+    if (!ok) return EXIT_FAILURE;
+    
+        // try to abuse internal point adder
+    ok = abuse_internal_point_adder();
+    if (!ok) return EXIT_FAILURE;
+
+        // everything went just fine
+    return EXIT_SUCCESS;
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_model_init()
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+    FPGA_BUFFER tmp_d_nsa  = ECDSA_D_NSA_INIT, tmp_k_nsa = ECDSA_K_NSA_INIT;
+    FPGA_BUFFER tmp_qx_nsa = ECDSA_QX_NSA_INIT, tmp_rx_nsa = ECDSA_RX_NSA_INIT;
+    FPGA_BUFFER tmp_qy_nsa = ECDSA_QY_NSA_INIT, tmp_ry_nsa = ECDSA_RY_NSA_INIT;
+
+	FPGA_BUFFER tmp_d_random  = ECDSA_D_RANDOM_INIT;
+    FPGA_BUFFER tmp_qx_random = ECDSA_QX_RANDOM_INIT;
+    FPGA_BUFFER tmp_qy_random = ECDSA_QY_RANDOM_INIT;
+
+        // fill buffers for large multi-word integers
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {
+        ecdsa_d_nsa.words[w] = tmp_d_nsa.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ecdsa_qx_nsa.words[w] = tmp_qx_nsa.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ecdsa_qy_nsa.words[w] = tmp_qy_nsa.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+
+        ecdsa_k_nsa.words[w] = tmp_k_nsa.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ecdsa_rx_nsa.words[w] = tmp_rx_nsa.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ecdsa_ry_nsa.words[w] = tmp_ry_nsa.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+
+		ecdsa_d_random.words[w] = tmp_d_random.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ecdsa_qx_random.words[w] = tmp_qx_random.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+        ecdsa_qy_random.words[w] = tmp_qy_random.words[FPGA_OPERAND_NUM_WORDS - (w+1)];
+    }
+}
+
+
+//------------------------------------------------------------------------------
+bool test_base_point_multiplier (const FPGA_BUFFER *k,
+                                 const FPGA_BUFFER *qx,
+                                 const FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+//
+// k - multiplier
+//
+// qx, qy - expected coordinates of product
+//
+// Returns true when point (rx,ry) = k * G matches the point (qx,qy).
+//
+//------------------------------------------------------------------------------
+{
+    bool ok;                // flag
+    FPGA_BUFFER rx, ry;     // result
+
+        // run the model
+    fpga_curve_base_scalar_multiply(k, &rx, &ry);
+
+        // handle result
+    ok = compare_fpga_buffers(qx, qy, &rx, &ry);
+
+    if (!ok)
+    {   printf("\n    ERROR\n\n");
+        return false;
+    }
+    else printf("\n    OK\n\n");
+
+        // everything went just fine
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool compare_fpga_buffers(  const FPGA_BUFFER *az,
+                            const FPGA_BUFFER *bz)
+//------------------------------------------------------------------------------
+//
+// Compare z-coordinates of two points and return true when they match.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // print all the values
+    print_fpga_buffer("  Expected:   Z = ", az);
+    print_fpga_buffer("  Calculated: Z = ", bz);
+
+        // compare values
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {
+            // compare x
+        if (az->words[w] != bz->words[w]) return false;
+    }
+
+        // values are the same
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool compare_fpga_buffers(  const FPGA_BUFFER *ax,
+                            const FPGA_BUFFER *ay,
+                            const FPGA_BUFFER *bx,
+                            const FPGA_BUFFER *by)
+//------------------------------------------------------------------------------
+//
+// Compare affine coordinates of two points and return true when they match.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // print all the values
+    print_fpga_buffer("  Expected:   X = ", ax);
+    print_fpga_buffer("  Calculated: X = ", bx);
+    printf("\n");
+    print_fpga_buffer("  Expected:   Y = ", ay);
+    print_fpga_buffer("  Calculated: Y = ", by);
+
+        // compare values
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {
+            // compare x
+        if (ax->words[w] != bx->words[w]) return false;
+
+            // compare y
+        if (ay->words[w] != by->words[w]) return false;
+    }
+
+        // values are the same
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool compare_fpga_buffers(  const FPGA_BUFFER *ax,
+                            const FPGA_BUFFER *ay,
+                            const FPGA_BUFFER *az,
+                            const FPGA_BUFFER *bx,
+                            const FPGA_BUFFER *by,
+                            const FPGA_BUFFER *bz)
+//------------------------------------------------------------------------------
+//
+// Compare projective coordinates of two points and return true when they match.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // print all the values
+    print_fpga_buffer("  Expected:   X = ", ax);
+    print_fpga_buffer("  Calculated: X = ", bx);
+    printf("\n");
+    print_fpga_buffer("  Expected:   Y = ", ay);
+    print_fpga_buffer("  Calculated: Y = ", by);
+    printf("\n");
+    print_fpga_buffer("  Expected:   Z = ", az);
+    print_fpga_buffer("  Calculated: Z = ", bz);
+
+        // compare values
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {
+            // compare x
+        if (ax->words[w] != bx->words[w]) return false;
+
+            // compare y
+        if (ay->words[w] != by->words[w]) return false;
+
+            // compare z
+        if (az->words[w] != bz->words[w]) return false;
+    }
+
+        // values are the same
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+void print_fpga_buffer(const char *s, const FPGA_BUFFER *buf)
+//------------------------------------------------------------------------------
+//
+// Pretty print large multi-word integer.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // print header
+    printf("%s", s);
+
+        // print all bytes
+    for (w=FPGA_OPERAND_NUM_WORDS; w>0; w--)
+    {   
+        printf("%08x", buf->words[w-1]);
+
+            // insert space after every group of 4 bytes
+        if (w > 1) printf(" ");
+    }
+
+        // print footer
+    printf("\n");
+}
+
+
+//------------------------------------------------------------------------------
+void print_fpga_buffer_nodelim(const char *s, const FPGA_BUFFER *buf)
+//------------------------------------------------------------------------------
+//
+// Pretty print large multi-word integer.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // print header
+    printf("%s", s);
+
+        // print all bytes
+    for (w=FPGA_OPERAND_NUM_WORDS; w>0; w--)
+        printf("%08x", buf->words[w-1]);
+
+        // print footer
+    printf("\n");
+}
+
+
+//------------------------------------------------------------------------------
+bool abuse_internal_point_doubler()
+//------------------------------------------------------------------------------
+//
+// This routine tries to abuse the internal curve point doubler by forcing it
+// to double point at infinity.
+//
+//------------------------------------------------------------------------------
+{
+    int w;      // word counter
+    bool ok;    // flag
+    
+    FPGA_BUFFER px, py, pz;     // input
+    FPGA_BUFFER qx, qy, qz;     // output
+
+        // set P.X and P.Y to some "random" garbage and P.Z to zero
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {   px.words[w] = ECDSA_GX.words[w] ^ ECDSA_HX.words[w];
+        py.words[w] = ECDSA_GY.words[w] ^ ECDSA_HY.words[w];
+    }
+    fpga_multiword_copy(&ECDSA_ZERO, &pz);
+
+        // try to double point at infinity (should produce point at infinity)
+    printf("Trying to double something at infinity...\n\n");
+    fpga_curve_double_jacobian(&px, &py, &pz, &qx, &qy, &qz);
+
+        // handle result
+    ok = compare_fpga_buffers(&ECDSA_ZERO, &qz);
+    if (! ok)
+    {   printf("\n    ERROR\n\n");
+        return false;
+    }
+    else printf("\n    OK\n\n");
+
+        // everything went just fine
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool abuse_internal_point_adder()
+//------------------------------------------------------------------------------
+//
+// This routine tries to abuse the internal curve point adder by forcing it to
+// go throgh all the possible "corner cases".
+//
+//------------------------------------------------------------------------------
+{
+    int w;      // word counter 
+    bool ok;    // flag
+
+    FPGA_BUFFER px, py, pz;     // input
+    FPGA_BUFFER rx, ry, rz;     // output
+
+    //
+    // try to add point at infinity to the base point
+    //
+    {
+            // set P.X and P.Y to some "random" garbage and P.Z to zero
+        for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+        {   px.words[w] = ECDSA_GX.words[w] ^ ECDSA_HX.words[w];
+            py.words[w] = ECDSA_GY.words[w] ^ ECDSA_HY.words[w];
+        }
+        fpga_multiword_copy(&ECDSA_ZERO, &pz);
+
+            // run addition proceduce
+        printf("Trying to add something at infinity to the base point...\n\n");
+        fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
+
+            // handle result
+        ok = compare_fpga_buffers(&ECDSA_GX, &ECDSA_GY, &ECDSA_ONE, &rx, &ry, &rz);
+        if (! ok)
+        {   printf("\n    ERROR\n\n");
+            return false;
+        }
+        else printf("\n    OK\n\n");
+    }
+
+    //
+    // try to add the base point to itself
+    //
+    {
+            // set P to G
+        fpga_multiword_copy(&ECDSA_GX, &px);
+        fpga_multiword_copy(&ECDSA_GY, &py);
+        fpga_multiword_copy(&ECDSA_ONE, &pz);
+
+            // run addition proceduce
+        printf("Trying to add the base point to itself...\n\n");
+        fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
+
+            // handle result
+        ok = compare_fpga_buffers(&ECDSA_HX, &ECDSA_HY, &ECDSA_ONE, &rx, &ry, &rz);
+        if (! ok)
+        {   printf("\n    ERROR\n\n");
+            return false;
+        }
+        else printf("\n    OK\n\n");
+    }
+
+    //
+    // try to add the base point to its opposite
+    //
+    {
+            // set P to (G.X, -G.Y, 1)
+        fpga_multiword_copy(&ECDSA_ZERO, &px);
+        fpga_multiword_copy(&ECDSA_ZERO, &py);
+        fpga_multiword_copy(&ECDSA_ONE,  &pz);
+
+        fpga_modular_add(&ECDSA_ZERO, &ECDSA_GX, &px);
+        fpga_modular_sub(&ECDSA_ZERO, &ECDSA_GY, &py);
+
+            // run addition proceduce
+        printf("Trying to add the base point to its opposite...\n\n");
+        fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
+
+            // handle result
+        ok = compare_fpga_buffers(&ECDSA_ONE, &ECDSA_ONE, &ECDSA_ZERO, &rx, &ry, &rz);
+        if (! ok)
+        {   printf("\n    ERROR\n\n");
+            return false;
+        }
+        else printf("\n    OK\n\n");
+    }
+
+        // everything went just fine
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_model.h b/ecdsa_fpga_model.h
new file mode 100644
index 0000000..ea045b9
--- /dev/null
+++ b/ecdsa_fpga_model.h
@@ -0,0 +1,141 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_model.h
+// --------------------------------------------
+// Base point scalar multiplier model for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+//
+// Curve Selection (don't override if already selected, handy for testing
+// from the STM32 sample driver)
+//
+// USE_CURVE == 1  -> P-256
+// USE_CURVE == 2  -> P-384
+//
+//------------------------------------------------------------------------------
+#ifndef USE_CURVE
+#define USE_CURVE   2
+#endif
+//------------------------------------------------------------------------------
+#define BAD_CURVE   #error USE_CURVE must be either 1 or 2!
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_lowlevel.h"
+#include "ecdsa_fpga_multiword.h"
+#include "ecdsa_fpga_modular.h"
+#include "ecdsa_fpga_curve.h"
+#ifdef USE_MICROCODE
+#include "ecdsa_fpga_microcode.h"
+#endif
+
+
+//------------------------------------------------------------------------------
+// Test Vectors
+//------------------------------------------------------------------------------
+#include "ecdsa_test_vector_nsa.h"
+#include "ecdsa_test_vector_randomized.h"
+
+
+//------------------------------------------------------------------------------
+// Test Vectors Switch
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define ECDSA_D_NSA_INIT		ECDSA_P256_D_NSA_INIT
+#define ECDSA_QX_NSA_INIT		ECDSA_P256_QX_NSA_INIT
+#define ECDSA_QY_NSA_INIT		ECDSA_P256_QY_NSA_INIT
+
+#define ECDSA_K_NSA_INIT		ECDSA_P256_K_NSA_INIT
+#define ECDSA_RX_NSA_INIT		ECDSA_P256_RX_NSA_INIT
+#define ECDSA_RY_NSA_INIT		ECDSA_P256_RY_NSA_INIT
+
+#define ECDSA_D_RANDOM_INIT		ECDSA_P256_D_RANDOM_INIT
+#define ECDSA_QX_RANDOM_INIT	ECDSA_P256_QX_RANDOM_INIT
+#define ECDSA_QY_RANDOM_INIT	ECDSA_P256_QY_RANDOM_INIT
+
+#elif USE_CURVE == 2
+
+#define ECDSA_D_NSA_INIT		ECDSA_P384_D_NSA_INIT
+#define ECDSA_QX_NSA_INIT		ECDSA_P384_QX_NSA_INIT
+#define ECDSA_QY_NSA_INIT		ECDSA_P384_QY_NSA_INIT
+
+#define ECDSA_K_NSA_INIT		ECDSA_P384_K_NSA_INIT
+#define ECDSA_RX_NSA_INIT		ECDSA_P384_RX_NSA_INIT
+#define ECDSA_RY_NSA_INIT		ECDSA_P384_RY_NSA_INIT
+
+#define ECDSA_D_RANDOM_INIT		ECDSA_P384_D_RANDOM_INIT
+#define ECDSA_QX_RANDOM_INIT	ECDSA_P384_QX_RANDOM_INIT
+#define ECDSA_QY_RANDOM_INIT	ECDSA_P384_QY_RANDOM_INIT
+
+#else
+
+BAD_CURVE
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void print_fpga_buffer    (const char        *s,
+                           const FPGA_BUFFER *v);
+
+void print_fpga_buffer_nodelim    (const char        *s,
+                           const FPGA_BUFFER *v);
+
+bool compare_fpga_buffers (const FPGA_BUFFER *az,
+                           const FPGA_BUFFER *bz);
+
+bool compare_fpga_buffers (const FPGA_BUFFER *ax,
+                           const FPGA_BUFFER *ay,
+                           const FPGA_BUFFER *bx,
+                           const FPGA_BUFFER *by);
+
+bool compare_fpga_buffers (const FPGA_BUFFER *ax,
+                           const FPGA_BUFFER *ay,
+                           const FPGA_BUFFER *az,
+                           const FPGA_BUFFER *bx,
+                           const FPGA_BUFFER *by,
+                           const FPGA_BUFFER *bz);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_modular.cpp b/ecdsa_fpga_modular.cpp
new file mode 100644
index 0000000..9d22c05
--- /dev/null
+++ b/ecdsa_fpga_modular.cpp
@@ -0,0 +1,723 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_modular.cpp
+// -------------------------------------
+// Modular arithmetic routines for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ECDSA_Q;
+FPGA_BUFFER ECDSA_DELTA;
+
+
+//------------------------------------------------------------------------------
+void fpga_modular_init()
+//------------------------------------------------------------------------------
+{
+    int w_src, w_dst;   // word counters
+
+        // temporary things
+    FPGA_WORD TMP_Q    [FPGA_OPERAND_NUM_WORDS] = ECDSA_Q_INIT;
+    FPGA_WORD TMP_DELTA[FPGA_OPERAND_NUM_WORDS] = ECDSA_DELTA_INIT;
+
+        /* fill buffers for large multi-word integers, we need to fill them in
+         * reverse order because of the way C arrays are initialized
+         */
+    for (   w_src = 0, w_dst = FPGA_OPERAND_NUM_WORDS - 1;
+            w_src < FPGA_OPERAND_NUM_WORDS;
+            w_src++, w_dst--)
+    {
+        ECDSA_Q.words[w_dst] = TMP_Q[w_src];
+        ECDSA_DELTA.words[w_dst] = TMP_DELTA[w_src];
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular addition.
+//
+// This routine implements algorithm 3. from "Ultra High Performance ECC over
+// NIST Primes on Commercial FPGAs".
+//
+// s = (a + b) mod q
+//
+// The naive approach is like the following:
+//
+// 1. s = a + b
+// 2. if (s >= q) s -= q
+//
+// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q)
+// and then select the right variant.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_add(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *s)
+//------------------------------------------------------------------------------
+{
+    int w;                  // word counter
+    FPGA_BUFFER ab, ab_n;   // intermediate buffers
+    bool c_in, c_out;       // carries
+    bool b_in, b_out;       // borrows
+
+    c_in = false;           // first word has no carry
+    b_in = false;           // first word has no borrow
+    
+        // run parallel addition and subtraction
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {
+        fpga_lowlevel_add32(a->words[w], b->words[w],      c_in, &ab.words[w],   &c_out);
+        fpga_lowlevel_sub32(ab.words[w], ECDSA_Q.words[w], b_in, &ab_n.words[w], &b_out);
+
+        c_in = c_out;   // propagate carry
+        b_in = b_out;   // propagate borrow
+    }
+
+        // now select the right buffer
+
+        /*
+         * We select the right variant based on borrow and carry flags after
+         * addition and subtraction of the very last pair of words. Note, that
+         * we only need to select the first variant (a + b) when (a + b) < q.
+         * This way if we get negative number after subtraction, we discard it
+         * and use the output of the adder instead. The subtractor output is
+         * negative when borrow flag is set *and* carry flag is not set. When
+         * both borrow and carry are set, the number is non-negative, because
+         * borrow and carry cancel each other out.
+         */
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+        s->words[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular subtraction.
+//
+// This routine implements algorithm 3. from "Ultra High Performance ECC over
+// NIST Primes on Commercial FPGAs".
+//
+// d = (a - b) mod q
+//
+// The naive approach is like the following:
+//
+// 1. d = a - b
+// 2. if (a < b) d += q
+//
+// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q)
+// and then select the right variant.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_sub(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *d)
+//------------------------------------------------------------------------------
+{
+    int w;                  // word counter
+    FPGA_BUFFER ab, ab_n;   // intermediate buffers
+    bool c_in, c_out;       // carries
+    bool b_in, b_out;       // borrows
+
+    c_in = false;           // first word has no carry
+    b_in = false;           // first word has no borrow
+    
+        // run parallel subtraction and addition
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+    {
+        fpga_lowlevel_sub32(a->words[w], b->words[w],      b_in, &ab.words[w],   &b_out);
+        fpga_lowlevel_add32(ab.words[w], ECDSA_Q.words[w], c_in, &ab_n.words[w], &c_out);
+
+        b_in = b_out;   // propagate borrow
+        c_in = c_out;   // propagate carry
+    }
+
+        // now select the right buffer
+
+        /*
+         * We select the right variant based on borrow flag after subtraction
+         * and addition of the very last pair of words. Note, that we only
+         * need to select the second variant (a - b + q) when a < b. This way
+         * if we get negative number after subtraction, we discard it
+         * and use the output of the adder instead. The Subtractor output is
+         * negative when borrow flag is set.
+         */
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+        d->words[w] = b_out ? ab_n.words[w] : ab.words[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular multiplication.
+//
+// This routine implements modular multiplication algorithm from "Ultra High
+// Performance ECC over NIST Primes on Commercial FPGAs".
+//
+// p = (a * b) mod q
+//
+// The complex algorithm is split into three parts:
+//
+// 1. Calculation of partial words
+// 2. Acccumulation of partial words into full-size product
+// 3. Modular reduction of the full-size product
+//
+// See comments for corresponding helper routines for more information.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_BUFFER *p)
+//------------------------------------------------------------------------------
+{
+    FPGA_WORD_EXTENDED si[4*FPGA_OPERAND_NUM_WORDS-1];  // parts of intermediate product
+    FPGA_WORD c[2*FPGA_OPERAND_NUM_WORDS];              // full-size intermediate product
+
+        /* multiply to get partial words */
+    fpga_modular_mul_helper_multiply(a, b, si);
+
+        /* accumulate partial words into full-size product */
+    fpga_modular_mul_helper_accumulate(si, c);
+
+        /* reduce full-size product using special routine */
+    fpga_modular_mul_helper_reduce(c, p);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Parallelized multiplication.
+//
+// This routine implements the algorithm in Fig. 3. from "Ultra High
+// Performance ECC over NIST Primes on Commercial FPGAs".
+//
+// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2
+// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide
+// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS
+// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial
+// output words si[].
+//
+// The order of loading a and b into the multipliers is a bit complicated,
+// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is
+// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the
+// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul_helper_multiply(const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si)
+//------------------------------------------------------------------------------
+{
+    int w;          // counter
+    int t, x;       // more counters
+    int j, i;       // word indices
+    FPGA_WORD p;    // product
+
+        // buffers for smaller words that multipliers can handle
+    FPGA_WORD_REDUCED ai[2*FPGA_OPERAND_NUM_WORDS];
+    FPGA_WORD_REDUCED bj[2*FPGA_OPERAND_NUM_WORDS];
+    
+        // split a and b into smaller words
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+        ai[2*w] = (FPGA_WORD_REDUCED)a->words[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)),
+        bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2));
+
+        // accumulators
+    FPGA_WORD_EXTENDED mac[2*FPGA_OPERAND_NUM_WORDS];
+    
+        // clear accumulators
+    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++) mac[w] = 0;
+
+        // run the crazy algorithm :)
+    for (t=0; t<(2*FPGA_OPERAND_NUM_WORDS); t++)
+    {
+            // save upper half of si[] (one word per cycle)
+        if (t > 0)
+        {   si[4*FPGA_OPERAND_NUM_WORDS - (t+1)] = mac[t];
+            mac[t] = 0;
+        }
+
+            // update index
+        j = 2*FPGA_OPERAND_NUM_WORDS - (t+1);
+
+            // parallel multiplication
+        for (x=0; x<(2*FPGA_OPERAND_NUM_WORDS); x++)
+        {
+                // update index
+            i = t - x;
+            if (i < 0) i += 2*FPGA_OPERAND_NUM_WORDS;
+
+                // multiply...
+            fpga_lowlevel_mul16(ai[i], bj[j], &p);
+
+                // ...accumulate
+            mac[x] += p;
+        }
+
+    }
+
+        // now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once)
+    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++)
+        si[w] = mac[2*FPGA_OPERAND_NUM_WORDS - (w+1)];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Accumulation of partial words into full-size product.
+//
+// This routine implements the Algorithm 4. from "Ultra High Performance ECC
+// over NIST Primes on Commercial FPGAs".
+//
+// Input words si[] are accumulated into full-size product c[].
+//
+// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in
+// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are
+// summed in full, odd words are split into two parts. During every cycle the
+// upper part of the previous word and the lower part of the following word are
+// summed too.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul_helper_accumulate(const FPGA_WORD_EXTENDED *si, FPGA_WORD *c)
+//------------------------------------------------------------------------------
+{
+    int w;                          // word counter
+    FPGA_WORD_EXTENDED cw0, cw1;    // intermediate sums
+    FPGA_WORD_REDUCED  cw_carry;    // wide carry
+
+        // clear carry
+    cw_carry = 0;
+
+        // execute the algorithm
+    for (w=0; w<(2*FPGA_OPERAND_NUM_WORDS); w++)
+    {
+            // handy flags
+        bool w_is_first = (w == 0);
+        bool w_is_last  = (w == (2*FPGA_OPERAND_NUM_WORDS-1));
+
+            // accumulate full current even word...
+            // ...and also the upper part of the previous odd word (if not the first word)
+        fpga_lowlevel_add47(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0);
+
+            // generate another word from "carry" part of the previous even word...
+            // ...and also the lower part of the following odd word (if not the last word)
+        cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2));
+        cw1 |= (FPGA_WORD_EXTENDED)cw_carry;
+
+            // accumulate once again
+        fpga_lowlevel_add47(cw0, cw1, &cw1);
+
+            // store current word...
+        c[w] = (FPGA_WORD)cw1;
+
+            // ...and carry
+        cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH);
+    }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Fast modular reduction for NIST prime P-256.
+//
+// p = c mod p256
+//
+// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
+// on the other hand is the output of the parallelized Comba multiplier, so it
+// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
+//
+// To save FPGA resources, the calculation is done using only two adders and
+// one subtractor. The algorithm is split into five steps.
+//
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+void fpga_modular_mul_helper_reduce_p256(const FPGA_WORD *c, FPGA_BUFFER *p)
+{
+        // "funny" words
+    FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9;
+
+        // compose "funny" words out of input words
+    s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0];
+    s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0,     s2.words[1] = 0,     s2.words[0] = 0;
+    s3.words[7] = 0,     s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0,     s3.words[1] = 0,     s3.words[0] = 0;
+    s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0,     s4.words[4] = 0,     s4.words[3] = 0,     s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8];
+    s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9];
+    s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0,     s6.words[4] = 0,     s6.words[3] = 0,     s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11];
+    s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0,     s7.words[4] = 0,     s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12];
+    s8.words[7] = c[12], s8.words[6] = 0,     s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13];
+    s9.words[7] = c[13], s9.words[6] = 0,     s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0,     s9.words[1] = c[15], s9.words[0] = c[14];
+    
+        // intermediate results
+    FPGA_BUFFER sum0, sum1, difference;
+
+        /* Step 1. */
+    fpga_modular_add(&s2,         &s2,         &sum0);          // sum0 = 2*s2
+    fpga_modular_add(&s3,         &s3,         &sum1);          // sum1 = 2*s3
+    fpga_modular_sub(&ECDSA_ZERO, &s6,         &difference);    // difference = -s6
+
+        /* Step 2. */
+    fpga_modular_add(&sum0,       &s1,         &sum0);          // sum0 = s1 + 2*s2
+    fpga_modular_add(&sum1,       &s4,         &sum1);          // sum1 = s4 + 2*s3
+    fpga_modular_sub(&difference, &s7,         &difference);    // difference = -(s6 + s7)
+
+        /* Step 3. */
+    fpga_modular_add(&sum0,       &s5,         &sum0);          // sum0 = s1 + 2*s2 + s5
+    fpga_modular_add(&sum1,       &ECDSA_ZERO, &sum1);          // compulsory cycle to keep sum1 constant for next stage
+    fpga_modular_sub(&difference, &s8,         &difference);    // difference = -(s6 + s7 + s8)
+
+        /* Step 4. */
+    fpga_modular_add(&sum0,       &sum1,       &sum0);          // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
+//  fpga_modular_add(<dummy>,     <dummy>,     &sum1);          // dummy cycle, result ignored
+    fpga_modular_sub(&difference, &s9,         &difference);    // difference = -(s6 + s7 + s8 + s9)
+
+        /* Step 5. */
+    fpga_modular_add(&sum0,       &difference, p);              // p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
+//  fpga_modular_add(<dummy>,     <dummy>,     &sum1);          // dummy cycle, result ignored
+//  fpga_modular_add(<dummy>,     <dummy>,     &difference);    // dummy cycle, result ignored
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+//
+// Fast modular reduction for NIST prime P-384.
+//
+// p = c mod p384
+//
+// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
+// on the other hand is the output of the parallelized Comba multiplier, so it
+// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
+//
+// To save FPGA resources, the calculation is done using only two adders and
+// one subtractor. The algorithm is split into five steps.
+//
+//------------------------------------------------------------------------------
+#if USE_CURVE == 2
+void fpga_modular_mul_helper_reduce_p384(const FPGA_WORD *c, FPGA_BUFFER *p)
+{
+		// "funny" words
+	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+		// compose "funny" words
+	 s1.words[11] = c[11],   s1.words[10] = c[10],   s1.words[ 9] = c[ 9],   s1.words[ 8] = c[ 8],   s1.words[ 7] = c[ 7],   s1.words[ 6] = c[ 6],   s1.words[ 5] = c[ 5],   s1.words[ 4] = c[ 4],   s1.words[ 3] = c[ 3],   s1.words[ 2] = c[ 2],   s1.words[ 1] = c[ 1],   s1.words[ 0] = c[ 0];
+	 s2.words[11] = 0,       s2.words[10] = 0,       s2.words[ 9] = 0,       s2.words[ 8] = 0,       s2.words[ 7] = 0,       s2.words[ 6] = c[23],   s2.words[ 5] = c[22],   s2.words[ 4] = c[21],   s2.words[ 3] = 0,       s2.words[ 2] = 0,       s2.words[ 1] = 0,       s2.words[ 0] = 0;
+	 s3.words[11] = c[23],   s3.words[10] = c[22],   s3.words[ 9] = c[21],   s3.words[ 8] = c[20],   s3.words[ 7] = c[19],   s3.words[ 6] = c[18],   s3.words[ 5] = c[17],   s3.words[ 4] = c[16],   s3.words[ 3] = c[15],   s3.words[ 2] = c[14],   s3.words[ 1] = c[13],   s3.words[ 0] = c[12];
+	 s4.words[11] = c[20],   s4.words[10] = c[19],   s4.words[ 9] = c[18],   s4.words[ 8] = c[17],   s4.words[ 7] = c[16],   s4.words[ 6] = c[15],   s4.words[ 5] = c[14],   s4.words[ 4] = c[13],   s4.words[ 3] = c[12],   s4.words[ 2] = c[23],   s4.words[ 1] = c[22],   s4.words[ 0] = c[21];
+	 s5.words[11] = c[19],   s5.words[10] = c[18],   s5.words[ 9] = c[17],   s5.words[ 8] = c[16],   s5.words[ 7] = c[15],   s5.words[ 6] = c[14],   s5.words[ 5] = c[13],   s5.words[ 4] = c[12],   s5.words[ 3] = c[20],   s5.words[ 2] = 0,       s5.words[ 1] = c[23],   s5.words[ 0] = 0;
+	 s6.words[11] = 0,       s6.words[10] = 0,       s6.words[ 9] = 0,       s6.words[ 8] = 0,       s6.words[ 7] = c[23],   s6.words[ 6] = c[22],   s6.words[ 5] = c[21],   s6.words[ 4] = c[20],   s6.words[ 3] = 0,       s6.words[ 2] = 0,       s6.words[ 1] = 0,       s6.words[ 0] = 0;
+	 s7.words[11] = 0,       s7.words[10] = 0,       s7.words[ 9] = 0,       s7.words[ 8] = 0,       s7.words[ 7] = 0,       s7.words[ 6] = 0,       s7.words[ 5] = c[23],   s7.words[ 4] = c[22],   s7.words[ 3] = c[21],   s7.words[ 2] = 0,       s7.words[ 1] = 0,       s7.words[ 0] = c[20];
+	 s8.words[11] = c[22],   s8.words[10] = c[21],   s8.words[ 9] = c[20],   s8.words[ 8] = c[19],   s8.words[ 7] = c[18],   s8.words[ 6] = c[17],   s8.words[ 5] = c[16],   s8.words[ 4] = c[15],   s8.words[ 3] = c[14],   s8.words[ 2] = c[13],   s8.words[ 1] = c[12],   s8.words[ 0] = c[23];
+	 s9.words[11] = 0,       s9.words[10] = 0,       s9.words[ 9] = 0,       s9.words[ 8] = 0,       s9.words[ 7] = 0,       s9.words[ 6] = 0,       s9.words[ 5] = 0,       s9.words[ 4] = c[23],   s9.words[ 3] = c[22],   s9.words[ 2] = c[21],   s9.words[ 1] = c[20],   s9.words[ 0] = 0;
+	s10.words[11] = 0,      s10.words[10] = 0,      s10.words[ 9] = 0,      s10.words[ 8] = 0,      s10.words[ 7] = 0,      s10.words[ 6] = 0,      s10.words[ 5] = 0,      s10.words[ 4] = c[23],  s10.words[ 3] = c[23],  s10.words[ 2] = 0,      s10.words[ 1] = 0,      s10.words[ 0] = 0;
+
+		// intermediate results
+	FPGA_BUFFER sum0, sum1, difference;
+
+		/* Step 1. */
+	fpga_modular_add(&s1,         &s3,         &sum0);			// sum0 = s1 + s3
+	fpga_modular_add(&s2,         &s2,         &sum1);			// sum1 = 2*s2
+	fpga_modular_sub(&ECDSA_ZERO, &s8,         &difference);	// difference = -s8
+
+		/* Step 2. */
+	fpga_modular_add(&sum0,       &s4,         &sum0);			// sum0 = s1 + s3 + s4
+	fpga_modular_add(&sum1,       &s5,         &sum1);			// sum1 = 2*s2 + s5
+	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s8 + s9)
+
+		/* Step 3. */
+	fpga_modular_add(&sum0,       &s6,         &sum0);			// sum0 = s1 + s3 + s4 + s6
+	fpga_modular_add(&sum1,       &s7,         &sum1);			// sum1 = 2*s2 + s5 + s7
+	fpga_modular_sub(&difference, &s10,        &difference);	// difference = -(s8 + s9 + s10)
+
+		/* Step 4. */
+	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
+//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
+	fpga_modular_sub(&difference, &ECDSA_ZERO, &difference);	// compulsory cycle to keep difference constant for next stage
+
+		/* Step 5. */
+	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
+//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
+//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
+}
+#endif
+
+
+#if USE_CURVE == 1
+//------------------------------------------------------------------------------
+void fpga_modular_inv23_p256(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
+//------------------------------------------------------------------------------
+//
+// This uses the addition chain from
+//
+// < https://briansmith.org/ecc-inversion-addition-chains-01 >
+//
+// to calculate A2 = A^-2 and A3 = A^-3.
+//
+//------------------------------------------------------------------------------
+{
+    // counter
+	int cyc_cnt;
+
+    // working variables
+	FPGA_BUFFER R1, R2, X1, X2, X3, X6, X12, X15, X30, X32;
+
+    // first obtain intermediate helper quantities (X1..X32)
+
+    // X1
+	fpga_multiword_copy(A, &X1);
+
+    // X2
+	fpga_modular_mul(&X1, &X1, &R1);
+	fpga_modular_mul(&R1, &X1, &X2);
+
+    // X3
+	fpga_modular_mul(&X2, &X2, &R1);
+	fpga_modular_mul(&R1, &X1, &X3);
+
+    // X6
+	fpga_multiword_copy(&X3, &R1);
+	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R2, &X3, &X6);
+
+    // X12
+	fpga_multiword_copy(&X6, &R1);
+	for (cyc_cnt=0; cyc_cnt<6; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X6, &X12);
+
+    // X15
+	fpga_multiword_copy(&X12, &R1);
+	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R2, &X3, &X15);
+
+    // X30
+	fpga_multiword_copy(&X15, &R1);
+	for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R2, &X15, &X30);
+
+    // X32
+	fpga_multiword_copy(&X30, &R1);
+	for (cyc_cnt=0; cyc_cnt<2; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X2, &X32);
+
+	// now compute the final results
+
+	fpga_multiword_copy(&X32, &R1);
+	for (cyc_cnt=0; cyc_cnt<32; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X1, &R2);
+
+	for (cyc_cnt=0; cyc_cnt<128; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
+		else			    fpga_modular_mul(&R1, &R1, &R2);
+	}
+	fpga_modular_mul(&R2, &X32, &R1);
+
+	for (cyc_cnt=0; cyc_cnt<32; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X32, &R2);
+
+	for (cyc_cnt=0; cyc_cnt<30; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
+		else			    fpga_modular_mul(&R1, &R1, &R2);
+	}
+	fpga_modular_mul(&R2, &X30, &R1);
+
+	fpga_modular_mul(&R1, &R1, &R2);
+	fpga_modular_mul(&R2, &R2, &R1);
+
+    // A2 obtained
+	fpga_multiword_copy(&R1, A2);
+
+    // now calculate compute inverse cubed from inverse squared
+	fpga_modular_mul(&R1, &R1, &R2);
+	fpga_modular_mul(&R2, A,   &R1);
+
+    // A3 obtained
+	fpga_multiword_copy(&R1, A3);
+}
+#endif
+
+
+#if USE_CURVE == 2
+//------------------------------------------------------------------------------
+void fpga_modular_inv23_p384(const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3)
+//------------------------------------------------------------------------------
+//
+// This uses the addition chain from
+//
+// < https://briansmith.org/ecc-inversion-addition-chains-01 >
+//
+// to calculate A2 = A^-2 and A3 = A^-3.
+//
+//------------------------------------------------------------------------------
+{
+    // counter
+	int cyc_cnt;
+
+    // working variables
+	FPGA_BUFFER R1, R2, X1, X2, X3, X6, X12, X15, X30, X60, X120;
+
+    // first obtain intermediate helper quantities (X1..X120)
+
+    // X1
+	fpga_multiword_copy(A, &X1);
+
+    // X2
+	fpga_modular_mul(&X1, &X1, &R1);
+	fpga_modular_mul(&R1, &X1, &X2);
+
+    // X3
+	fpga_modular_mul(&X2, &X2, &R1);
+	fpga_modular_mul(&R1, &X1, &X3);
+
+    // X6
+	fpga_multiword_copy(&X3, &R1);
+	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R2, &X3, &X6);
+
+    // X12
+	fpga_multiword_copy(&X6, &R1);
+	for (cyc_cnt=0; cyc_cnt<6; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X6, &X12);
+
+    // X15
+	fpga_multiword_copy(&X12, &R1);
+	for (cyc_cnt=0; cyc_cnt<3; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R2, &X3, &X15);
+
+    // X30
+	fpga_multiword_copy(&X15, &R1);
+	for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R2, &X15, &X30);
+
+    // X60
+	fpga_multiword_copy(&X30, &R1);
+	for (cyc_cnt=0; cyc_cnt<30; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X30, &X60);
+
+    // X120
+	fpga_multiword_copy(&X60, &R1);
+	for (cyc_cnt=0; cyc_cnt<60; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X60, &X120);
+
+	// now compute the final results
+
+	fpga_multiword_copy(&X120, &R1);
+	for (cyc_cnt=0; cyc_cnt<120; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+	fpga_modular_mul(&R1, &X120, &R2);
+
+    for (cyc_cnt=0; cyc_cnt<15; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
+		else			    fpga_modular_mul(&R1, &R1, &R2);
+	}
+    fpga_modular_mul(&R1, &X15, &R2);
+
+    for (cyc_cnt=0; cyc_cnt<31; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R2, &R2, &R1);
+		else			    fpga_modular_mul(&R1, &R1, &R2);
+	}
+    fpga_modular_mul(&R1, &X30, &R2);
+    fpga_modular_mul(&R2, &R2, &R1);
+    fpga_modular_mul(&R1, &R1, &R2);
+    fpga_modular_mul(&R2, &X2, &R1);
+
+    for (cyc_cnt=0; cyc_cnt<94; cyc_cnt++)
+	{	if (!(cyc_cnt % 2))	fpga_modular_mul(&R1, &R1, &R2);
+		else			    fpga_modular_mul(&R2, &R2, &R1);
+	}
+    fpga_modular_mul(&R1, &X30, &R2);
+    fpga_modular_mul(&R2, &R2, &R1);
+    fpga_modular_mul(&R1, &R1, &R2);
+
+    // A2 obtained
+	fpga_multiword_copy(&R2, A2);
+
+    // now calculate compute inverse cubed from inverse squared
+	fpga_modular_mul(&R2, &R2, &R1);
+	fpga_modular_mul(&R1, A,   &R2);
+
+    // A3 obtained
+	fpga_multiword_copy(&R2, A3);
+}
+#endif
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_fpga_modular.h b/ecdsa_fpga_modular.h
new file mode 100644
index 0000000..3b75779
--- /dev/null
+++ b/ecdsa_fpga_modular.h
@@ -0,0 +1,144 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_fpga_modular.h
+// -------------------------------------
+// Modular arithmetic routines for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// ECDSA Parameters (P-256)
+//------------------------------------------------------------------------------
+
+/* Field Size */
+#define ECDSA_P256_Q_INIT \
+    {0xffffffff, 0x00000001, 0x00000000, 0x00000000, \
+     0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}
+
+/* Division Factor  */
+#define ECDSA_P256_DELTA_INIT \
+    {0x7fffffff, 0x80000000, 0x80000000, 0x00000000, \
+     0x00000000, 0x80000000, 0x00000000, 0x00000000}
+
+
+//------------------------------------------------------------------------------
+// ECDSA Parameters (P-384)
+//------------------------------------------------------------------------------
+
+/* Field Size */
+#define ECDSA_P384_Q_INIT \
+    {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+     0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, \
+     0xffffffff, 0x00000000, 0x00000000, 0xffffffff}
+
+/* Division Factor  */
+#define ECDSA_P384_DELTA_INIT \
+    {0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+     0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+     0x7fffffff, 0x80000000, 0x00000000, 0x80000000}
+
+
+//------------------------------------------------------------------------------
+// ECDSA Parameters Switch
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define ECDSA_Q_INIT        ECDSA_P256_Q_INIT
+#define ECDSA_DELTA_INIT    ECDSA_P256_DELTA_INIT
+
+#elif USE_CURVE == 2
+
+#define ECDSA_Q_INIT        ECDSA_P384_Q_INIT
+#define ECDSA_DELTA_INIT    ECDSA_P384_DELTA_INIT
+
+#else
+
+BAD_CURVE
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ECDSA_Q;
+extern FPGA_BUFFER ECDSA_DELTA;
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_modular_init ();
+
+void fpga_modular_add (const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_BUFFER *S);
+void fpga_modular_sub (const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_BUFFER *D);
+void fpga_modular_mul (const FPGA_BUFFER *A, const FPGA_BUFFER *B, FPGA_BUFFER *P);
+
+void fpga_modular_mul_helper_multiply    (const FPGA_BUFFER *a, const FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si);
+void fpga_modular_mul_helper_accumulate  (const FPGA_WORD_EXTENDED *si, FPGA_WORD *c);
+void fpga_modular_mul_helper_reduce_p256 (const FPGA_WORD *c, FPGA_BUFFER *p);
+void fpga_modular_mul_helper_reduce_p384 (const FPGA_WORD *c, FPGA_BUFFER *p);
+
+void fpga_modular_inv23_p256 (const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3);
+void fpga_modular_inv23_p384 (const FPGA_BUFFER *A, FPGA_BUFFER *A2, FPGA_BUFFER *A3);
+
+void fpga_modular_inv23_p256_microcode ();
+void fpga_modular_inv23_p384_microcode ();
+
+
+//------------------------------------------------------------------------------
+// ECDSA Prototype Switch
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p256
+#define fpga_modular_inv23             fpga_modular_inv23_p256
+#define fpga_modular_inv23_microcode   fpga_modular_inv23_p256_microcode
+
+#elif USE_CURVE == 2
+
+#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p384
+#define fpga_modular_inv23             fpga_modular_inv23_p384
+#define fpga_modular_inv23_microcode   fpga_modular_inv23_p384_microcode
+
+#else
+
+BAD_CURVE
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_modular.h b/ecdsa_fpga_multiword.cpp
index 4d31725..ab65e7a 100644
--- a/fpga_modular.h
+++ b/ecdsa_fpga_multiword.cpp
@@ -1,87 +1,107 @@
-//------------------------------------------------------------------------------
-//
-// fpga_modular.h
-// ---------------------------
-// Modular arithmetic routines
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Globals
-//------------------------------------------------------------------------------
-extern FPGA_BUFFER ecdsa_q;
-extern FPGA_BUFFER ecdsa_zero;
-extern FPGA_BUFFER ecdsa_one;
-extern FPGA_BUFFER ecdsa_delta;
-
-
-//------------------------------------------------------------------------------
-// Prototypes
-//------------------------------------------------------------------------------
-void	fpga_modular_init						();
-
-void	fpga_modular_add						(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s);
-void	fpga_modular_sub						(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d);
-void	fpga_modular_mul						(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p);
-void	fpga_modular_inv						(FPGA_BUFFER *a, FPGA_BUFFER *a1);
-
-void	fpga_modular_mul_helper_multiply		(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si);
-void	fpga_modular_mul_helper_accumulate		(FPGA_WORD_EXTENDED *si, FPGA_WORD *c);
-void	fpga_modular_mul_helper_reduce_p256		(FPGA_WORD *c, FPGA_BUFFER *p);
-void	fpga_modular_mul_helper_reduce_p384		(FPGA_WORD *c, FPGA_BUFFER *p);
-
-void	fpga_modular_inv_helper_shl				(FPGA_WORD *x, FPGA_WORD *y);
-void	fpga_modular_inv_helper_shr				(FPGA_WORD *x, FPGA_WORD *y);
-void	fpga_modular_inv_helper_add				(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s);
-void	fpga_modular_inv_helper_sub				(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d);
-void	fpga_modular_inv_helper_cpy				(FPGA_WORD *dst, FPGA_WORD *src);
-void	fpga_modular_inv_helper_cmp				(FPGA_WORD *a, FPGA_WORD *b, int *c);
-
-
-//------------------------------------------------------------------------------
-// Reduction Routine Selection
-//------------------------------------------------------------------------------
-
-#if USE_CURVE == 1
-#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p256
-#elif USE_CURVE == 2
-#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p384
-#else
-#error USE_CURVE must be either 1 or 2!
-#endif
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+//
+// fpga_ecdsa_multiword.cpp
+// -----------------------------------------
+// Models of multi-precision FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "ecdsa_fpga_model.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ECDSA_ZERO;
+FPGA_BUFFER ECDSA_ONE;
+
+
+//------------------------------------------------------------------------------
+void fpga_multiword_init()
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        /* fill buffers for large multi-word integers */
+    for (   w = FPGA_OPERAND_NUM_WORDS - 1;
+            w >= 0;
+            w -= 1)
+    {
+        ECDSA_ZERO.words[w] = 0;            // all words are zero
+        ECDSA_ONE.words[w]  = w ? 0 : 1;    // only the lowest word is 1
+    }
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_multiword_copy(const FPGA_BUFFER *src, FPGA_BUFFER *dst)
+//------------------------------------------------------------------------------
+//
+// Copies large multi-word integer from src into dst.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // copy all the words from src into dst
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+        dst->words[w] = src->words[w];
+}
+
+
+//------------------------------------------------------------------------------
+bool fpga_multiword_is_zero(const FPGA_BUFFER *x)
+//------------------------------------------------------------------------------
+//
+// Returns true when large multi-word integer x is zero.
+//
+//------------------------------------------------------------------------------
+{
+    int w;  // word counter
+
+        // try to find non-zero words
+    for (w=0; w<FPGA_OPERAND_NUM_WORDS; w++)
+        if (x->words[w]) return false;
+
+        // no non-zero words found
+    return true;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_util.cpp b/ecdsa_fpga_multiword.h
index 6f7a834..9190fd5 100644
--- a/fpga_util.cpp
+++ b/ecdsa_fpga_multiword.h
@@ -1,86 +1,91 @@
-//------------------------------------------------------------------------------
-//
-// fpga_util.cpp
-// ---------------------
-// Utility FPGA routines
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Headers
-//------------------------------------------------------------------------------
-#include <stdint.h>
-#include "ecdsa_model.h"
-#include "fpga_lowlevel.h"
-#include "fpga_util.h"
-
-
-//------------------------------------------------------------------------------
-bool fpga_buffer_is_zero(FPGA_BUFFER *x)
-//------------------------------------------------------------------------------
-//
-// Returns true when large multi-word integer x is zero.
-//
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-		// try to find non-zero words
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		if (x->words[w]) return false;
-
-		// no non-zero words found
-	return true;
-}
-
-
-//------------------------------------------------------------------------------
-void fpga_buffer_copy(FPGA_BUFFER *src, FPGA_BUFFER *dst)
-//------------------------------------------------------------------------------
-//
-// Copies large multi-word integer from src into dst.
-//
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-		// copy all the words from src into dst
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		dst->words[w] = src->words[w];
-}
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+//
+// fpga_ecdsa_multiword.h
+// -----------------------------------------
+// Models of multi-precision FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Model Parameters
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define FPGA_OPERAND_WIDTH  (256)   // largest supported operand width in bits
+
+#elif USE_CURVE == 2
+
+#define FPGA_OPERAND_WIDTH  (384)   // largest supported operand width in bits
+
+#else
+
+BAD_CURVE
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// FPGA Pipeline Settings
+//------------------------------------------------------------------------------
+#define FPGA_OPERAND_NUM_WORDS  (FPGA_OPERAND_WIDTH / FPGA_WORD_WIDTH)
+
+
+//------------------------------------------------------------------------------
+// Operand Data Type
+//------------------------------------------------------------------------------
+typedef struct FPGA_BUFFER
+{
+    FPGA_WORD words[FPGA_OPERAND_NUM_WORDS];
+}
+FPGA_BUFFER;
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ECDSA_ZERO;
+extern FPGA_BUFFER ECDSA_ONE;
+
+
+//------------------------------------------------------------------------------
+// Prototpes
+//------------------------------------------------------------------------------
+void fpga_multiword_init    ();
+void fpga_multiword_copy    (const FPGA_BUFFER *src, FPGA_BUFFER *dst);
+bool fpga_multiword_is_zero (const FPGA_BUFFER *x);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_microcode_parser.py b/ecdsa_microcode_parser.py
new file mode 100644
index 0000000..56aa94c
--- /dev/null
+++ b/ecdsa_microcode_parser.py
@@ -0,0 +1,694 @@
+#
+# ecdsa_microcode_parser.py
+#
+# Author: Pavel Shatov
+#
+# Copyright (c) 2018, NORDUnet A/S
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+#
+# * Neither the name of the NORDUnet A/S nor the
+#   names of its contributors may be used to endorse or promote products
+#   derived from this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Imports
+import re
+import sys
+from enum import Enum
+
+# Source File
+C_FILES = [ "ecdsa_fpga_curve_microcode.cpp",
+            "ecdsa_fpga_microcode.cpp"]
+
+class MICROCODE_PARSER:
+
+    # enumerate microcode pieces
+    class MICROCODE_PIECE_ENUM(Enum):
+    
+        NONE                    = -1
+        
+        PREPARE                 =  0
+        
+        CYCLE_DOUBLE            =  1
+        CYCLE_ADD               =  2
+        
+        CYCLE_ADD_AT_INFINITY   =  3
+        CYCLE_ADD_SAME_X_SAME_Y =  4
+        CYCLE_ADD_SAME_X        =  5
+        CYCLE_ADD_REGULAR       =  6
+        
+        CYCLE_K0                =  7
+        CYCLE_K1                =  8
+                
+        CONVERT                 =  9
+        
+        CONVERT_AT_INFINITY     = 10
+        CONVERT_REGULAR         = 11
+        
+        INVERT_P256             = 12
+        INVERT_P384             = 13
+
+
+    # magic pair of begin/end markers
+    MARKER_BEGIN_MICROCODE  = "BEGIN_MICROCODE:"
+    MARKER_END_MICROCODE    = "END_MICROCODE"
+
+    # names of banks
+    MICROCODE_C_NAME_BANK_LO    = "BANK_LO"
+    MICROCODE_C_NAME_BANK_HI    = "BANK_HI"
+
+    # micro-operation names
+    MICROCODE_C_NAME_UOP_CMPZ       = "uop_cmpz"
+    MICROCODE_C_NAME_UOP_MOVE       = "uop_move"
+    MICROCODE_C_NAME_UOP_CALC       = "uop_calc"
+    MICROCODE_C_NAME_UOP_CYCLE      = "uop_cycle"
+    MICROCODE_C_NAME_UOP_REPEAT     = "uop_repeat"
+    MICROCODE_C_NAME_UOP_CALC_EVEN  = "uop_calc_if_even"
+    MICROCODE_C_NAME_UOP_CALC_ODD   = "uop_calc_if_odd"
+
+    # calculate micro-operations
+    MICROCODE_C_NAME_UOP_MATH_MUL   = "MUL"
+    MICROCODE_C_NAME_UOP_MATH_ADD   = "ADD"
+    MICROCODE_C_NAME_UOP_MATH_SUB   = "SUB"
+    
+    # names of banks in C source
+    MICROCODE_C_NAMES_BANKS = [MICROCODE_C_NAME_BANK_LO, MICROCODE_C_NAME_BANK_HI]
+
+    # names of operands in C source
+    MICROCODE_C_NAMES_OPERANDS = [  "CONST_ZERO",  "CONST_ONE",
+                                    "CONST_DELTA",
+                                    "CONST_GX",    "CONST_GY",
+                                    "CONST_HX",    "CONST_HY",
+                                    "INVERT_R1",   "INVERT_R2",
+                                    "INVERT_X2",   "INVERT_X3",  "INVERT_X6",
+                                    "INVERT_X12",  "INVERT_X15", "INVERT_X30",
+                                    "INVERT_X32",  "INVERT_X60", "INVERT_X120",
+                                    "INVERT_A2",   "INVERT_A3",
+                                    "CYCLE_RX",    "CYCLE_RY",   "CYCLE_RZ",
+                                    "CYCLE_SX",    "CYCLE_SY",   "CYCLE_SZ",
+                                    "CYCLE_A",     "CYCLE_A2",   "CYCLE_B",
+                                    "CYCLE_C",     "CYCLE_C2",   "CYCLE_C2_2",
+                                    "CYCLE_D",     "CYCLE_E",    "CYCLE_F",
+                                    "CYCLE_G",     "CYCLE_H",    "CYCLE_J",
+                                    "CYCLE_Z2",
+                                    "CYCLE_T1",    "CYCLE_T2",
+                                    "CYCLE_T3",    "CYCLE_T4"]
+
+    # names of banks in Verilog source
+    MICROCODE_V_NAME_BANKS_DUMMY = "UOP_BANKS_DUMMY"
+    
+    MICROCODE_V_NAMES_BANKS = ["UOP_BANKS_LO2HI", "UOP_BANKS_HI2LO"]
+
+    # names of operands in Verilog source
+    MICROCODE_V_NAMES_OPERANDS = ["UOP_OPERAND_" + op for op in MICROCODE_C_NAMES_OPERANDS]
+
+    # names of opcodes in Verilog source
+    MICROCODE_V_NAME_OPCODE_CMPZ        = "UOP_OPCODE_CMPZ"
+    MICROCODE_V_NAME_OPCODE_MOVE        = "UOP_OPCODE_COPY"
+    MICROCODE_V_NAME_OPCODE_MUL         = "UOP_OPCODE_MUL"
+    MICROCODE_V_NAME_OPCODE_ADD         = "UOP_OPCODE_ADD"
+    MICROCODE_V_NAME_OPCODE_SUB         = "UOP_OPCODE_SUB"
+    MICROCODE_V_NAME_OPCODE_STOP        = "UOP_OPCODE_STOP"
+    
+    MICROCODE_V_NAME_OPERAND_DONTCARE   = "UOP_OPERAND_DONTCARE"
+
+    # match group to catch operand names
+    MATCH_GROUP_09AZ        = "([0-9A-Z_]+)"
+    
+    # match group to catch bank names
+    MATCH_GROUP_BANK        = "(" + MICROCODE_C_NAME_BANK_LO + "|" + MICROCODE_C_NAME_BANK_HI + ")"
+
+    # match group to catch number of loop iterations
+    MATCH_GROUP_NUMBER      = "(\d+)"
+    
+    # match group to catch math instruction
+    MATCH_GROUP_MATH        = "(" + MICROCODE_C_NAME_UOP_MATH_MUL + "|" + MICROCODE_C_NAME_UOP_MATH_ADD + "|" + MICROCODE_C_NAME_UOP_MATH_SUB + ")"
+        
+    # match group to catch calculation micro-operation
+    MATCH_GROUP_CALC        = "(" + MICROCODE_C_NAME_UOP_CALC + "|" + MICROCODE_C_NAME_UOP_CALC_EVEN + "|" + MICROCODE_C_NAME_UOP_CALC_ODD + ")"
+    
+    # map string microcode piece names to enum values
+    MICROCODE_PIECE_DICT    = {     "PREPARE":                 MICROCODE_PIECE_ENUM.PREPARE,
+                                    
+                                    "CYCLE_DOUBLE":            MICROCODE_PIECE_ENUM.CYCLE_DOUBLE,
+                                    "CYCLE_ADD":               MICROCODE_PIECE_ENUM.CYCLE_ADD,
+                                    
+                                    "CYCLE_ADD_AT_INFINITY":   MICROCODE_PIECE_ENUM.CYCLE_ADD_AT_INFINITY,
+                                    "CYCLE_ADD_SAME_X_SAME_Y": MICROCODE_PIECE_ENUM.CYCLE_ADD_SAME_X_SAME_Y,
+                                    "CYCLE_ADD_SAME_X":        MICROCODE_PIECE_ENUM.CYCLE_ADD_SAME_X,
+                                    "CYCLE_ADD_REGULAR":       MICROCODE_PIECE_ENUM.CYCLE_ADD_REGULAR,
+                                    
+                                    "CYCLE_K0":                MICROCODE_PIECE_ENUM.CYCLE_K0,
+                                    "CYCLE_K1":                MICROCODE_PIECE_ENUM.CYCLE_K1,
+                                    
+                                    "INVERT_P256":             MICROCODE_PIECE_ENUM.INVERT_P256,
+                                    "INVERT_P384":             MICROCODE_PIECE_ENUM.INVERT_P384,
+                                    
+                                    "CONVERT":                 MICROCODE_PIECE_ENUM.CONVERT,
+                                    
+                                    "CONVERT_AT_INFINITY":     MICROCODE_PIECE_ENUM.CONVERT_AT_INFINITY,
+                                    "CONVERT_REGULAR":         MICROCODE_PIECE_ENUM.CONVERT_REGULAR}
+                                    
+                                    
+    # map C bank names to Verilog bank names
+    MICROCODE_BANK_DICT     = dict(zip(MICROCODE_C_NAMES_BANKS,    MICROCODE_V_NAMES_BANKS))
+
+    # map C operand names to Verilog operand names
+    MICROCODE_OPERAND_DICT  = dict(zip(MICROCODE_C_NAMES_OPERANDS, MICROCODE_V_NAMES_OPERANDS))
+
+    # map C calculation names to Verilog opcode names
+    MICROCODE_MATH_DICT = { MICROCODE_C_NAME_UOP_MATH_MUL:  MICROCODE_V_NAME_OPCODE_MUL,
+                            MICROCODE_C_NAME_UOP_MATH_ADD:  MICROCODE_V_NAME_OPCODE_ADD,
+                            MICROCODE_C_NAME_UOP_MATH_SUB:  MICROCODE_V_NAME_OPCODE_SUB}
+
+    # microcode format
+    MICROCODE_FORMAT_ADDR = "9'd%03d"
+    MICROCODE_FORMAT_LINE = MICROCODE_FORMAT_ADDR + ": data <= %s;"
+    MICROCODE_FORMAT_OFFSET = "localparam [UOP_ADDR_WIDTH-1:0] %s = " + MICROCODE_FORMAT_ADDR + ";"
+                            
+    # pieces of microcode
+    MICROCODE_LINES_PREPARE                 = []
+    
+    MICROCODE_LINES_CYCLE_DOUBLE            = []
+    MICROCODE_LINES_CYCLE_ADD               = []
+    
+    MICROCODE_LINES_CYCLE_ADD_AT_INFINITY   = []
+    MICROCODE_LINES_CYCLE_ADD_SAME_X_SAME_Y = []
+    MICROCODE_LINES_CYCLE_ADD_SAME_X        = []
+    MICROCODE_LINES_CYCLE_ADD_REGULAR       = []
+    
+    MICROCODE_LINES_CYCLE_K0                = []
+    MICROCODE_LINES_CYCLE_K1                = []
+    
+    MICROCODE_LINES_INVERT_P256             = []
+    MICROCODE_LINES_INVERT_P384             = []
+    
+    MICROCODE_LINES_CONVERT                 = []
+    
+    MICROCODE_LINES_CONVERT_AT_INFINITY     = []
+    MICROCODE_LINES_CONVERT_REGULAR         = []
+    
+    MICROCODE_LINE_STOP = "{%s, %s, %s, %s, %s}" % (    MICROCODE_V_NAME_OPCODE_STOP,
+                                                        MICROCODE_V_NAME_BANKS_DUMMY,
+                                                        MICROCODE_V_NAME_OPERAND_DONTCARE,
+                                                        MICROCODE_V_NAME_OPERAND_DONTCARE,
+                                                        MICROCODE_V_NAME_OPERAND_DONTCARE)
+    
+    def __init__(self, filenames):
+        self.__filenames = filenames
+    
+    def parse(self):
+        for next_filename in self.__filenames:
+            print("Parsing '%s'..." % (next_filename))
+            parsing_now = False
+            line_num = 0
+            with open(next_filename, "r") as c_file:
+                c_lines = c_file.readlines()
+                for next_c_line in c_lines:
+                    line_num += 1
+                    self.__line_num = line_num
+                    self.__next_c_line = next_c_line.strip()
+                    if len(self.__next_c_line) == 0: continue
+                    if self.__next_c_line.startswith("//"): continue
+                    if not parsing_now:
+                        self.__current_piece = self.__try_start_parsing()
+                        if self.__current_piece != self.MICROCODE_PIECE_ENUM.NONE:
+                            print("    Found piece of microcode: %s" % (str(self.__current_piece)))
+                            parsing_now = True
+                    else:
+                        parsing_now = self.__continue_parsing()
+        
+                    
+    def format(self):
+
+        mode = 0
+    
+        if len(sys.argv) == 2:
+            if sys.argv[1] == "P256": mode = 1
+            if sys.argv[1] == "P384": mode = 2
+    
+        if mode == 0:
+            sys.exit("sys.exit(): Usage: ecdsa_microcode_parser.py <P256|P384>!")
+    
+        if len(self.MICROCODE_LINES_PREPARE)                 == 0: sys.exit("sys.exit(): Empty PREPARE piece!")
+        
+        if len(self.MICROCODE_LINES_CYCLE_DOUBLE)            == 0: sys.exit("sys.exit(): Empty CYCLE_DOUBLE piece!")
+        if len(self.MICROCODE_LINES_CYCLE_ADD)               == 0: sys.exit("sys.exit(): Empty CYCLE_ADD piece!")
+        
+        if len(self.MICROCODE_LINES_CYCLE_ADD_AT_INFINITY)   == 0: sys.exit("sys.exit(): Empty CYCLE_ADD_AT_INFINITY piece!")
+        if len(self.MICROCODE_LINES_CYCLE_ADD_SAME_X_SAME_Y) == 0: sys.exit("sys.exit(): Empty CYCLE_ADD_SAME_X_SAME_Y piece!")
+        if len(self.MICROCODE_LINES_CYCLE_ADD_SAME_X)        == 0: sys.exit("sys.exit(): Empty CYCLE_ADD_SAME_X piece!")
+        if len(self.MICROCODE_LINES_CYCLE_ADD_REGULAR)       == 0: sys.exit("sys.exit(): Empty CYCLE_ADD_REGULAR piece!")
+        
+        if len(self.MICROCODE_LINES_CYCLE_K0)                == 0: sys.exit("sys.exit(): Empty CYCLE_K0 piece!")
+        if len(self.MICROCODE_LINES_CYCLE_K1)                == 0: sys.exit("sys.exit(): Empty CYCLE_K1 piece!")
+        
+        if len(self.MICROCODE_LINES_INVERT_P256)             == 0: sys.exit("sys.exit(): Empty INVERT_P256 piece!")
+        if len(self.MICROCODE_LINES_INVERT_P384)             == 0: sys.exit("sys.exit(): Empty INVERT_P384 piece!")
+        
+        if len(self.MICROCODE_LINES_CONVERT)                 == 0: sys.exit("sys.exit(): Empty CONVERT piece!")
+        
+        if len(self.MICROCODE_LINES_CONVERT_AT_INFINITY)     == 0: sys.exit("sys.exit(): Empty CONVERT_AT_INFINITY piece!")
+        if len(self.MICROCODE_LINES_CONVERT_REGULAR)         == 0: sys.exit("sys.exit(): Empty CONVERT_REGULAR piece!")
+        
+        length = 0
+        length += len(self.MICROCODE_LINES_PREPARE)
+
+        length += len(self.MICROCODE_LINES_CYCLE_DOUBLE)
+        length += len(self.MICROCODE_LINES_CYCLE_ADD)
+
+        length += len(self.MICROCODE_LINES_CYCLE_ADD_AT_INFINITY)
+        length += len(self.MICROCODE_LINES_CYCLE_ADD_SAME_X_SAME_Y)
+        length += len(self.MICROCODE_LINES_CYCLE_ADD_SAME_X)
+        length += len(self.MICROCODE_LINES_CYCLE_ADD_REGULAR)
+
+        length += len(self.MICROCODE_LINES_CYCLE_K0)
+        length += len(self.MICROCODE_LINES_CYCLE_K1)
+
+        length += len(self.MICROCODE_LINES_CONVERT)
+
+        length += len(self.MICROCODE_LINES_CONVERT_AT_INFINITY)
+        length += len(self.MICROCODE_LINES_CONVERT_REGULAR)
+        
+        if mode == 1: length += len(self.MICROCODE_LINES_INVERT_P256)
+        if mode == 2: length += len(self.MICROCODE_LINES_INVERT_P384)
+
+        print("Total number of micro-operations (w/o stops): %s" % (length))
+        
+        
+        self.__addr = 0
+        
+        print("\n -=-=-=-=-=- CUT AND PASTE BELOW -=-=-=-=-=-\n")
+        
+        num_mul_cycle = 0
+        num_mul_invert_p256 = 0
+        num_mul_invert_p384 = 0
+        
+        offset_prepare = self.__addr;
+        print("// PREPARE");
+        for line in self.MICROCODE_LINES_PREPARE:
+            self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_cycle_double = self.__addr;
+        print("// CYCLE_DOUBLE");
+        for line in self.MICROCODE_LINES_CYCLE_DOUBLE:
+            num_mul_cycle += self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_cycle_add = self.__addr;
+        print("// CYCLE_ADD");
+        for line in self.MICROCODE_LINES_CYCLE_ADD:
+            num_mul_cycle += self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_cycle_add_at_infinity = self.__addr;
+        print("// CYCLE_ADD_AT_INFINITY");
+        for line in self.MICROCODE_LINES_CYCLE_ADD_AT_INFINITY:
+            self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+        
+        offset_cycle_add_same_x_same_y = self.__addr;
+        print("// CYCLE_ADD_SAME_X_SAME_Y");
+        for line in self.MICROCODE_LINES_CYCLE_ADD_SAME_X_SAME_Y:
+            self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+        
+        offset_cycle_add_same_x = self.__addr;
+        print("// CYCLE_ADD_SAME_X");
+        for line in self.MICROCODE_LINES_CYCLE_ADD_SAME_X:
+            self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_cycle_add_regular = self.__addr;
+        print("// CYCLE_ADD_REGULAR");
+        for line in self.MICROCODE_LINES_CYCLE_ADD_REGULAR:
+            num_mul_cycle += self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_cycle_k0 = self.__addr;
+        print("// CYCLE_K0");
+        for line in self.MICROCODE_LINES_CYCLE_K0:
+            num_mul_cycle += self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_cycle_k1 = self.__addr;
+        print("// CYCLE_K1");
+        for line in self.MICROCODE_LINES_CYCLE_K1:
+            self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_convert = self.__addr;
+        print("// CONVERT");
+        for line in self.MICROCODE_LINES_CONVERT:
+            num_mul_cycle += self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_convert_at_infinity = self.__addr;
+        print("// CONVERT_AT_INFINITY");
+        for line in self.MICROCODE_LINES_CONVERT_AT_INFINITY:
+            self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        offset_convert_regular = self.__addr;
+        print("// CONVERT_REGULAR");
+        for line in self.MICROCODE_LINES_CONVERT_REGULAR:
+            num_mul_cycle += self.__format_line(line)
+        self.__format_line(self.MICROCODE_LINE_STOP)
+
+        if mode == 1:
+            offset_invert_p256 = self.__addr;
+            print("// INVERT_P256");
+            for line in self.MICROCODE_LINES_INVERT_P256:
+                num_mul_invert_p256 += self.__format_line(line)
+            self.__format_line(self.MICROCODE_LINE_STOP)
+
+        if mode == 2:
+            offset_invert_p384 = self.__addr;
+            print("// INVERT_P384");
+            for line in self.MICROCODE_LINES_INVERT_P384:
+                num_mul_invert_p384 += self.__format_line(line)
+            self.__format_line(self.MICROCODE_LINE_STOP)
+
+        print("\n")
+        self.__format_offset("UOP_OFFSET_PREPARE                ", offset_prepare)
+        self.__format_offset("UOP_OFFSET_CYCLE_DOUBLE           ", offset_cycle_double)
+        self.__format_offset("UOP_OFFSET_CYCLE_ADD              ", offset_cycle_add)
+        self.__format_offset("UOP_OFFSET_CYCLE_ADD_AT_INFINITY  ", offset_cycle_add_at_infinity)
+        self.__format_offset("UOP_OFFSET_CYCLE_ADD_SAME_X_SAME_Y", offset_cycle_add_same_x_same_y)
+        self.__format_offset("UOP_OFFSET_CYCLE_ADD_SAME_X       ", offset_cycle_add_same_x)
+        self.__format_offset("UOP_OFFSET_CYCLE_ADD_REGULAR      ", offset_cycle_add_regular)
+        self.__format_offset("UOP_OFFSET_CYCLE_K0               ", offset_cycle_k0)
+        self.__format_offset("UOP_OFFSET_CYCLE_K1               ", offset_cycle_k1)
+        self.__format_offset("UOP_OFFSET_CONVERT                ", offset_convert)
+        self.__format_offset("UOP_OFFSET_CONVERT_AT_INFINITY    ", offset_convert_at_infinity)
+        self.__format_offset("UOP_OFFSET_CONVERT_REGULAR        ", offset_convert_regular)
+        if mode == 1: self.__format_offset("UOP_OFFSET_INVERT_P256            ", offset_invert_p256)
+        if mode == 2: self.__format_offset("UOP_OFFSET_INVERT_P384            ", offset_invert_p384)
+        
+        print("")
+        print("num_mul_cycle = %d" % num_mul_cycle)
+        print("num_mul_invert_p256 = %d" % num_mul_invert_p256)
+        print("num_mul_invert_p384 = %d" % num_mul_invert_p384)
+                
+    def __format_line(self, line):
+        line = self.MICROCODE_FORMAT_LINE % (self.__addr, line) 
+        print(line)
+        self.__addr += 1
+        is_mul = line.find(self.MICROCODE_V_NAME_OPCODE_MUL)
+        if is_mul >= 0: return 1
+        return 0
+    
+    def __format_offset(self, name, offset):
+        print(self.MICROCODE_FORMAT_OFFSET % (name, offset))
+    
+    def __try_start_parsing(self):
+        piece = self.MICROCODE_PIECE_ENUM.NONE
+        begin_marker = re.match("^/\*\s*" + self.MARKER_BEGIN_MICROCODE + "\s*" +
+                                            self.MATCH_GROUP_09AZ + "\s*\*/$",
+                                            self.__next_c_line);
+        if begin_marker:
+            piece = self.MICROCODE_PIECE_DICT[begin_marker.group(1)]
+            
+        return piece
+    
+    def __encode_uop(self, opcode, banks, src1, src2, dst):
+        return "{%s, %s, %s, %s, %s}" % (opcode, banks, src1, src2, dst)
+    
+    def __continue_parsing(self):
+    
+        end_marker = re.match("^/\*\s*" +   self.MARKER_END_MICROCODE + "\s*\*/$",
+                                            self.__next_c_line);
+        if end_marker: return False
+        
+        # cmpz?
+        uop_cmpz = re.match("^" +   self.MICROCODE_C_NAME_UOP_CMPZ + "\(\s*" +
+                                    self.MATCH_GROUP_BANK          + "\,\s*" +
+                                    self.MATCH_GROUP_09AZ          + "\)\;$",
+                                    self.__next_c_line, re.IGNORECASE)
+        if uop_cmpz:
+            ok = self.__parse_uop_cmpz(uop_cmpz)
+            if ok: return True
+            else:
+                self.__print_parse_error("parse_uop_cmpz() failed!")
+                self.__abort()
+                
+        # move?
+        uop_move = re.match("^" +   self.MICROCODE_C_NAME_UOP_MOVE + "\(\s*" +
+                                    self.MATCH_GROUP_BANK          + "\,\s*" +
+                                    self.MATCH_GROUP_09AZ          + "\,\s*" +
+                                    self.MATCH_GROUP_BANK          + "\,\s*" +
+                                    self.MATCH_GROUP_09AZ          + "\)\;$",
+                                    self.__next_c_line, re.IGNORECASE)
+        if uop_move:
+            ok = self.__parse_uop_move(uop_move)
+            if ok: return True
+            else:
+                self.__print_parse_error("parse_uop_move() failed!")
+                self.__abort()
+        
+        # calc?
+        uop_calc = re.match("^" +   self.MATCH_GROUP_CALC           + "\s*\(" +
+                                    self.MATCH_GROUP_MATH           + "\,\s*" +
+                                    self.MATCH_GROUP_BANK           + "\,\s*" +
+                                    self.MATCH_GROUP_09AZ           + "\,\s*" +
+                                    self.MATCH_GROUP_09AZ           + "\,\s*" +
+                                    self.MATCH_GROUP_BANK           + "\,\s*" +
+                                    self.MATCH_GROUP_09AZ           + "\)\;$",
+                                    self.__next_c_line, re.IGNORECASE)
+        if uop_calc:
+            ok = self.__parse_uop_calc(uop_calc)
+            if ok: return True
+            else:
+                self.__print_parse_error("parse_uop_calc() failed!")
+                self.__abort()
+        
+        # cycle?
+        uop_cycle = re.match("^" +  self.MICROCODE_C_NAME_UOP_CYCLE + "\(" +
+                                    self.MATCH_GROUP_NUMBER         + "\)\;$",
+                                    self.__next_c_line, re.IGNORECASE)
+        if uop_cycle:
+            ok = self.__parse_uop_cycle(uop_cycle)
+            if ok: return True
+            else:
+                self.__print_parse_error("parse_uop_cycle() failed!")
+                self.__abort()
+
+        # repeat?
+        uop_repeat = re.match("^" + self.MICROCODE_C_NAME_UOP_REPEAT + "\(" + "\)\;$",
+                                    self.__next_c_line, re.IGNORECASE)
+        if uop_repeat:
+            ok = self.__parse_uop_repeat()
+            if ok: return True
+            else:
+                self.__print_parse_error("__parse_uop_repeat() failed!")
+                self.__abort()
+                
+        self.__print_parse_error("unknown micro-operation!")
+        self.__abort()
+
+    def __check_math(self, math):
+        if not math in self.MICROCODE_MATH_DICT:
+            print_parse_error("bad math!")
+            return False
+            
+        return True
+
+    def __check_banks(self, src, dst):
+        if not src in self.MICROCODE_BANK_DICT:
+            print_parse_error("bad src bank!")
+            return False
+
+        if not dst in self.MICROCODE_BANK_DICT:
+            print_parse_error("bad dst bank!")
+            return False
+
+        if src == dst:
+            print_parse_error("src bank == dst bank!")
+            return False
+            
+        return True
+        
+    def __check_bank(self, src):
+        if not src in self.MICROCODE_BANK_DICT:
+            print_parse_error("bad src bank!")
+            return False
+            
+        return True        
+
+    def __check_op3(self, src1, src2, dst):
+
+        if not src1 in self.MICROCODE_OPERAND_DICT:
+            self.__print_parse_error("bad src1 operand!")
+            return False
+
+        if src2 != "" and not src2 in self.MICROCODE_OPERAND_DICT:
+            self.__print_parse_error("bad src2 operand!")
+            return False
+
+        if dst != "" and not dst in self.MICROCODE_OPERAND_DICT:
+            self.__print_parse_error("bad dst operand!")
+            return False
+            
+        return True
+
+    def __check_op2(self, src, dst):
+        return self.__check_op3(src, "", dst)
+    
+    def __check_op1(self, src):
+        return self.__check_op2(src, "")
+    
+    def __parse_uop_move(self, params):
+        bank_src = params.group(1)
+        bank_dst = params.group(3)
+        op_src   = params.group(2)
+        op_dst   = params.group(4)
+    
+        if not self.__check_banks(bank_src, bank_dst):
+            self.__print_parse_error("check_banks() failed!")
+            return False
+            
+        if not self.__check_op2(op_src, op_dst):
+            self.__print_parse_error("check_op2() failed!")
+            return False
+    
+        opcode = self.MICROCODE_V_NAME_OPCODE_MOVE
+        bank = self.MICROCODE_BANK_DICT[bank_src]
+        src1 = self.MICROCODE_OPERAND_DICT[op_src]
+        src2 = self.MICROCODE_V_NAME_OPERAND_DONTCARE
+        dst = self.MICROCODE_OPERAND_DICT[op_dst]
+    
+        data = self.__encode_uop(opcode, bank, src1, src2, dst)
+        self.__store_uop(data)
+        return True
+        
+    def __parse_uop_cmpz(self, params):
+        bank_src = params.group(1)
+        op_src   = params.group(2)
+    
+        if not self.__check_bank(bank_src):
+            self.__print_parse_error("check_bank() failed!")
+            return False
+            
+        if not self.__check_op1(op_src):
+            self.__print_parse_error("check_op1() failed!")
+            return False
+    
+        opcode = self.MICROCODE_V_NAME_OPCODE_CMPZ
+        bank = self.MICROCODE_BANK_DICT[bank_src]
+        src1 = self.MICROCODE_OPERAND_DICT[op_src]
+        src2 = self.MICROCODE_V_NAME_OPERAND_DONTCARE
+        dst = self.MICROCODE_V_NAME_OPERAND_DONTCARE
+    
+        data = self.__encode_uop(opcode, bank, src1, src2, dst)
+        self.__store_uop(data)
+        return True        
+
+    def __parse_uop_calc(self, params):
+        calc     = params.group(1)
+        math     = params.group(2)
+        bank_src = params.group(3)
+        bank_dst = params.group(6)
+        op_src1  = params.group(4)
+        op_src2  = params.group(5)
+        op_dst   = params.group(7)
+    
+        if not self.__check_math(math):
+            self.__print_parse_error("check_calc() failed!")
+            return False
+    
+        if not self.__check_banks(bank_src, bank_dst):
+            self.__print_parse_error("check_banks() failed!")
+            return False
+            
+        if not self.__check_op3(op_src1, op_src2, op_dst):
+            self.__print_parse_error("check_op3() failed!")
+            return False
+    
+        opcode = self.MICROCODE_MATH_DICT[math]
+        banks = self.MICROCODE_BANK_DICT[bank_src]
+        src1 = self.MICROCODE_OPERAND_DICT[op_src1]
+        src2 = self.MICROCODE_OPERAND_DICT[op_src2]
+        dst = self.MICROCODE_OPERAND_DICT[op_dst]
+
+        data = self.__encode_uop(opcode, banks, src1, src2, dst)
+        if   calc == self.MICROCODE_C_NAME_UOP_CALC:        self.__store_uop(data)
+        elif calc == self.MICROCODE_C_NAME_UOP_CALC_EVEN:   self.__loop_even = data
+        elif calc == self.MICROCODE_C_NAME_UOP_CALC_ODD:    self.__loop_odd = data
+        else: return False
+        
+        return True
+
+    def __parse_uop_cycle(self, params):
+        self.__loop_iters = int(params.group(1))
+        return True
+
+    def __parse_uop_repeat(self):
+        print("        Unrolling loop (%d iters)..." % (self.__loop_iters))
+        
+        for i in range(0, self.__loop_iters):
+            if i % 2 == 0:  self.__store_uop(self.__loop_even)
+            else:           self.__store_uop(self.__loop_odd)
+        
+        return True
+        
+    def __store_uop(self, data):
+        #print("\t" + data)
+        if   self.__current_piece == self.MICROCODE_PIECE_ENUM.PREPARE:                 self.MICROCODE_LINES_PREPARE.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_DOUBLE:            self.MICROCODE_LINES_CYCLE_DOUBLE.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_ADD:               self.MICROCODE_LINES_CYCLE_ADD.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_ADD_AT_INFINITY:   self.MICROCODE_LINES_CYCLE_ADD_AT_INFINITY.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_ADD_SAME_X_SAME_Y: self.MICROCODE_LINES_CYCLE_ADD_SAME_X_SAME_Y.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_ADD_SAME_X:        self.MICROCODE_LINES_CYCLE_ADD_SAME_X.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_ADD_REGULAR:       self.MICROCODE_LINES_CYCLE_ADD_REGULAR.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_K0:                self.MICROCODE_LINES_CYCLE_K0.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CYCLE_K1:                self.MICROCODE_LINES_CYCLE_K1.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.INVERT_P256:             self.MICROCODE_LINES_INVERT_P256.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.INVERT_P384:             self.MICROCODE_LINES_INVERT_P384.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CONVERT:                 self.MICROCODE_LINES_CONVERT.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CONVERT_AT_INFINITY:     self.MICROCODE_LINES_CONVERT_AT_INFINITY.append(data)
+        elif self.__current_piece == self.MICROCODE_PIECE_ENUM.CONVERT_REGULAR:         self.MICROCODE_LINES_CONVERT_REGULAR.append(data)
+
+    def __print_parse_error(self, msg):
+        print("PARSE ERROR: %s" % (msg))
+  
+    def __abort(self):
+        sys.exit("Stopped at line #%d:\n%s" % (self.__line_num, self.__next_c_line))
+
+
+# -----------------------------------------------------------------------------
+def main(filenames):
+# -----------------------------------------------------------------------------
+    parser = MICROCODE_PARSER(filenames)
+    parser.parse()
+    parser.format()
+    
+# -----------------------------------------------------------------------------    
+if __name__ == "__main__":
+# -----------------------------------------------------------------------------
+    main(C_FILES)
+    
+#
+# End-of-File
+#
diff --git a/ecdsa_model.h b/ecdsa_model.h
deleted file mode 100644
index fc7e571..0000000
--- a/ecdsa_model.h
+++ /dev/null
@@ -1,205 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// ecdsa_model.h
-// --------------------------------------------
-// Base point scalar multiplier model for ECDSA
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-//
-// Curve Selection
-//
-// USE_CURVE == 1  -> P-256
-// USE_CURVE == 2  -> P-384
-//
-//------------------------------------------------------------------------------
-#define USE_CURVE	2
-
-
-//------------------------------------------------------------------------------
-// Model Parameters
-//------------------------------------------------------------------------------
-#if USE_CURVE == 1
-#define OPERAND_WIDTH	(256)	// largest supported operand width in bits
-#elif USE_CURVE == 2
-#define OPERAND_WIDTH	(384)	// largest supported operand width in bits
-#else
-#error USE_CURVE must be either 1 or 2!
-#endif
-
-
-//------------------------------------------------------------------------------
-// P-256 Parameters and Test Vectors
-//------------------------------------------------------------------------------
-
-/* Field Size */
-#define P_256_Q			{0xffffffff, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}
-
-/* Generic Numbers */
-#define P_256_ZERO		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
-#define P_256_ONE		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001}
-
-/* Division Factor  */
-#define P_256_DELTA		{0x7fffffff, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x80000000, 0x00000000, 0x00000000}
-
-/* Base Point */
-#define P_256_G_X		{0x6b17d1f2, 0xe12c4247, 0xf8bce6e5, 0x63a440f2, 0x77037d81, 0x2deb33a0, 0xf4a13945, 0xd898c296}
-#define P_256_G_Y		{0x4fe342e2, 0xfe1a7f9b, 0x8ee7eb4a, 0x7c0f9e16, 0x2bce3357, 0x6b315ece, 0xcbb64068, 0x37bf51f5}
-
-/* Doubled Base Point */
-#define P_256_H_X		{0x29d05c19, 0x3da77b71, 0x0e863235, 0x38b77e1b, 0x11f904fe, 0xa42998be, 0x16bd8d74, 0x4ece7ad0}
-#define P_256_H_Y		{0xb01cbd1c, 0x01e58065, 0x711814b5, 0x83f061e9, 0xd431cca9, 0x94cea131, 0x3449bf97, 0xc840ae07}
-
-/* Base Point Order */
-#define P_256_N			{0xffffffff, 0x00000000, 0xffffffff, 0xffffffff, 0xbce6faad, 0xa7179e84, 0xf3b9cac2, 0xfc632551}
-
-/* Private Key */
-#define P_256_D			{0x70a12c2d, 0xb16845ed, 0x56ff68cf, 0xc21a472b, 0x3f04d7d6, 0x851bf634, 0x9f2d7d5b, 0x3452b38a}
-
-/* Per-message Random Number */
-#define P_256_K			{0x580ec00d, 0x85643433, 0x4cef3f71, 0xecaed496, 0x5b12ae37, 0xfa47055b, 0x1965c7b1, 0x34ee45d0}
-
-/* Public Key */
-#define P_256_Q_X		{0x8101ece4, 0x7464a6ea, 0xd70cf69a, 0x6e2bd3d8, 0x8691a326, 0x2d22cba4, 0xf7635eaf, 0xf26680a8}
-#define P_256_Q_Y		{0xd8a12ba6, 0x1d599235, 0xf67d9cb4, 0xd58f1783, 0xd3ca43e7, 0x8f0a5aba, 0xa6240799, 0x36c0c3a9}
-
-/* Part of Signature */
-#define P_256_R_X		{0x7214bc96, 0x47160bbd, 0x39ff2f80, 0x533f5dc6, 0xddd70ddf, 0x86bb8156, 0x61e805d5, 0xd4e6f27c}
-#define P_256_R_Y		{0x8b81e3e9, 0x77597110, 0xc7cf2633, 0x435b2294, 0xb7264298, 0x7defd3d4, 0x007e1cfc, 0x5df84541}
-
-
-//------------------------------------------------------------------------------
-// P-384 Parameters and Test Vectors
-//------------------------------------------------------------------------------
-
-/* Field Size */
-#define P_384_Q			{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff}
-
-/* Generic Numbers */
-#define P_384_ZERO		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
-#define P_384_ONE		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001}
-
-/* Division Factor  */
-#define P_384_DELTA		{0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff, 0x80000000, 0x00000000, 0x80000000}
-
-/* Base Point */
-#define P_384_G_X		{0xaa87ca22, 0xbe8b0537, 0x8eb1c71e, 0xf320ad74, 0x6e1d3b62, 0x8ba79b98, 0x59f741e0, 0x82542a38, 0x5502f25d, 0xbf55296c, 0x3a545e38, 0x72760ab7}
-#define P_384_G_Y		{0x3617de4a, 0x96262c6f, 0x5d9e98bf, 0x9292dc29, 0xf8f41dbd, 0x289a147c, 0xe9da3113, 0xb5f0b8c0, 0x0a60b1ce, 0x1d7e819d, 0x7a431d7c, 0x90ea0e5f}
-
-/* Doubled Base Point */
-#define P_384_H_X		{0xaaf06bba, 0x82e9f590, 0xe29c71c2, 0x19bea517, 0x23c5893a, 0xe8b0c8cf, 0x4c117c3e, 0xfb57ab8d, 0x55fa1b42, 0x8155ad27, 0x8b574391, 0x1b13ea8a}
-#define P_384_H_Y		{0xc9e821b5, 0x69d9d390, 0xa2616740, 0x6d6d23d6, 0x070be242, 0xd765eb83, 0x1625ceec, 0x4a0f473e, 0xf59f4e30, 0xe2817e62, 0x85bce284, 0x6f15f19d}
-
-/* Base Point Order */
-#define P_384_N			{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xc7634d81, 0xf4372ddf, 0x581a0db2, 0x48b0a77a, 0xecec196a, 0xccc52973}
-
-/* Private Key */
-#define P_384_D			{0xc838b852, 0x53ef8dc7, 0x394fa580, 0x8a518398, 0x1c7deef5, 0xa69ba8f4, 0xf2117ffe, 0xa39cfcd9, 0x0e95f6cb, 0xc854abac, 0xab701d50, 0xc1f3cf24}
-
-/* Per-message Random Number */
-#define P_384_K			{0xdc6b4403, 0x6989a196, 0xe39d1cda, 0xc000812f, 0x4bdd8b2d, 0xb41bb33a, 0xf5137258, 0x5ebd1db6, 0x3f0ce827, 0x5aa1fd45, 0xe2d2a735, 0xf8749359}
-
-/* Public Key */
-#define P_384_Q_X		{0x1fbac8ee, 0xbd0cbf35, 0x640b39ef, 0xe0808dd7, 0x74debff2, 0x0a2a329e, 0x91713baf, 0x7d7f3c3e, 0x81546d88, 0x3730bee7, 0xe48678f8, 0x57b02ca0}
-#define P_384_Q_Y		{0xeb213103, 0xbd68ce34, 0x3365a8a4, 0xc3d4555f, 0xa385f533, 0x0203bdd7, 0x6ffad1f3, 0xaffb9575, 0x1c132007, 0xe1b24035, 0x3cb0a4cf, 0x1693bdf9}
-
-/* Part of Signature */
-#define P_384_R_X		{0xa0c27ec8, 0x93092dea, 0x1e1bd2cc, 0xfed3cf94, 0x5c8134ed, 0x0c9f8131, 0x1a0f4a05, 0x942db8db, 0xed8dd59f, 0x267471d5, 0x462aa14f, 0xe72de856}
-#define P_384_R_Y		{0x85564940, 0x9815bb91, 0x424eaca5, 0xfd76c973, 0x75d575d1, 0x422ec53d, 0x343bd33b, 0x847fdf0c, 0x11569685, 0xb528ab25, 0x49301542, 0x8d7cf72b}
-
-
-//------------------------------------------------------------------------------
-// Parameter and Test Vector Selection
-//------------------------------------------------------------------------------
-#if USE_CURVE == 1
-
-#define ECDSA_Q			P_256_Q
-
-#define ECDSA_ZERO		P_256_ZERO
-#define ECDSA_ONE		P_256_ONE
-
-#define ECDSA_DELTA		P_256_DELTA
-
-#define ECDSA_G_X		P_256_G_X
-#define ECDSA_G_Y		P_256_G_Y
-
-#define ECDSA_H_X		P_256_H_X
-#define ECDSA_H_Y		P_256_H_Y
-
-#define ECDSA_N			P_256_N
-#define ECDSA_D			P_256_D
-#define ECDSA_K			P_256_K
-
-#define ECDSA_Q_X		P_256_Q_X
-#define ECDSA_Q_Y		P_256_Q_Y
-
-#define ECDSA_R_X		P_256_R_X
-#define ECDSA_R_Y		P_256_R_Y
-
-#elif USE_CURVE == 2
-
-#define ECDSA_Q			P_384_Q
-
-#define ECDSA_ZERO		P_384_ZERO
-#define ECDSA_ONE		P_384_ONE
-
-#define ECDSA_DELTA		P_384_DELTA
-
-#define ECDSA_G_X		P_384_G_X
-#define ECDSA_G_Y		P_384_G_Y
-
-#define ECDSA_H_X		P_384_H_X
-#define ECDSA_H_Y		P_384_H_Y
-
-#define ECDSA_N			P_384_N
-#define ECDSA_D			P_384_D
-#define ECDSA_K			P_384_K
-
-#define ECDSA_Q_X		P_384_Q_X
-#define ECDSA_Q_Y		P_384_Q_Y
-
-#define ECDSA_R_X		P_384_R_X
-#define ECDSA_R_Y		P_384_R_Y
-
-#else
-
-#error USE_CURVE must be either 1 or 2!
-
-#endif
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/ecdsa_model_fpga.cpp b/ecdsa_model_fpga.cpp
deleted file mode 100644
index 8ad0962..0000000
--- a/ecdsa_model_fpga.cpp
+++ /dev/null
@@ -1,414 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// ecdsa_model_fpga.cpp
-// --------------------------------------------
-// Base point scalar multiplier model for ECDSA
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Headers
-//------------------------------------------------------------------------------
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include "ecdsa_model.h"
-#include "fpga_lowlevel.h"
-#include "fpga_modular.h"
-#include "fpga_curve.h"
-#include "fpga_util.h"
-
-
-//------------------------------------------------------------------------------
-// Prototypes
-//------------------------------------------------------------------------------
-void	fpga_model_init					();
-bool	test_base_point_multiplier		(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
-
-bool	abuse_internal_point_adder		();
-bool	abuse_internal_point_doubler	();
-
-void	print_fpga_buffer				(const char *s, FPGA_BUFFER *v);
-
-bool	compare_fpga_buffers			(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *bx, FPGA_BUFFER *by);
-bool	compare_fpga_buffers			(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *az, FPGA_BUFFER *bx, FPGA_BUFFER *by, FPGA_BUFFER *bz);
-
-
-//------------------------------------------------------------------------------
-// Locals
-//------------------------------------------------------------------------------
-static FPGA_BUFFER ecdsa_d;
-static FPGA_BUFFER ecdsa_k;
-static FPGA_BUFFER ecdsa_n;
-
-
-//------------------------------------------------------------------------------
-int main()
-//------------------------------------------------------------------------------
-{
-	bool	ok;		// flag
-
-		//
-		// initialize buffers
-		//
-	fpga_model_init();
-	fpga_modular_init();
-	fpga_curve_init();
-
-	
-		//
-		// test base point multiplier: Q = d * G
-		//
-	printf("Trying to derive public key from private key...\n\n");
-	ok = test_base_point_multiplier(&ecdsa_d, &ecdsa_q_x, &ecdsa_q_y);
-	if (!ok) return EXIT_FAILURE;
-
-
-		//
-		// test base point multiplier: R = k * G
-		//
-	printf("Trying to sign something...\n\n");
-	ok = test_base_point_multiplier(&ecdsa_k, &ecdsa_r_x, &ecdsa_r_y);
-	if (!ok) return EXIT_FAILURE;
-
-
-		//
-		// test base point multiplier: O = n * G
-		//
-	printf("Trying to multiply the base point by its order...\n\n");
-	ok = test_base_point_multiplier(&ecdsa_n, &ecdsa_zero, &ecdsa_zero);
-	if (!ok) return EXIT_FAILURE;
-
-
-		//
-		// try to abuse internal point doubler
-		//
-	ok = abuse_internal_point_doubler();
-	if (!ok) return EXIT_FAILURE;
-	
-
-		//
-		// try to abuse internal point adder
-		//
-	ok = abuse_internal_point_adder();
-	if (!ok) return EXIT_FAILURE;
-	
-
-		//
-		// everything went just fine
-		//
-	return EXIT_SUCCESS;
-}
-
-
-//------------------------------------------------------------------------------
-void fpga_model_init()
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-	FPGA_BUFFER tmp_d = ECDSA_D;
-	FPGA_BUFFER tmp_k = ECDSA_K;
-	FPGA_BUFFER tmp_n = ECDSA_N;
-
-		/* fill buffers for large multi-word integers */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{	ecdsa_d.words[w] = tmp_d.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_k.words[w] = tmp_k.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_n.words[w] = tmp_n.words[OPERAND_NUM_WORDS - (w+1)];
-	}
-}
-
-
-//------------------------------------------------------------------------------
-bool test_base_point_multiplier(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
-//------------------------------------------------------------------------------
-//
-// k - multiplier
-//
-// qx, qy - expected coordinates of product
-//
-// Returns true when point (rx,ry) = k * G matches the point (qx,qy).
-//
-//------------------------------------------------------------------------------
-{
-	bool ok;				// flag
-	FPGA_BUFFER rx, ry;		// result
-
-		/* run the model */
-	fpga_curve_scalar_multiply(k, &rx, &ry);
-
-		/* handle result */
-	ok = compare_fpga_buffers(qx, qy, &rx, &ry);
-	if (!ok)
-	{	printf("\n    ERROR\n\n");
-		return false;
-	}
-	else printf("\n    OK\n\n");
-
-		// everything went just fine
-	return true;
-}
-
-
-//------------------------------------------------------------------------------
-bool abuse_internal_point_doubler()
-//------------------------------------------------------------------------------
-//
-// This routine tries to abuse the internal curve point doubler by forcing it
-// to double point at infinity.
-//
-//------------------------------------------------------------------------------
-{
-	int w;		// word counter
-	bool ok;	// flag
-	
-	FPGA_BUFFER px, py, pz;		// input
-	FPGA_BUFFER qx, qy, qz;		// output
-
-		// set P.X and P.Y to some "random" garbage and P.Z to zero
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{	px.words[w] = ecdsa_g_x.words[w] ^ ecdsa_h_x.words[w];
-		py.words[w] = ecdsa_g_y.words[w] ^ ecdsa_h_y.words[w];
-	}
-	fpga_buffer_copy(&ecdsa_zero, &pz);
-
-		// try to double point at infinity (should produce point at infinity)
-	printf("Trying to double something at infinity...\n\n");
-	fpga_curve_double_jacobian(&px, &py, &pz, &qx, &qy, &qz);
-
-		// handle result
-	ok = compare_fpga_buffers(&ecdsa_one, &ecdsa_one, &ecdsa_zero, &qx, &qy, &qz);
-	if (! ok)
-	{	printf("\n    ERROR\n\n");
-		return false;
-	}
-	else printf("\n    OK\n\n");
-
-		// everything went just fine
-	return true;
-}
-
-
-//------------------------------------------------------------------------------
-bool abuse_internal_point_adder()
-//------------------------------------------------------------------------------
-//
-// This routine tries to abuse the internal curve point adder by forcing it to
-// go throgh all the possible "corner cases".
-//
-//------------------------------------------------------------------------------
-{
-	int w;		// word counter	
-	bool ok;	// flag
-
-	FPGA_BUFFER px, py, pz;		// input
-	FPGA_BUFFER rx, ry, rz;		// output
-
-	//
-	// try to add point at infinity to the base point
-	//
-	{
-			// set P.X and P.Y to some "random" garbage and P.Z to zero
-		for (w=0; w<OPERAND_NUM_WORDS; w++)
-		{	px.words[w] = ecdsa_g_x.words[w] ^ ecdsa_h_x.words[w];
-			py.words[w] = ecdsa_g_y.words[w] ^ ecdsa_h_y.words[w];
-		}
-		fpga_buffer_copy(&ecdsa_zero, &pz);
-
-			// run addition proceduce
-		printf("Trying to add something at infinity to the base point...\n\n");
-		fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
-
-			// handle result
-		ok = compare_fpga_buffers(&ecdsa_g_x, &ecdsa_g_y, &ecdsa_one, &rx, &ry, &rz);
-		if (! ok)
-		{	printf("\n    ERROR\n\n");
-			return false;
-		}
-		else printf("\n    OK\n\n");
-	}
-
-	//
-	// try to add the base point to itself
-	//
-	{
-			// set P to G
-		fpga_buffer_copy(&ecdsa_g_x, &px);
-		fpga_buffer_copy(&ecdsa_g_y, &py);
-		fpga_buffer_copy(&ecdsa_one, &pz);
-
-			// run addition proceduce
-		printf("Trying to add the base point to itself...\n\n");
-		fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
-
-			// handle result
-		ok = compare_fpga_buffers(&ecdsa_h_x, &ecdsa_h_y, &ecdsa_one, &rx, &ry, &rz);
-		if (! ok)
-		{	printf("\n    ERROR\n\n");
-			return false;
-		}
-		else printf("\n    OK\n\n");
-	}
-
-	//
-	// try to add the base point to its opposite
-	//
-	{
-			// set P to (G.X, -G.Y, 1)
-		fpga_buffer_copy(&ecdsa_zero, &px);
-		fpga_buffer_copy(&ecdsa_zero, &py);
-		fpga_buffer_copy(&ecdsa_one,  &pz);
-
-		fpga_modular_add(&ecdsa_zero, &ecdsa_g_x, &px);
-		fpga_modular_sub(&ecdsa_zero, &ecdsa_g_y, &py);
-
-			// run addition proceduce
-		printf("Trying to add the base point to its opposite...\n\n");
-		fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
-
-			// handle result
-		ok = compare_fpga_buffers(&ecdsa_one, &ecdsa_one, &ecdsa_zero, &rx, &ry, &rz);
-		if (! ok)
-		{	printf("\n    ERROR\n\n");
-			return false;
-		}
-		else printf("\n    OK\n\n");
-	}
-
-		// everything went just fine
-	return true;
-}
-
-
-//------------------------------------------------------------------------------
-void print_fpga_buffer(const char *s, FPGA_BUFFER *buf)
-//------------------------------------------------------------------------------
-//
-// Pretty print large multi-word integer.
-//
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-		// print header
-	printf("%s", s);
-
-		// print all bytes
-	for (w=OPERAND_NUM_WORDS; w>0; w--)
-	{	
-		printf("%08x", buf->words[w-1]);
-
-			// insert space after every group of 4 bytes
-		if (w > 1) printf(" ");
-	}
-
-		// print footer
-	printf("\n");
-}
-
-
-//------------------------------------------------------------------------------
-bool compare_fpga_buffers(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *bx, FPGA_BUFFER *by)
-//------------------------------------------------------------------------------
-//
-// Compare affine coordinates of two points and return true when they match.
-//
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-		// print all the values
-	print_fpga_buffer("  Expected:   X = ", ax);
-	print_fpga_buffer("  Calculated: X = ", bx);
-	printf("\n");
-	print_fpga_buffer("  Expected:   Y = ", ay);
-	print_fpga_buffer("  Calculated: Y = ", by);
-
-		// compare values
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{
-			// compare x
-		if (ax->words[w] != bx->words[w]) return false;
-
-			// compare y
-		if (ay->words[w] != by->words[w]) return false;
-	}
-
-		// values are the same
-	return true;
-}
-
-
-//------------------------------------------------------------------------------
-bool compare_fpga_buffers(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *az, FPGA_BUFFER *bx, FPGA_BUFFER *by, FPGA_BUFFER *bz)
-//------------------------------------------------------------------------------
-//
-// Compare projective coordinates of two points and return true when they match.
-//
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-		// print all the values
-	print_fpga_buffer("  Expected:   X = ", ax);
-	print_fpga_buffer("  Calculated: X = ", bx);
-	printf("\n");
-	print_fpga_buffer("  Expected:   Y = ", ay);
-	print_fpga_buffer("  Calculated: Y = ", by);
-	printf("\n");
-	print_fpga_buffer("  Expected:   Z = ", az);
-	print_fpga_buffer("  Calculated: Z = ", bz);
-
-		// compare values
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{
-			// compare x
-		if (ax->words[w] != bx->words[w]) return false;
-
-			// compare y
-		if (ay->words[w] != by->words[w]) return false;
-
-			// compare z
-		if (az->words[w] != bz->words[w]) return false;
-	}
-
-		// values are the same
-	return true;
-}
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/fpga_curve.cpp b/fpga_curve.cpp
deleted file mode 100644
index 9cc8ec0..0000000
--- a/fpga_curve.cpp
+++ /dev/null
@@ -1,340 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// fpga_curve.cpp
-// ------------------------------------
-// Elliptic curve arithmetic procedures
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Headers
-//------------------------------------------------------------------------------
-#include <stdint.h>
-#include "ecdsa_model.h"
-#include "fpga_lowlevel.h"
-#include "fpga_modular.h"
-#include "fpga_curve.h"
-#include "fpga_util.h"
-
-
-//------------------------------------------------------------------------------
-// Globals
-//------------------------------------------------------------------------------
-FPGA_BUFFER ecdsa_g_x, ecdsa_g_y;
-FPGA_BUFFER ecdsa_h_x, ecdsa_h_y;
-FPGA_BUFFER ecdsa_q_x, ecdsa_q_y;
-FPGA_BUFFER ecdsa_r_x, ecdsa_r_y;
-
-
-//------------------------------------------------------------------------------
-void fpga_curve_init()
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-	FPGA_BUFFER tmp_g_x = ECDSA_G_X, tmp_g_y = ECDSA_G_Y;
-	FPGA_BUFFER tmp_h_x = ECDSA_H_X, tmp_h_y = ECDSA_H_Y;
-	FPGA_BUFFER tmp_q_x = ECDSA_Q_X, tmp_q_y = ECDSA_Q_Y;
-	FPGA_BUFFER tmp_r_x = ECDSA_R_X, tmp_r_y = ECDSA_R_Y;
-
-		/* fill buffers for large multi-word integers */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{	ecdsa_g_x.words[w]	= tmp_g_x.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_g_y.words[w]	= tmp_g_y.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_h_x.words[w]	= tmp_h_x.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_h_y.words[w]	= tmp_h_y.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_r_x.words[w]	= tmp_r_x.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_r_y.words[w]	= tmp_r_y.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_q_x.words[w]	= tmp_q_x.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_q_y.words[w]	= tmp_q_y.words[OPERAND_NUM_WORDS - (w+1)];
-	}
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Elliptic curve point doubling routine.
-//
-// R(rx,ry,rz) = 2 * P(px,py,pz)
-//
-// Note, that P(px,py,pz) is supposed to be in projective Jacobian coordinates,
-// R will be in projective Jacobian coordinates.
-//
-// This routine implements algorithm 3.21 from "Guide to Elliptic Curve
-// Cryptography", the only difference is that step 6. does T1 = T2 + T2 and
-// then T2 = T2 + T1 instead of T2 = 3 * T2, because our addition is much
-// faster, than multiplication.
-//
-// Note, that this routine also handles one special case, namely when P is at
-// infinity.
-//
-// Instead of actual modular division, multiplication by pre-computed constant
-// (2^-1 mod q) is done.
-//
-// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
-// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
-// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
-// then fpga_modular_mul(pz, &t1, &t1) instead.
-//
-// WARNING: Though this procedure always does doubling steps, it does not take
-// any active measures to keep run-time constant. The main purpose of this
-// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere
-// near production!
-//
-//------------------------------------------------------------------------------
-void fpga_curve_double_jacobian(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz)
-//------------------------------------------------------------------------------
-{
-	FPGA_BUFFER t1, t2, t3;	// temporary variables
-
-		// check, whether P is at infinity
-	bool pz_is_zero = fpga_buffer_is_zero(pz);
-
-	/*  2. */ fpga_buffer_copy(pz,  &t1);
-              fpga_modular_mul(pz,  &t1,          &t1);
-	/*  3. */ fpga_modular_sub(px,  &t1,          &t2);
-	/*  4. */ fpga_modular_add(px,  &t1,          &t1);
-	/*  5. */ fpga_modular_mul(&t1, &t2,          &t2);
-	/*  6. */ fpga_modular_add(&t2, &t2,          &t1);
-	/*     */ fpga_modular_add(&t1, &t2,          &t2);
-	/*  7. */ fpga_modular_add(py,  py,           ry);
-	/*  8. */ fpga_modular_mul(pz,  ry,           rz);
-	/*  9. */ fpga_buffer_copy(ry,  &t1);
-	          fpga_buffer_copy(ry,  &t3);
-	          fpga_modular_mul(&t1, &t3,          ry);
-	/* 10. */ fpga_modular_mul(px,  ry,           &t3);
-	/* 11. */ fpga_buffer_copy(ry,  &t1);
-	          fpga_modular_mul(ry,  &t1,          &t1);
-	/* 12. */ fpga_modular_mul(&t1, &ecdsa_delta, ry);
-	/* 13. */ fpga_buffer_copy(&t2, &t1);
-	          fpga_modular_mul(&t1, &t2,          rx);
-	/* 14. */ fpga_modular_add(&t3, &t3,          &t1);
-	/* 15. */ fpga_modular_sub(rx,  &t1,          rx);
-	/* 16. */ fpga_modular_sub(&t3, rx,           &t1);	
-	/* 17. */ fpga_modular_mul(&t1, &t2,          &t1);
-	/* 18. */ fpga_modular_sub(&t1, ry,           ry);	
-
-		// handle special case (input point is at infinity)
-	if (pz_is_zero)
-	{	fpga_buffer_copy(&ecdsa_one,  rx);
-		fpga_buffer_copy(&ecdsa_one,  ry);
-		fpga_buffer_copy(&ecdsa_zero, rz);
-	}
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Elliptic curve point addition routine.
-//
-// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy)
-//
-// Note, that P(px, py, pz) is supposed to be in projective Jacobian
-// coordinates, while Q(qx,qy) is supposed to be in affine coordinates,
-// R(rx, ry, rz) will be in projective Jacobian coordinates. Moreover, in this
-// particular implementation Q is always the base point G.
-//
-// This routine implements algorithm 3.22 from "Guide to Elliptic Curve
-// Cryptography". Differences from the original algorithm:
-// 
-// 1) Step 1. is omitted, because point Q is always the base point, which is
-//    not at infinity by definition.
-//
-// 2) Step 9.1 just returns the pre-computed double of the base point instead
-// of actually doubling it.
-//
-// Note, that this routine also handles three special cases:
-//
-// 1) P is at infinity
-// 2) P == Q
-// 3) P == -Q
-//
-// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
-// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
-// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
-// then fpga_modular_mul(pz, &t1, &t1) instead.
-//
-// WARNING: This procedure does not take any active measures to keep run-time
-// constant. The main purpose of this model is to help debug Verilog code for
-// FPGA, so *DO NOT* use is anywhere near production!
-//
-//------------------------------------------------------------------------------
-void fpga_curve_add_jacobian(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz)
-//------------------------------------------------------------------------------
-{
-	FPGA_BUFFER t1, t2, t3, t4;		// temporary variables
-	
-	bool pz_is_zero = fpga_buffer_is_zero(pz);		// Step 2.
-
-	/*  3. */ fpga_buffer_copy(pz,  &t1);
-	          fpga_modular_mul(pz,  &t1,        &t1);
-	/*  4. */ fpga_modular_mul(pz,  &t1,        &t2);
-	/*  5. */ fpga_modular_mul(&t1, &ecdsa_g_x, &t1);
-	/*  6. */ fpga_modular_mul(&t2, &ecdsa_g_y, &t2);
-	/*  7. */ fpga_modular_sub(&t1, px,         &t1);
-	/*  8. */ fpga_modular_sub(&t2, py,         &t2);
-
-	bool t1_is_zero = fpga_buffer_is_zero(&t1);		// | Step 9.
-	bool t2_is_zero = fpga_buffer_is_zero(&t2);		// |
-
-	/* 10. */ fpga_modular_mul(pz,  &t1,        rz);
-	/* 11. */ fpga_buffer_copy(&t1, &t3);
-	          fpga_modular_mul(&t1, &t3,        &t3);
-	/* 12. */ fpga_modular_mul(&t1, &t3,        &t4);
-	/* 13. */ fpga_modular_mul(px,  &t3,        &t3);
-	/* 14. */ fpga_modular_add(&t3, &t3,        &t1);
-	/* 15. */ fpga_buffer_copy(&t2, rx);
-	          fpga_modular_mul(rx,  &t2,        rx);
-	/* 16. */ fpga_modular_sub(rx,  &t1,        rx);
-	/* 17. */ fpga_modular_sub(rx,  &t4,        rx);
-	/* 18. */ fpga_modular_sub(&t3, rx,         &t3);
-	/* 19. */ fpga_modular_mul(&t2, &t3,        &t3);
-	/* 20. */ fpga_modular_mul(py,  &t4,        &t4);
-	/* 21. */ fpga_modular_sub(&t3, &t4,        ry);
-
-		//
-		// final selection
-		//
-	if (pz_is_zero)	// P at infinity ?
-	{	fpga_buffer_copy(&ecdsa_g_x, rx);
-		fpga_buffer_copy(&ecdsa_g_y, ry);
-		fpga_buffer_copy(&ecdsa_one, rz);
-	}
-	else if (t1_is_zero) // same x for P and Q ?
-	{
-		fpga_buffer_copy(t2_is_zero ? &ecdsa_h_x : &ecdsa_one,  rx);	// | same y ? (P==Q => R=2*G) : (P==-Q => R=O)
-		fpga_buffer_copy(t2_is_zero ? &ecdsa_h_y : &ecdsa_one,  ry);	// |
-		fpga_buffer_copy(t2_is_zero ? &ecdsa_one : &ecdsa_zero, rz);	// |
-	}
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Conversion from projective Jacobian to affine coordinates.
-//
-// P(px,py,pz) -> Q(qx,qy)
-//
-// Note, that qx = px / Z^2 and qy = py / Z^3. Division in modular arithmetic
-// is equivalent to multiplication by the inverse value of divisor, so
-// qx = px * (pz^-1)^2 and qy = py * (pz^-1)^3.
-//
-// Note, that this procedure does *NOT* handle points at infinity correctly. It
-// can only be called from the base point multiplication routine, that
-// specifically makes sure that P is not at infinity, so pz will always be
-// non-zero value.
-//
-//------------------------------------------------------------------------------
-void fpga_curve_point_to_affine(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
-//------------------------------------------------------------------------------
-{
-	FPGA_BUFFER pz1;					// inverse value of pz
-	FPGA_BUFFER t2, t3;					// intermediate values
-
-	fpga_modular_inv(pz, &pz1);			// pz1 = pz^-1 (mod q)
-
-	fpga_modular_mul(&pz1, &pz1, &t2);	// t2 = pz1 ^ 2 (mod q)
-	fpga_modular_mul(&pz1, &t2,  &t3);	// t3 = tz1 ^ 3 (mod q)
-
-	fpga_modular_mul(px, &t2, qx);		// qx = px * (pz^-1)^2 (mod q)
-	fpga_modular_mul(py, &t3, qy);		// qy = py * (pz^-1)^3 (mod q)
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Elliptic curve base point scalar multiplication routine.
-//
-// Q(qx,qy) = k * G(px,py)
-//
-// Note, that Q is supposed to be in affine coordinates. Multiplication is done
-// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve
-// Cryptography".
-//
-// WARNING: Though this procedure always does the addition step, it only
-// updates the result when current bit of k is set. It does not take any
-// active measures to keep run-time constant. The main purpose of this model
-// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near
-// production!
-//
-//------------------------------------------------------------------------------
-void fpga_curve_scalar_multiply(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
-//------------------------------------------------------------------------------
-{
-	int word_count, bit_count;	// counters
-
-	FPGA_BUFFER rx, ry, rz;		// intermediate result
-	FPGA_BUFFER tx, ty, tz;		// temporary variable
-
-		/* set initial value of R to point at infinity */
-	fpga_buffer_copy(&ecdsa_one,  &rx);
-	fpga_buffer_copy(&ecdsa_one,  &ry);
-	fpga_buffer_copy(&ecdsa_zero, &rz);
-
-		/* process bits of k left-to-right */
-	for (word_count=OPERAND_NUM_WORDS; word_count>0; word_count--)
-		for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
-		{
-				/* calculate T = 2 * R */
-			fpga_curve_double_jacobian(&rx, &ry, &rz, &tx, &ty, &tz);
-
-				/* always calculate R = T + P for constant-time */
-			fpga_curve_add_jacobian(&tx, &ty, &tz, &rx, &ry, &rz);
-
-				/* revert to the value of T before addition if the current bit of k is not set */
-			if (!((k->words[word_count-1] >> (bit_count-1)) & 1))
-			{	fpga_buffer_copy(&tx, &rx);
-				fpga_buffer_copy(&ty, &ry);
-				fpga_buffer_copy(&tz, &rz);
-			}
-
-		}
-
-		// convert result to affine coordinates anyway
-	fpga_curve_point_to_affine(&rx, &ry, &rz, qx, qy);
-
-		// check, that rz is non-zero (not point at infinity)
-	bool rz_is_zero = fpga_buffer_is_zero(&rz);
-
-		// handle special case (result is point at infinity)
-	if (rz_is_zero)
-	{	fpga_buffer_copy(&ecdsa_zero, qx);
-		fpga_buffer_copy(&ecdsa_zero, qy);
-	}
-}
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/fpga_curve.h b/fpga_curve.h
deleted file mode 100644
index c90cc16..0000000
--- a/fpga_curve.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// fpga_curve.h
-// ------------------------------------
-// Elliptic curve arithmetic procedures
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Globals
-//------------------------------------------------------------------------------
-extern FPGA_BUFFER ecdsa_g_x, ecdsa_g_y;
-extern FPGA_BUFFER ecdsa_h_x, ecdsa_h_y;
-extern FPGA_BUFFER ecdsa_q_x, ecdsa_q_y;
-extern FPGA_BUFFER ecdsa_r_x, ecdsa_r_y;
-
-
-//------------------------------------------------------------------------------
-// Prototypes
-//------------------------------------------------------------------------------
-void	fpga_curve_init					();
-void	fpga_curve_scalar_multiply		(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
-void	fpga_curve_add_jacobian			(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz);
-void	fpga_curve_double_jacobian		(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz);
-void	fpga_curve_point_to_affine		(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/fpga_modular.cpp b/fpga_modular.cpp
deleted file mode 100644
index 9b01df0..0000000
--- a/fpga_modular.cpp
+++ /dev/null
@@ -1,838 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// fpga_modular.cpp
-// ---------------------------
-// Modular arithmetic routines
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Headers
-//------------------------------------------------------------------------------
-#include <stdint.h>
-#include "ecdsa_model.h"
-#include "fpga_lowlevel.h"
-#include "fpga_modular.h"
-
-
-//------------------------------------------------------------------------------
-// Globals
-//------------------------------------------------------------------------------
-FPGA_BUFFER ecdsa_q;
-FPGA_BUFFER ecdsa_zero;
-FPGA_BUFFER ecdsa_one;
-FPGA_BUFFER ecdsa_delta;
-
-
-//------------------------------------------------------------------------------
-void fpga_modular_init()
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-	FPGA_BUFFER tmp_q		= ECDSA_Q;
-	FPGA_BUFFER tmp_zero	= ECDSA_ZERO;
-	FPGA_BUFFER tmp_one		= ECDSA_ONE;
-	FPGA_BUFFER tmp_delta	= ECDSA_DELTA;
-
-		/* fill buffers for large multi-word integers */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{	ecdsa_q.words[w]		= tmp_q.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_zero.words[w]		= tmp_zero.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_one.words[w]		= tmp_one.words[OPERAND_NUM_WORDS - (w+1)];
-		ecdsa_delta.words[w]	= tmp_delta.words[OPERAND_NUM_WORDS - (w+1)];
-	}
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Modular addition.
-//
-// This routine implements algorithm 3. from "Ultra High Performance ECC over
-// NIST Primes on Commercial FPGAs".
-//
-// s = (a + b) mod q
-//
-// The naive approach is like the following:
-//
-// 1. s = a + b
-// 2. if (s >= q) s -= q
-//
-// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q)
-// and then select the right variant.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_add(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s)
-//------------------------------------------------------------------------------
-{
-	int w;					// word counter
-	FPGA_BUFFER ab, ab_n;	// intermediate buffers
-	bool c_in, c_out;		// carries
-	bool b_in, b_out;		// borrows
-
-	c_in = false;			// first word has no carry
-	b_in = false;			// first word has no borrow
-	
-		// run parallel addition and subtraction
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{
-		fpga_lowlevel_add32(a->words[w], b->words[w],      c_in, &ab.words[w],   &c_out);
-		fpga_lowlevel_sub32(ab.words[w], ecdsa_q.words[w], b_in, &ab_n.words[w], &b_out);
-
-		c_in = c_out;	// propagate carry
-		b_in = b_out;	// propagate borrow
-	}
-
-		// now select the right buffer
-
-		/*
-		 * We select the right variant based on borrow and carry flags after
-		 * addition and subtraction of the very last pair of words. Note, that
-		 * we only need to select the first variant (a + b) when (a + b) < q.
-		 * This way if we get negative number after subtraction, we discard it
-		 * and use the output of the adder instead. The subtractor output is
-		 * negative when borrow flag is set *and* carry flag is not set. When
-		 * both borrow and carry are set, the number is non-negative, because
-		 * borrow and carry cancel each other out.
-		 */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		s->words[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w];
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Modular subtraction.
-//
-// This routine implements algorithm 3. from "Ultra High Performance ECC over
-// NIST Primes on Commercial FPGAs".
-//
-// d = (a - b) mod q
-//
-// The naive approach is like the following:
-//
-// 1. d = a - b
-// 2. if (a < b) d += q
-//
-// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q)
-// and then select the right variant.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_sub(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d)
-//------------------------------------------------------------------------------
-{
-	int w;					// word counter
-	FPGA_BUFFER ab, ab_n;	// intermediate buffers
-	bool c_in, c_out;		// carries
-	bool b_in, b_out;		// borrows
-
-	c_in = false;			// first word has no carry
-	b_in = false;			// first word has no borrow
-	
-		// run parallel subtraction and addition
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-	{
-		fpga_lowlevel_sub32(a->words[w], b->words[w],          b_in, &ab.words[w],   &b_out);
-		fpga_lowlevel_add32(ab.words[w], ecdsa_q.words[w], c_in, &ab_n.words[w], &c_out);
-
-		b_in = b_out;	// propagate borrow
-		c_in = c_out;	// propagate carry
-	}
-
-		// now select the right buffer
-
-		/*
-		 * We select the right variant based on borrow flag after subtraction
-		 * and addition of the very last pair of words. Note, that we only
-		 * need to select the second variant (a - b + q) when a < b. This way
-		 * if we get negative number after subtraction, we discard it
-		 * and use the output of the adder instead. The Subtractor output is
-		 * negative when borrow flag is set.
-		 */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		d->words[w] = b_out ? ab_n.words[w] : ab.words[w];
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Modular multiplication.
-//
-// This routine implements modular multiplication algorithm from "Ultra High
-// Performance ECC over NIST Primes on Commercial FPGAs".
-//
-// p = (a * b) mod q
-//
-// The complex algorithm is split into three parts:
-//
-// 1. Calculation of partial words
-// 2. Acccumulation of partial words into full-size product
-// 3. Modular reduction of the full-size product
-//
-// See comments for corresponding helper routines for more information.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_mul(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p)
-//------------------------------------------------------------------------------
-{
-	FPGA_WORD_EXTENDED si[4*OPERAND_NUM_WORDS-1];	// parts of intermediate product
-	FPGA_WORD c[2*OPERAND_NUM_WORDS];				// full-size intermediate product
-
-		/* multiply to get partial words */
-	fpga_modular_mul_helper_multiply(a, b, si);
-
-		/* accumulate partial words into full-size product */
-	fpga_modular_mul_helper_accumulate(si, c);
-
-		/* reduce full-size product using special routine */
-	fpga_modular_mul_helper_reduce(c, p);
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Modular multiplicative inversion procedure.
-//
-// a1 = a^-1 (mod n)
-//
-// This routine implements the algorithm from "Constant Time Modular
-// Inversion" by Joppe W. Bos (http://www.joppebos.com/files/CTInversion.pdf)
-//
-// The algorithm has two phases: 1) calculation of "almost" modular inverse,
-// which is a^-1*2^k and 2) removal of redundant factor 2^k.
-//
-// The first stage has four temporary variables: r, s, u, v; they are updated
-// at every iteration. Depending on flags there can be four branches, FPGA
-// will pre-calculate all possible values in parallel and then use a multiplexor
-// to select the next value. This implementation also calculates all possible
-// outcomes. This is done for debugging purposes, *NOT* for constant run-time!
-//
-// The second part only works with s and k.
-//
-// Note, that k is a simple counter, and it can't exceed 2*OPERAND_WIDTH.
-//
-// The complex inversion algorithm uses helper routines. Note, that width of the
-// intermediate results can temporarily exceed OPERAND_WIDTH, so all the helper
-// routines process OPERAND_NUM_WORDS+1 words.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv(FPGA_BUFFER *a, FPGA_BUFFER *a1)
-{
-	int i;	// round counter
-	int w;	// word counter
-	int k;	// redundant power of 2
-
-		/* q, 1 */
-	FPGA_WORD buf_q[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_1[OPERAND_NUM_WORDS+1];
-
-		/* r, s */
-	FPGA_WORD buf_r[OPERAND_NUM_WORDS+1],        buf_s[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_r_double[OPERAND_NUM_WORDS+1], buf_s_double[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_r_new[OPERAND_NUM_WORDS+1],    buf_s_new[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_r_plus_s[OPERAND_NUM_WORDS+1], buf_s_plus_r[OPERAND_NUM_WORDS+1];
-
-		/* u, v */
-	FPGA_WORD buf_u[OPERAND_NUM_WORDS+1],              buf_v[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_u_half[OPERAND_NUM_WORDS+1],         buf_v_half[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_u_minus_v[OPERAND_NUM_WORDS+1],      buf_v_minus_u[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_u_minus_v_half[OPERAND_NUM_WORDS+1], buf_v_minus_u_half[OPERAND_NUM_WORDS+1];
-	FPGA_WORD buf_u_new[OPERAND_NUM_WORDS+1],          buf_v_new[OPERAND_NUM_WORDS+1];
-
-		/* comparison */
-	int cmp_v_1, cmp_u_v;
-
-		/* clear buffers */
-	for (w=0; w<=OPERAND_NUM_WORDS; w++)
-		buf_r[w] = 0, buf_s[w] = 0,
-		buf_u[w] = 0, buf_v[w] = 0,
-		buf_q[w] = 0, buf_1[w] = 0;
-
-		/* initialize q, 1 */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		buf_q[w] = ecdsa_q.words[w], buf_1[w] = ecdsa_one.words[w];
-
-		/* initialize r, s */
-	buf_r[0] = 0, buf_s[0] = 1;
-
-		/* initialize u, v */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		buf_u[w] = ecdsa_q.words[w], buf_v[w] = a->words[w];
-
-		/* initialize k */
-	k = 0;
-
-		/* flags for the first stage */
-	bool v_is_1, u_is_greater_than_v, u_is_even, v_is_even;			
-
-		/* first stage */
-	for (i=0; i<(2*OPERAND_WIDTH); i++)
-	{
-			/* pre-calculate all possible values for r and s */
-		fpga_modular_inv_helper_shl(buf_r, buf_r_double);			// r_double = 2 * r
-		fpga_modular_inv_helper_shl(buf_s, buf_s_double);			// s_double = 2 * s
-		fpga_modular_inv_helper_add(buf_r, buf_s, buf_r_plus_s);	// r_plus_s = r + s
-		fpga_modular_inv_helper_add(buf_s, buf_r, buf_s_plus_r);	// s_plus_r = s + r
-
-			/* pre-calculate all possible values for u and v */
-		fpga_modular_inv_helper_shr(buf_u, buf_u_half);						// u_half = u / 2
-		fpga_modular_inv_helper_shr(buf_v, buf_v_half);						// v_half = v / 2
-		fpga_modular_inv_helper_sub(buf_u, buf_v, buf_u_minus_v);			// u_minus_v = u - v
-		fpga_modular_inv_helper_sub(buf_v, buf_u, buf_v_minus_u);			// v_minus_u = v - u
-		fpga_modular_inv_helper_shr(buf_u_minus_v, buf_u_minus_v_half);		// u_minus_v_half = u_minus_v / 2
-		fpga_modular_inv_helper_shr(buf_v_minus_u, buf_v_minus_u_half);		// v_minus_u_half = v_minus_u / 2
-
-			/* compare */
-		fpga_modular_inv_helper_cmp(buf_v, buf_1, &cmp_v_1);
-		fpga_modular_inv_helper_cmp(buf_u, buf_v, &cmp_u_v);
-
-			/* assign flags */
-		v_is_1				= (cmp_v_1 == 0);
-		u_is_greater_than_v	= (cmp_u_v  > 0);
-		u_is_even			= !(buf_u[0] & 1);
-		v_is_even			= !(buf_v[0] & 1);
-
-			/* select u */
-		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_u_new, buf_u_half);
-		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_u_new, buf_u);
-		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_u_new, u_is_greater_than_v ? buf_u_minus_v_half : buf_u);
-
-			/* select v */
-		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_v_new, buf_v);
-		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_v_new, buf_v_half);
-		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_v_new, u_is_greater_than_v ? buf_v : buf_v_minus_u_half);
-
-			/* select r */
-		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_r_new, buf_r);
-		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_r_new, buf_r_double);
-		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_r_new, u_is_greater_than_v ? buf_r_plus_s : buf_r_double);
-
-			/* select s */
-		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_s_new, buf_s_double);
-		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_s_new, buf_s);
-		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_s_new, u_is_greater_than_v ? buf_s_double : buf_s_plus_r);
-
-			/* update values */
-		if (!v_is_1)
-		{	fpga_modular_inv_helper_cpy(buf_u, buf_u_new);
-			fpga_modular_inv_helper_cpy(buf_v, buf_v_new);
-			fpga_modular_inv_helper_cpy(buf_r, buf_r_new);
-			fpga_modular_inv_helper_cpy(buf_s, buf_s_new);
-		}
-
-			/* update k */
-		if (!v_is_1) k++;
-	}
-
-		//
-		// Note, that to save FPGA resources, the second stage re-uses buffers
-		// used in the first stage.
-		//
-
-		/* flags for the second stage */
-	bool k_is_0, s_is_odd;
-
-		/* second stage */
-	for (i=0; i<(2*OPERAND_WIDTH); i++)
-	{
-			/* pre-calculate all possible values */
-		fpga_modular_inv_helper_shr(buf_s, buf_u);
-		fpga_modular_inv_helper_add(buf_s, buf_q, buf_r);
-		fpga_modular_inv_helper_shr(buf_r, buf_v);
-
-			//
-			// assign flags
-			//
-		s_is_odd = buf_s[0] & 1;
-		k_is_0   = (k == 0);
-
-			//
-			// select new values based on flags
-			//
-		fpga_modular_inv_helper_cpy(buf_s_new, s_is_odd ? buf_v : buf_u);
-
-			/* update s */
-		if (! k_is_0)
-			fpga_modular_inv_helper_cpy(buf_s, buf_s_new);
-
-			/* update k */
-		if (! k_is_0) k--;
-	}
-
-		/* done, copy s into output buffer */
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		a1->words[w] = buf_s[w];
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Parallelized multiplication.
-//
-// This routine implements the algorithm in Fig. 3. from "Ultra High
-// Performance ECC over NIST Primes on Commercial FPGAs".
-//
-// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2
-// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide
-// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS
-// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial
-// output words si[].
-//
-// The order of loading a and b into the multipliers is a bit complicated,
-// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is
-// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the
-// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_mul_helper_multiply(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si)
-//------------------------------------------------------------------------------
-{
-	int w;			// counter
-	int t, x;		// more counters
-	int j, i;		// word indices
-	FPGA_WORD p;	// product
-
-		// buffers for smaller words that multipliers can handle
-	FPGA_WORD_REDUCED ai[2*OPERAND_NUM_WORDS];
-	FPGA_WORD_REDUCED bj[2*OPERAND_NUM_WORDS];
-	
-		// split a and b into smaller words
-	for (w=0; w<OPERAND_NUM_WORDS; w++)
-		ai[2*w] = (FPGA_WORD_REDUCED)a->words[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)),
-		bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2));
-
-		// accumulators
-	FPGA_WORD_EXTENDED mac[2*OPERAND_NUM_WORDS];
-	
-		// clear accumulators
-	for (w=0; w<(2*OPERAND_NUM_WORDS); w++) mac[w] = 0;
-
-		// run the crazy algorithm :)
-	for (t=0; t<(2*OPERAND_NUM_WORDS); t++)
-	{
-			// save upper half of si[] (one word per cycle)
-		if (t > 0)
-		{	si[4*OPERAND_NUM_WORDS - (t+1)] = mac[t];
-			mac[t] = 0;
-		}
-
-			// update index
-		j = 2*OPERAND_NUM_WORDS - (t+1);
-
-			// parallel multiplication
-		for (x=0; x<(2*OPERAND_NUM_WORDS); x++)
-		{
-				// update index
-			i = t - x;
-			if (i < 0) i += 2*OPERAND_NUM_WORDS;
-
-				// multiply...
-			fpga_lowlevel_mul16(ai[i], bj[j], &p);
-
-				// ...accumulate
-			mac[x] += p;
-		}
-
-	}
-
-		// now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once)
-	for (w=0; w<(2*OPERAND_NUM_WORDS); w++)
-		si[w] = mac[2*OPERAND_NUM_WORDS - (w+1)];
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Accumulation of partial words into full-size product.
-//
-// This routine implements the Algorithm 4. from "Ultra High Performance ECC
-// over NIST Primes on Commercial FPGAs".
-//
-// Input words si[] are accumulated into full-size product c[].
-//
-// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in
-// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are
-// summed in full, odd words are split into two parts. During every cycle the
-// upper part of the previous word and the lower part of the following word are
-// summed too.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_mul_helper_accumulate(FPGA_WORD_EXTENDED *si, FPGA_WORD *c)
-//------------------------------------------------------------------------------
-{
-	int w;							// word counter
-	FPGA_WORD_EXTENDED cw0, cw1;	// intermediate sums
-	FPGA_WORD_REDUCED  cw_carry;	// wide carry
-
-		// clear carry
-	cw_carry = 0;
-
-		// execute the algorithm
-	for (w=0; w<(2*OPERAND_NUM_WORDS); w++)
-	{
-			// handy flags
-		bool w_is_first = (w == 0);
-		bool w_is_last  = (w == (2*OPERAND_NUM_WORDS-1));
-
-			// accumulate full current even word...
-			// ...and also the upper part of the previous odd word (if not the first word)
-		fpga_lowlevel_add48(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0);
-
-			// generate another word from "carry" part of the previous even word...
-			// ...and also the lower part of the following odd word (if not the last word)
-		cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2));
-		cw1 |= (FPGA_WORD_EXTENDED)cw_carry;
-
-			// accumulate once again
-		fpga_lowlevel_add48(cw0, cw1, &cw1);
-
-			// store current word...
-		c[w] = (FPGA_WORD)cw1;
-
-			// ...and carry
-		cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH);
-	}
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Fast modular reduction for NIST prime P-256.
-//
-// p = c mod p256
-//
-// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve
-// Cryptography".
-//
-// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
-// on the other hand is the output of the parallelized Comba multiplier, so it
-// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
-//
-// To save FPGA resources, the calculation is done using only two adders and
-// one subtractor. The algorithm is split into five steps.
-//
-//------------------------------------------------------------------------------
-#if USE_CURVE == 1
-void fpga_modular_mul_helper_reduce_p256(FPGA_WORD *c, FPGA_BUFFER *p)
-{
-		// "funny" words
-	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9;
-
-		// compose "funny" words out of input words
-	s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0];
-	s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0,     s2.words[1] = 0,     s2.words[0] = 0;
-	s3.words[7] = 0,     s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0,     s3.words[1] = 0,     s3.words[0] = 0;
-	s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0,     s4.words[4] = 0,     s4.words[3] = 0,     s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8];
-	s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9];
-	s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0,     s6.words[4] = 0,     s6.words[3] = 0,     s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11];
-	s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0,     s7.words[4] = 0,     s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12];
-	s8.words[7] = c[12], s8.words[6] = 0,     s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13];
-	s9.words[7] = c[13], s9.words[6] = 0,     s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0,     s9.words[1] = c[15], s9.words[0] = c[14];
-	
-		// intermediate results
-	FPGA_BUFFER sum0, sum1, difference;
-
-		/* Step 1. */
-	fpga_modular_add(&s2,         &s2,         &sum0);			// sum0 = 2*s2
-	fpga_modular_add(&s3,         &s3,         &sum1);			// sum1 = 2*s3
-	fpga_modular_sub(&ecdsa_zero, &s6,         &difference);	// difference = -s6
-
-		/* Step 2. */
-	fpga_modular_add(&sum0,       &s1,         &sum0);			// sum0 = s1 + 2*s2
-	fpga_modular_add(&sum1,       &s4,         &sum1);			// sum1 = s4 + 2*s3
-	fpga_modular_sub(&difference, &s7,         &difference);	// difference = -(s6 + s7)
-
-		/* Step 3. */
-	fpga_modular_add(&sum0,       &s5,         &sum0);			// sum0 = s1 + 2*s2 + s5
-	fpga_modular_add(&sum1,       &ecdsa_zero, &sum1);			// compulsory cycle to keep sum1 constant for next stage
-	fpga_modular_sub(&difference, &s8,         &difference);	// difference = -(s6 + s7 + s8)
-
-		/* Step 4. */
-	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
-//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
-	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s6 + s7 + s8 + s9)
-
-		/* Step 5. */
-	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
-//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
-//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
-}
-#endif
-
-
-//------------------------------------------------------------------------------
-//
-// Fast modular reduction for NIST prime P-384.
-//
-// p = c mod p384
-//
-// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
-// Cryptography".
-//
-// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
-// on the other hand is the output of the parallelized Comba multiplier, so it
-// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
-//
-// To save FPGA resources, the calculation is done using only two adders and
-// one subtractor. The algorithm is split into five steps.
-//
-//------------------------------------------------------------------------------
-#if USE_CURVE == 2
-void fpga_modular_mul_helper_reduce_p384(FPGA_WORD *c, FPGA_BUFFER *p)
-{
-		// "funny" words
-	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-
-		// compose "funny" words
-	 s1.words[11] = c[11],   s1.words[10] = c[10],   s1.words[ 9] = c[ 9],   s1.words[ 8] = c[ 8],   s1.words[ 7] = c[ 7],   s1.words[ 6] = c[ 6],   s1.words[ 5] = c[ 5],   s1.words[ 4] = c[ 4],   s1.words[ 3] = c[ 3],   s1.words[ 2] = c[ 2],   s1.words[ 1] = c[ 1],   s1.words[ 0] = c[ 0];
-	 s2.words[11] = 0,       s2.words[10] = 0,       s2.words[ 9] = 0,       s2.words[ 8] = 0,       s2.words[ 7] = 0,       s2.words[ 6] = c[23],   s2.words[ 5] = c[22],   s2.words[ 4] = c[21],   s2.words[ 3] = 0,       s2.words[ 2] = 0,       s2.words[ 1] = 0,       s2.words[ 0] = 0;
-	 s3.words[11] = c[23],   s3.words[10] = c[22],   s3.words[ 9] = c[21],   s3.words[ 8] = c[20],   s3.words[ 7] = c[19],   s3.words[ 6] = c[18],   s3.words[ 5] = c[17],   s3.words[ 4] = c[16],   s3.words[ 3] = c[15],   s3.words[ 2] = c[14],   s3.words[ 1] = c[13],   s3.words[ 0] = c[12];
-	 s4.words[11] = c[20],   s4.words[10] = c[19],   s4.words[ 9] = c[18],   s4.words[ 8] = c[17],   s4.words[ 7] = c[16],   s4.words[ 6] = c[15],   s4.words[ 5] = c[14],   s4.words[ 4] = c[13],   s4.words[ 3] = c[12],   s4.words[ 2] = c[23],   s4.words[ 1] = c[22],   s4.words[ 0] = c[21];
-	 s5.words[11] = c[19],   s5.words[10] = c[18],   s5.words[ 9] = c[17],   s5.words[ 8] = c[16],   s5.words[ 7] = c[15],   s5.words[ 6] = c[14],   s5.words[ 5] = c[13],   s5.words[ 4] = c[12],   s5.words[ 3] = c[20],   s5.words[ 2] = 0,       s5.words[ 1] = c[23],   s5.words[ 0] = 0;
-	 s6.words[11] = 0,       s6.words[10] = 0,       s6.words[ 9] = 0,       s6.words[ 8] = 0,       s6.words[ 7] = c[23],   s6.words[ 6] = c[22],   s6.words[ 5] = c[21],   s6.words[ 4] = c[20],   s6.words[ 3] = 0,       s6.words[ 2] = 0,       s6.words[ 1] = 0,       s6.words[ 0] = 0;
-	 s7.words[11] = 0,       s7.words[10] = 0,       s7.words[ 9] = 0,       s7.words[ 8] = 0,       s7.words[ 7] = 0,       s7.words[ 6] = 0,       s7.words[ 5] = c[23],   s7.words[ 4] = c[22],   s7.words[ 3] = c[21],   s7.words[ 2] = 0,       s7.words[ 1] = 0,       s7.words[ 0] = c[20];
-	 s8.words[11] = c[22],   s8.words[10] = c[21],   s8.words[ 9] = c[20],   s8.words[ 8] = c[19],   s8.words[ 7] = c[18],   s8.words[ 6] = c[17],   s8.words[ 5] = c[16],   s8.words[ 4] = c[15],   s8.words[ 3] = c[14],   s8.words[ 2] = c[13],   s8.words[ 1] = c[12],   s8.words[ 0] = c[23];
-	 s9.words[11] = 0,       s9.words[10] = 0,       s9.words[ 9] = 0,       s9.words[ 8] = 0,       s9.words[ 7] = 0,       s9.words[ 6] = 0,       s9.words[ 5] = 0,       s9.words[ 4] = c[23],   s9.words[ 3] = c[22],   s9.words[ 2] = c[21],   s9.words[ 1] = c[20],   s9.words[ 0] = 0;
-	s10.words[11] = 0,      s10.words[10] = 0,      s10.words[ 9] = 0,      s10.words[ 8] = 0,      s10.words[ 7] = 0,      s10.words[ 6] = 0,      s10.words[ 5] = 0,      s10.words[ 4] = c[23],  s10.words[ 3] = c[23],  s10.words[ 2] = 0,      s10.words[ 1] = 0,      s10.words[ 0] = 0;
-
-		// intermediate results
-	FPGA_BUFFER sum0, sum1, difference;
-
-		/* Step 1. */
-	fpga_modular_add(&s1,         &s3,         &sum0);			// sum0 = s1 + s3
-	fpga_modular_add(&s2,         &s2,         &sum1);			// sum1 = 2*s2
-	fpga_modular_sub(&ecdsa_zero, &s8,         &difference);	// difference = -s8
-
-		/* Step 2. */
-	fpga_modular_add(&sum0,       &s4,         &sum0);			// sum0 = s1 + s3 + s4
-	fpga_modular_add(&sum1,       &s5,         &sum1);			// sum1 = 2*s2 + s5
-	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s8 + s9)
-
-		/* Step 3. */
-	fpga_modular_add(&sum0,       &s6,         &sum0);			// sum0 = s1 + s3 + s4 + s6
-	fpga_modular_add(&sum1,       &s7,         &sum1);			// sum1 = 2*s2 + s5 + s7
-	fpga_modular_sub(&difference, &s10,        &difference);	// difference = -(s8 + s9 + s10)
-
-		/* Step 4. */
-	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
-//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
-	fpga_modular_sub(&difference, &ecdsa_zero, &difference);	// compulsory cycle to keep difference constant for next stage
-
-		/* Step 5. */
-	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
-//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
-//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
-}
-#endif
-
-
-//------------------------------------------------------------------------------
-//
-// Multi-word shift to the left by 1 bit.
-//
-// y = x << 1
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv_helper_shl(FPGA_WORD *x, FPGA_WORD *y)
-//------------------------------------------------------------------------------
-{
-	int w;							// word counter
-	FPGA_WORD carry_in, carry_out;	// carries
-
-	carry_in = 0;	// first word has no carry
-
-		// shift word-by-word
-	for (w=0; w<=OPERAND_NUM_WORDS; w++)
-		carry_out = x[w] >> (FPGA_WORD_WIDTH - 1),	// store next carry
-		y[w]  = x[w] << 1,							// shift
-		y[w] |= carry_in,							// process carry
-		carry_in = carry_out;						// propagate carry
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Multi-word shift to the right by 1 bit.
-//
-// y = x >> 1
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv_helper_shr(FPGA_WORD *x, FPGA_WORD *y)
-//------------------------------------------------------------------------------
-{
-	int w;							// word counter
-	FPGA_WORD carry_in, carry_out;	// carries
-
-	carry_in = 0;	// first word has no carry
-
-		// shift word-by-word
-	for (w=OPERAND_NUM_WORDS; w>=0; w--)
-		carry_out = x[w],							// store next carry
-		y[w] = x[w] >> 1,							// shift
-		y[w] |= carry_in << (FPGA_WORD_WIDTH - 1),	// process carry
-		carry_in = carry_out;						// propagate carry
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Multi-word addition.
-//
-// s = x + y
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv_helper_add(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s)
-//------------------------------------------------------------------------------
-{
-	int w;						// word counter
-	bool carry_in, carry_out;	// carries
-
-		// lowest word has no carry
-	carry_in = false;
-
-		// sum a and b word-by-word
-	for (w=0; w<=OPERAND_NUM_WORDS; w++)
-	{
-			// low-level addition
-		fpga_lowlevel_add32(x[w], y[w], carry_in, &s[w], &carry_out);
-
-			// propagate carry bit
-		carry_in = carry_out;
-	}
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Multi-word subtraction .
-//
-// d = x - y
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv_helper_sub(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d)
-//------------------------------------------------------------------------------
-{
-	int w;							// word counter
-	bool borrow_in, borrow_out;		// borrows
-
-		// lowest word has no borrow
-	borrow_in = false;				
-
-		// subtract b from a word-by-word
-	for (w=0; w<=OPERAND_NUM_WORDS; w++)
-	{
-			// low-level subtraction
-		fpga_lowlevel_sub32(x[w], y[w], borrow_in, &d[w], &borrow_out);
-
-			// propagate borrow bit
-		borrow_in = borrow_out;
-	}
-
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Multi-word copy.
-//
-// dst = src
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv_helper_cpy(FPGA_WORD *dst, FPGA_WORD *src)
-//------------------------------------------------------------------------------
-{
-	int w;	// word counter
-
-		// copy all the words from src into dst
-	for (w=0; w<=OPERAND_NUM_WORDS; w++)
-		dst[w] = src[w];
-}
-
-
-//------------------------------------------------------------------------------
-//
-// Multi-word comparison.
-//
-// The return value is -1 when a<b, 0 when a=b and 1 when a>b.
-//
-//------------------------------------------------------------------------------
-void fpga_modular_inv_helper_cmp(FPGA_WORD *a, FPGA_WORD *b, int *c)
-//------------------------------------------------------------------------------
-{
-	int w;					// word counter
-	int r, r0, ra, rb;		// result
-	bool borrow;			// borrow
-	FPGA_WORD d;			// partial difference
-
-		// result is unknown so far
-	r = 0;	
-
-		// clear borrow for the very first word
-	borrow = false;
-
-		// compare a and b word-by-word
-	for (w=OPERAND_NUM_WORDS; w>=0; w--)
-	{
-			// save result
-		r0 = r;
-
-			// subtract current words
-		fpga_lowlevel_sub32(a[w], b[w], false, &d, &borrow);
-
-			// analyze flags
-		rb = borrow ? -1 : 0;					// a[w] < b[w]
-		ra = (!borrow && (d != 0)) ? 1 : 0;		// a[w] > b[w]
-		
-			//
-			// Note, that ra is either 0 or 1, rb is either 0 or -1 and they
-			// can never be non-zero at the same time.
-			//
-			// Note, that r can only be updated if comparison result has not
-			// been resolved yet. Even if we already know comparison result,
-			// we continue doing dummy subtractions to keep run-time constant.
-			//
-
-			// update result
-		r = (r == 0) ? ra + rb : r0;
-	}
-
-		// done
-	*c = r;
-}
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/fpga_util.h b/fpga_util.h
deleted file mode 100644
index 24b2aa2..0000000
--- a/fpga_util.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// fpga_util.h
-// ---------------------
-// Utility FPGA routines
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-
-//------------------------------------------------------------------------------
-// Prototypes
-//------------------------------------------------------------------------------
-bool	fpga_buffer_is_zero		(FPGA_BUFFER *x);
-void	fpga_buffer_copy		(FPGA_BUFFER *src, FPGA_BUFFER *dst);
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/test_vectors/charlie_p256.key b/test_vectors/charlie_p256.key
new file mode 100644
index 0000000..34b5c20
--- /dev/null
+++ b/test_vectors/charlie_p256.key
@@ -0,0 +1,8 @@
+-----BEGIN EC PARAMETERS-----
+BggqhkjOPQMBBw==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MHcCAQEEIFA+WK/POvMze/U0CURQErgftFSMTSsTAqYX+bHQDX3goAoGCCqGSM49
+AwEHoUQDQgAE2TSbSLDujDYTmx147cGRGyUId/t61Erhi7L4pvTcyuXgoFbotQkq
+bHKPS8iQQ/vifCYnRN+9rxeD/C4BsGB3Gw==
+-----END EC PRIVATE KEY-----
diff --git a/test_vectors/charlie_p384.key b/test_vectors/charlie_p384.key
new file mode 100644
index 0000000..8894220
--- /dev/null
+++ b/test_vectors/charlie_p384.key
@@ -0,0 +1,9 @@
+-----BEGIN EC PARAMETERS-----
+BgUrgQQAIg==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MIGkAgEBBDC7SgnY5SfwYmZetNCzmh3OlNqixZNbMWOwkPB5PuNqWmKnBV1dhQ0b
+FdxlbWXYs6KgBwYFK4EEACKhZANiAAS4Aj5grkLqFGMw8sOIMJbKlhsR9d/qSh1l
+6Y5kszUn+1cibbSKUUMlHvBr3veOtXrTxmRpYlqqraNH4QM8FHS2NDqTaP8pRQG7
+1TscxJ/ZctpDnJ2oJ+IwJyDit43RT54=
+-----END EC PRIVATE KEY-----
diff --git a/test_vectors/ecdsa_test_vector_nsa.h b/test_vectors/ecdsa_test_vector_nsa.h
new file mode 100644
index 0000000..762f284
--- /dev/null
+++ b/test_vectors/ecdsa_test_vector_nsa.h
@@ -0,0 +1,56 @@
+/* Values from "Suite B Implementer's Guide to FIPS 186-3 (ECDSA)" */
+
+#define ECDSA_P256_D_NSA_INIT 	\
+	{0x70a12c2d, 0xb16845ed, 0x56ff68cf, 0xc21a472b, \
+	 0x3f04d7d6, 0x851bf634, 0x9f2d7d5b, 0x3452b38a}
+
+#define ECDSA_P256_QX_NSA_INIT	\
+	{0x8101ece4, 0x7464a6ea, 0xd70cf69a, 0x6e2bd3d8, \
+	 0x8691a326, 0x2d22cba4, 0xf7635eaf, 0xf26680a8}
+	 
+#define ECDSA_P256_QY_NSA_INIT	\
+	{0xd8a12ba6, 0x1d599235, 0xf67d9cb4, 0xd58f1783, \
+	 0xd3ca43e7, 0x8f0a5aba, 0xa6240799, 0x36c0c3a9}
+	 
+#define ECDSA_P256_K_NSA_INIT	\
+	{0x580ec00d, 0x85643433, 0x4cef3f71, 0xecaed496, \
+	 0x5b12ae37, 0xfa47055b, 0x1965c7b1, 0x34ee45d0}
+
+#define ECDSA_P256_RX_NSA_INIT	\
+	{0x7214bc96, 0x47160bbd, 0x39ff2f80, 0x533f5dc6, \
+	 0xddd70ddf, 0x86bb8156, 0x61e805d5, 0xd4e6f27c}
+	 
+#define ECDSA_P256_RY_NSA_INIT	\
+	{0x8b81e3e9, 0x77597110, 0xc7cf2633, 0x435b2294, \
+	 0xb7264298, 0x7defd3d4, 0x007e1cfc, 0x5df84541}
+
+
+#define ECDSA_P384_D_NSA_INIT 	\
+	{0xc838b852, 0x53ef8dc7, 0x394fa580, 0x8a518398, \
+	 0x1c7deef5, 0xa69ba8f4, 0xf2117ffe, 0xa39cfcd9, \
+	 0x0e95f6cb, 0xc854abac, 0xab701d50, 0xc1f3cf24}
+
+#define ECDSA_P384_QX_NSA_INIT	\
+	{0x1fbac8ee, 0xbd0cbf35, 0x640b39ef, 0xe0808dd7, \
+	 0x74debff2, 0x0a2a329e, 0x91713baf, 0x7d7f3c3e, \
+	 0x81546d88, 0x3730bee7, 0xe48678f8, 0x57b02ca0}
+	 
+#define ECDSA_P384_QY_NSA_INIT	\
+	{0xeb213103, 0xbd68ce34, 0x3365a8a4, 0xc3d4555f, \
+	 0xa385f533, 0x0203bdd7, 0x6ffad1f3, 0xaffb9575, \
+	 0x1c132007, 0xe1b24035, 0x3cb0a4cf, 0x1693bdf9}
+	 
+#define ECDSA_P384_K_NSA_INIT	\
+	{0xdc6b4403, 0x6989a196, 0xe39d1cda, 0xc000812f, \
+	 0x4bdd8b2d, 0xb41bb33a, 0xf5137258, 0x5ebd1db6, \
+	 0x3f0ce827, 0x5aa1fd45, 0xe2d2a735, 0xf8749359}
+
+#define ECDSA_P384_RX_NSA_INIT	\
+	{0xa0c27ec8, 0x93092dea, 0x1e1bd2cc, 0xfed3cf94, \
+	 0x5c8134ed, 0x0c9f8131, 0x1a0f4a05, 0x942db8db, \
+	 0xed8dd59f, 0x267471d5, 0x462aa14f, 0xe72de856}
+	 
+#define ECDSA_P384_RY_NSA_INIT	\
+	{0x85564940, 0x9815bb91, 0x424eaca5, 0xfd76c973, \
+	 0x75d575d1, 0x422ec53d, 0x343bd33b, 0x847fdf0c, \
+	 0x11569685, 0xb528ab25, 0x49301542, 0x8d7cf72b}
diff --git a/test_vectors/ecdsa_test_vector_randomized.h b/test_vectors/ecdsa_test_vector_randomized.h
new file mode 100644
index 0000000..1f334c2
--- /dev/null
+++ b/test_vectors/ecdsa_test_vector_randomized.h
@@ -0,0 +1,29 @@
+/* Generated automatically, do not edit. */
+
+#define ECDSA_P256_D_RANDOM_INIT \
+	{0x503e58af, 0xcf3af333, 0x7bf53409, 0x445012b8, \
+	 0x1fb4548c, 0x4d2b1302, 0xa617f9b1, 0xd00d7de0}
+
+#define ECDSA_P256_QX_RANDOM_INIT \
+	{0xd9349b48, 0xb0ee8c36, 0x139b1d78, 0xedc1911b, \
+	 0x250877fb, 0x7ad44ae1, 0x8bb2f8a6, 0xf4dccae5}
+
+#define ECDSA_P256_QY_RANDOM_INIT \
+	{0xe0a056e8, 0xb5092a6c, 0x728f4bc8, 0x9043fbe2, \
+	 0x7c262744, 0xdfbdaf17, 0x83fc2e01, 0xb060771b}
+
+#define ECDSA_P384_D_RANDOM_INIT \
+	{0xbb4a09d8, 0xe527f062, 0x665eb4d0, 0xb39a1dce, \
+	 0x94daa2c5, 0x935b3163, 0xb090f079, 0x3ee36a5a, \
+	 0x62a7055d, 0x5d850d1b, 0x15dc656d, 0x65d8b3a2}
+
+#define ECDSA_P384_QX_RANDOM_INIT \
+	{0xb8023e60, 0xae42ea14, 0x6330f2c3, 0x883096ca, \
+	 0x961b11f5, 0xdfea4a1d, 0x65e98e64, 0xb33527fb, \
+	 0x57226db4, 0x8a514325, 0x1ef06bde, 0xf78eb57a}
+
+#define ECDSA_P384_QY_RANDOM_INIT \
+	{0xd3c66469, 0x625aaaad, 0xa347e103, 0x3c1474b6, \
+	 0x343a9368, 0xff294501, 0xbbd53b1c, 0xc49fd972, \
+	 0xda439c9d, 0xa827e230, 0x2720e2b7, 0x8dd14f9e}
+
diff --git a/test_vectors/ecdsa_test_vector_randomized.vh b/test_vectors/ecdsa_test_vector_randomized.vh
new file mode 100644
index 0000000..6c5cf80
--- /dev/null
+++ b/test_vectors/ecdsa_test_vector_randomized.vh
@@ -0,0 +1,29 @@
+/* Generated automatically, do not edit. */
+
+localparam [255:0] ECDSA_P256_D_RANDOM  =
+	{32'h503e58af, 32'hcf3af333, 32'h7bf53409, 32'h445012b8, 
+	 32'h1fb4548c, 32'h4d2b1302, 32'ha617f9b1, 32'hd00d7de0};
+
+localparam [255:0] ECDSA_P256_QX_RANDOM =
+	{32'hd9349b48, 32'hb0ee8c36, 32'h139b1d78, 32'hedc1911b, 
+	 32'h250877fb, 32'h7ad44ae1, 32'h8bb2f8a6, 32'hf4dccae5};
+
+localparam [255:0] ECDSA_P256_QY_RANDOM =
+	{32'he0a056e8, 32'hb5092a6c, 32'h728f4bc8, 32'h9043fbe2, 
+	 32'h7c262744, 32'hdfbdaf17, 32'h83fc2e01, 32'hb060771b};
+
+localparam [383:0] ECDSA_P384_D_RANDOM  =
+	{32'hbb4a09d8, 32'he527f062, 32'h665eb4d0, 32'hb39a1dce, 
+	 32'h94daa2c5, 32'h935b3163, 32'hb090f079, 32'h3ee36a5a, 
+	 32'h62a7055d, 32'h5d850d1b, 32'h15dc656d, 32'h65d8b3a2};
+
+localparam [383:0] ECDSA_P384_QX_RANDOM =
+	{32'hb8023e60, 32'hae42ea14, 32'h6330f2c3, 32'h883096ca, 
+	 32'h961b11f5, 32'hdfea4a1d, 32'h65e98e64, 32'hb33527fb, 
+	 32'h57226db4, 32'h8a514325, 32'h1ef06bde, 32'hf78eb57a};
+
+localparam [383:0] ECDSA_P384_QY_RANDOM =
+	{32'hd3c66469, 32'h625aaaad, 32'ha347e103, 32'h3c1474b6, 
+	 32'h343a9368, 32'hff294501, 32'hbbd53b1c, 32'hc49fd972, 
+	 32'hda439c9d, 32'ha827e230, 32'h2720e2b7, 32'h8dd14f9e};
+
diff --git a/test_vectors/format_random_test_vector.py b/test_vectors/format_random_test_vector.py
new file mode 100644
index 0000000..861bab5
--- /dev/null
+++ b/test_vectors/format_random_test_vector.py
@@ -0,0 +1,327 @@
+#
+# format_random_test_vector.py
+# ------------------------------------------
+# Formats test vector for ecdsa_fpga_model
+#
+# Author: Pavel Shatov
+# Copyright (c) 2017-2018, NORDUnet A/S
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+#
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+#
+# - Neither the name of the NORDUnet nor the names of its contributors may
+#   be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+
+#
+USAGE = "USAGE: format_random_test_vector.py [openssl_binary]"
+#
+
+
+#
+# This script reads the test vector generated by the script
+# regenerate_random_test_vector.py and writes nicely formatted C header file
+# and Verilog include file.
+#
+
+
+#
+# imports
+#
+import sys
+import subprocess
+
+
+#
+# list of curve names of interest
+#
+CURVE_P256 = "p256"
+CURVE_P384 = "p384"
+
+
+#
+# variables
+#
+OPENSSL = ""
+
+
+#
+# format one test vector
+#
+def format_c_header(f, curve, da, qax, qay):
+
+    if curve == CURVE_P256: curve_str = "ECDSA_P256"
+    if curve == CURVE_P384: curve_str = "ECDSA_P384"
+    
+        # write all numbers in vector
+    format_c_array(f, da,  "#define " + curve_str + "_D_RANDOM_INIT"  + " \\\n")
+    format_c_array(f, qax, "#define " + curve_str + "_QX_RANDOM_INIT" + " \\\n")
+    format_c_array(f, qay, "#define " + curve_str + "_QY_RANDOM_INIT" + " \\\n")
+    
+    
+#
+# format one test vector
+#
+def format_verilog_include(f, curve, da, qax, qay):
+
+    if curve == CURVE_P256: curve_str, msb_index = "ECDSA_P256", "255"
+    if curve == CURVE_P384: curve_str, msb_index = "ECDSA_P384", "383"
+
+        # write all numbers in vector
+    format_verilog_concatenation(f, da,   "localparam [" + msb_index + ":0] " + curve_str + "_D_RANDOM " + " =\n")
+    format_verilog_concatenation(f, qax,  "localparam [" + msb_index + ":0] " + curve_str + "_QX_RANDOM" + " =\n")
+    format_verilog_concatenation(f, qay,  "localparam [" + msb_index + ":0] " + curve_str + "_QY_RANDOM" + " =\n")
+
+
+#
+# nicely format multi-word integer into C array initializer
+#
+def format_c_array(f, n, s):
+
+        # print '#define XYZ \'
+    f.write(s)
+
+        # convert number to hex string and prepend it with zeroes if necessary
+    n_hex = hex(n).lstrip("0x").rstrip("L")
+    while (len(n_hex) % 8) > 0:
+        n_hex = "0" + n_hex
+    
+        # get number of 32-bit words
+    num_words = len(n_hex) // 8
+
+        # print all words in n
+    w = 0
+    while w < num_words:
+    
+        n_part = ""
+
+            # add tab for every new line
+        if w == 0:
+            n_part += "\t{"
+        elif (w % 4) == 0:
+            n_part += "\t "
+            
+            # add current word
+        n_part += "0x" + n_hex[8 * w : 8 * (w + 1)]
+        
+            # add separator or newline
+        if (w + 1) == num_words:
+            n_part += "}\n"
+        else:
+            n_part += ", "
+            if (w % 4) == 3:
+                n_part += "\\\n"        
+                
+        w += 1
+        
+            # write current part
+        f.write(n_part)
+    
+        # write final newline
+    f.write("\n")
+
+
+def format_verilog_concatenation(f, n, s):
+
+        # print 'localparam ZZZ ='
+    f.write(s)
+    
+        # convert number to hex string and prepend it with zeroes if necessary
+    n_hex = hex(n).split("0x")[1]
+    while (len(n_hex) % 8) > 0:
+        n_hex = "0" + n_hex
+    
+        # get number of 32-bit words
+    num_words = len(n_hex) // 8
+
+        # print all words in n
+    w = 0
+    while w < num_words:
+    
+        n_part = ""
+        
+        if w == 0:
+            n_part += "\t{"
+        elif (w % 4) == 0:
+            n_part += "\t "
+            
+        n_part += "32'h" + n_hex[8 * w : 8 * (w + 1)]
+        
+        if (w + 1) == num_words:
+            n_part += "};\n"
+        else:
+            n_part += ", "
+            if (w % 4) == 3:
+                n_part += "\n"      
+        w += 1
+        
+        f.write(n_part)
+    
+    f.write("\n")
+
+
+    #
+    # returns d, qx, qy, where
+    # d is private key and qx, qy is the corresponding public key
+    #
+def get_key(openssl, party, curve):
+
+        # generate private key filename
+    key_file = party + "_" + curve + ".key"
+        
+        # retrieve key components using openssl
+    openssl_command = [openssl, "ec", "-in", key_file, "-noout", "-text"] 
+    openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+    stdout_lines = openssl_stdout.splitlines()
+    
+    found_priv = False
+    found_pub = False
+    
+    key_priv = ""
+    key_pub = ""
+    
+        # process lines looking for "priv:" and "pub:" markers
+    for line in stdout_lines:
+    
+            # found private key marker?
+        if line.strip() == "priv:":
+            found_priv = True
+            found_pub = False
+            continue
+            
+            # found public key marker?
+        if line.strip() == "pub:":      # openssl 1.0.2g prints 'pub: ' (extra space before newline),
+            found_pub = True            # so we need to compare against line.strip(), not just line
+            found_priv = False
+            continue
+            
+            # found part of private key?
+        if found_priv:
+            if not line.startswith(" "):
+                found_priv = False
+                continue
+            else:
+                key_priv += line.strip()
+            
+            # found part of public key?
+        if found_pub:
+            if not line.startswith(" "):
+                found_pub = False
+                continue
+            else:
+                key_pub += line.strip()
+            
+        # do some cleanup and sanity checking on private key
+        #  * remove extra leading zero byte if present
+        #  * remove colons
+        #  * check length (256 bits or 384 bits)
+    while key_priv.startswith("00"):
+        key_priv = key_priv[2:]
+        
+    key_priv = key_priv.replace(":", "")
+        
+    if curve == CURVE_P256 and len(key_priv) != 256 / 4: sys.exit()
+    if curve == CURVE_P384 and len(key_priv) != 384 / 4: sys.exit()
+    
+        # do some cleanup and sanity checking on public key
+        #  * make sure, that uncompressed form marker (0x04) is present and
+        #    then remove it
+        #  * remove colons
+        #  * check length (2x256 or 2x384 bits)
+    if not key_pub.startswith("04"): sys.exit()
+        
+    key_pub = key_pub[2:]
+    key_pub = key_pub.replace(":", "")
+
+    if curve == CURVE_P256 and len(key_pub) != 2 * 256 / 4: sys.exit()
+    if curve == CURVE_P384 and len(key_pub) != 2 * 384 / 4: sys.exit()
+    
+        # split public key into parts
+    if curve == CURVE_P256:
+        key_pub_x = key_pub[  0: 64]
+        key_pub_y = key_pub[ 64:128]
+    
+    if curve == CURVE_P384:
+        key_pub_x = key_pub[  0: 96]
+        key_pub_y = key_pub[ 96:192]
+    
+        # convert from strings to integers
+    key_priv = int(key_priv, 16)
+    key_pub_x = int(key_pub_x, 16)
+    key_pub_y = int(key_pub_y, 16)
+            
+        # done
+    return key_priv, key_pub_x, key_pub_y
+
+    
+if __name__ == "__main__":
+
+    # detect whether user requested some specific binary
+    if len(sys.argv) == 1:
+        OPENSSL = "openssl"
+        print("Using system OpenSSL library.")
+    elif len(sys.argv) == 2:
+        OPENSSL = sys.argv[1]
+        print("Using OpenSSL binary '" + OPENSSL + "'...")
+    else:
+        print(USAGE)
+    
+    if len(OPENSSL) > 0:
+    
+        # list of curves to process
+        curves = [CURVE_P256, CURVE_P384]
+    
+        # open output files
+        file_h = open('ecdsa_test_vector_randomized.h',  'w')
+        file_v = open('ecdsa_test_vector_randomized.vh', 'w')
+    
+        # write headers
+        file_h.write("/* Generated automatically, do not edit. */\n\n")
+        file_v.write("/* Generated automatically, do not edit. */\n\n")
+    
+        # process all the keys
+        for next_curve in curves:
+    
+            # load keys
+            da, qax, qay = get_key(OPENSSL, "charlie", next_curve)
+    
+            # format numbers and write to files
+
+            format_c_header(file_h, next_curve,
+                da, qax, qay);
+    
+            format_verilog_include(file_v, next_curve,
+                da, qax, qay);
+    
+        # done
+        file_h.close()
+        file_v.close()
+    
+        # everything went just fine
+        print("Test vector formatted.")
+
+#
+# End of file
+#
diff --git a/test_vectors/regenerate_random_test_vector.py b/test_vectors/regenerate_random_test_vector.py
new file mode 100644
index 0000000..75cb761
--- /dev/null
+++ b/test_vectors/regenerate_random_test_vector.py
@@ -0,0 +1,91 @@
+#
+# regenerate_test_vector.py
+# -----------------------------------------------------------
+# Generates a new randomized test vector for ecdsa_fpga_model
+#
+# Author: Pavel Shatov
+# Copyright (c) 2017-2018, NORDUnet A/S
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+#
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+#
+# - Neither the name of the NORDUnet nor the names of its contributors may
+#   be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+#
+USAGE = "USAGE: regenerate_test_vector.py [openssl_binary]"
+#
+
+# This script generates a test vector. The test vector contains two
+# private keys. One is for P-256, the other one is for P-384.
+#
+
+#
+# imports
+#
+import sys
+import subprocess
+
+OPENSSL = ""
+
+CURVE_P256 = "prime256v1"
+CURVE_P384 = "secp384r1"
+
+SECRET_CHARLIE_256 = "charlie_p256.key"
+SECRET_CHARLIE_384 = "charlie_p384.key"
+
+def openssl_ecparam_genkey(openssl, curve, file):
+    subprocess.call([openssl, "ecparam", "-genkey", "-name", curve, "-out", file])
+
+#
+# __main__
+#
+if __name__ == "__main__":
+
+    # detect whether user requested some specific binary
+    if len(sys.argv) == 1:
+        OPENSSL = "openssl"
+        print("Using system OpenSSL library.")
+    elif len(sys.argv) == 2:
+        OPENSSL = sys.argv[1]
+        print("Using OpenSSL binary '" + OPENSSL + "'...")
+    else:
+        print(USAGE)
+
+    if len(OPENSSL) > 0:
+        
+        # generate a new private key for P-256 curve
+        openssl_ecparam_genkey(OPENSSL, CURVE_P256, SECRET_CHARLIE_256)
+    
+        # generate a new private key for P-384 curve
+        openssl_ecparam_genkey(OPENSSL, CURVE_P384, SECRET_CHARLIE_384)
+        
+        # done
+        print("New randomized test vector generated.")
+
+
+#
+# End of file
+#
author	Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru>	2018-12-19 16:03:08 +0300
committer	Pavel V. Shatov (Meister) <meisterpaul1@yandex.ru>	2018-12-19 16:03:08 +0300
commit	1f8d13bf8d2e813f0c5da653c4abffb7a817db9a (patch)
tree	7b6290a838f460a9d104f28a32de08be8bcf8605
parent	cae8718217846cfaefcbfecd55f9a117731a8d99 (diff)