aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2018-02-26 13:05:17 +0300
committerPavel V. Shatov (Meister) <meisterpaul1@yandex.ru>2018-02-26 13:05:17 +0300
commit71fba5c6662d6bbe74366c03b531a5f86b25bbbb (patch)
tree6e15bce58eadd3a5d13535bd23f083f2bfb5cdd2
Initial commit of C reference model for ECDH cores.
-rw-r--r--Makefile24
-rw-r--r--README.md18
-rw-r--r--ecdh_fpga_model.cpp264
-rw-r--r--ecdh_fpga_model.h177
-rw-r--r--fpga_curve.cpp352
-rw-r--r--fpga_curve.h64
-rw-r--r--fpga_lowlevel.cpp137
-rw-r--r--fpga_lowlevel.h80
-rw-r--r--fpga_modular.cpp838
-rw-r--r--fpga_modular.h87
-rw-r--r--fpga_util.cpp86
-rw-r--r--fpga_util.h49
-rw-r--r--test_vectors/alice_p256.key8
-rw-r--r--test_vectors/alice_p384.key9
-rw-r--r--test_vectors/bob_p256.key8
-rw-r--r--test_vectors/bob_p384.key9
-rw-r--r--test_vectors/ecdh_test_vectors.h74
-rw-r--r--test_vectors/format_test_vectors.py450
-rw-r--r--test_vectors/regenerate_test_vectors.py71
19 files changed, 2805 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1740318
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,24 @@
+CC=gcc
+CFLAGS=-c
+LDFLAGS=
+
+ecdh_model_fpga: ecdh_model_fpga.o fpga_curve.o fpga_modular.o fpga_lowlevel.o fpga_util.o
+ $(CC) $(LDFLAGS) ecdh_model_fpga.o fpga_curve.o fpga_modular.o fpga_lowlevel.o fpga_util.o -o ecdh_model_fpga
+
+ecdh_model_fpga.o: ecdh_model_fpga.cpp ecdh_model_fpga.h fpga_curve.h fpga_modular.h fpga_lowlevel.h fpga_util.h
+ $(CC) $(CFLAGS) ecdh_model_fpga.cpp
+
+fpga_curve.o: fpga_curve.cpp ecdh_model_fpga.h fpga_lowlevel.h fpga_modular.h fpga_curve.h fpga_util.h
+ $(CC) $(CFLAGS) fpga_curve.cpp
+
+fpga_modular.o: fpga_modular.cpp ecdh_model_fpga.h fpga_lowlevel.h fpga_modular.h
+ $(CC) $(CFLAGS) fpga_modular.cpp
+
+fpga_lowlevel.o: fpga_lowlevel.cpp ecdh_model_fpga.h fpga_lowlevel.h
+ $(CC) $(CFLAGS) fpga_lowlevel.cpp
+
+fpga_util.o: fpga_util.cpp ecdh_model_fpga.h fpga_lowlevel.h fpga_util.h
+ $(CC) $(CFLAGS) fpga_util.cpp
+
+clean:
+ rm -f ecdh_model_fpga *.o
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..28306a9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# ecdh_model_fpga
+
+This reference model was written to help debug Verilog code, it mimics how an FPGA would do elliptic curve point scalar multiplication for ECDH using curves P-256 and P-384. Note, that the model may do weird (from CPU point of view, of course) things at times. Another important thing is that while FPGA modules are actually written to operate in constant-time manner, this model itself doesn't take any active measures to keep run-time constant. Do **NOT** use it in production as-is!
+
+The model is split into 4 layers:
+
+ * Low-level primitives (32- and 48-bit adders, 32-bit subtractor, 16x16-bit multiplier, 48-bit accumulator)
+ * Utility routines (copier, comparator)
+ * Modular arithmetic (adder, subtractor, multiplier, invertor)
+ * EC arithmetic (adder, doubler, multiplier)
+
+Modular multiplier and invertor use complex algorithms and are thus further split into "helper" sub-routines.
+
+This model uses tips and tricks from the following sources:
+1. [Guide to Elliptic Curve Cryptography](http://diamond.boisestate.edu/~liljanab/MATH308/GuideToECC.pdf)
+2. [Ultra High Performance ECC over NIST Primes
+on Commercial FPGAs](https://www.iacr.org/archive/ches2008/51540064/51540064.pdf)
+3. [Constant Time Modular Inversion](http://joppebos.com/files/CTInversion.pdf)
diff --git a/ecdh_fpga_model.cpp b/ecdh_fpga_model.cpp
new file mode 100644
index 0000000..730292e
--- /dev/null
+++ b/ecdh_fpga_model.cpp
@@ -0,0 +1,264 @@
+//------------------------------------------------------------------------------
+//
+// ecdh_fpga_model.cpp
+// --------------------------------------------
+// Curve point scalar multiplier model for ECDH
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017-2018, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "ecdh_fpga_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_modular.h"
+#include "fpga_curve.h"
+#include "fpga_util.h"
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+bool test_point_multiplier (FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+bool abuse_point_multiplier (FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+
+void print_fpga_buffer (const char *s, FPGA_BUFFER *v);
+bool compare_fpga_buffers (FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *bx, FPGA_BUFFER *by);
+
+
+//------------------------------------------------------------------------------
+int main()
+//------------------------------------------------------------------------------
+{
+ bool ok_a, ok_b; // flags
+
+ //
+ // initialize buffers
+ //
+ fpga_modular_init();
+ fpga_curve_init();
+
+
+ //
+ // test point multiplier: QA = dA * G
+ // QB = dB * G
+ //
+ printf("Trying to derive public keys from private keys...\n\n");
+ ok_a = test_point_multiplier(&ecdsa_g_x, &ecdsa_g_y, &ecdh_da, &ecdh_qa_x, &ecdh_qa_y);
+ ok_b = test_point_multiplier(&ecdsa_g_x, &ecdsa_g_y, &ecdh_db, &ecdh_qb_x, &ecdh_qb_y);
+ if (!ok_a || !ok_b) return EXIT_FAILURE;
+
+
+ //
+ // test point multiplier: S = dA * QB
+ // S = dB * QA
+ //
+ printf("Trying to derive shared secret key...\n\n");
+ ok_a = test_point_multiplier(&ecdh_qa_x, &ecdh_qa_y, &ecdh_db, &ecdh_s_x, &ecdh_s_y);
+ ok_b = test_point_multiplier(&ecdh_qb_x, &ecdh_qb_y, &ecdh_da, &ecdh_s_x, &ecdh_s_y);
+ if (!ok_a || !ok_b) return EXIT_FAILURE;
+
+
+ //
+ // test point multiplier: O = 0 * QA
+ // O = 0 * QB
+ //
+ printf("Trying to multiply public keys by zero...\n\n");
+ ok_a = test_point_multiplier(&ecdh_qa_x, &ecdh_qa_y, &ecdsa_zero, &ecdsa_zero, &ecdsa_zero);
+ ok_b = test_point_multiplier(&ecdh_qb_x, &ecdh_qb_y, &ecdsa_zero, &ecdsa_zero, &ecdsa_zero);
+ if (!ok_a || !ok_b) return EXIT_FAILURE;
+
+
+ //
+ // test point multiplier: QA = 1 * QA
+ // QB = 1 * QB
+ //
+ printf("Trying to multiply public keys by one...\n\n");
+ ok_a = test_point_multiplier(&ecdh_qa_x, &ecdh_qa_y, &ecdsa_one, &ecdh_qa_x, &ecdh_qa_y);
+ ok_b = test_point_multiplier(&ecdh_qb_x, &ecdh_qb_y, &ecdsa_one, &ecdh_qb_x, &ecdh_qb_y);
+ if (!ok_a || !ok_b) return EXIT_FAILURE;
+
+
+ //
+ // abuse point multiplier
+ //
+ ok_a = abuse_point_multiplier(&ecdh_qa_x, &ecdh_qa_y);
+ ok_b = abuse_point_multiplier(&ecdh_qb_x, &ecdh_qb_y);
+ if (!ok_a || !ok_b) return EXIT_FAILURE;
+
+
+ //
+ // everything went just fine
+ //
+ return EXIT_SUCCESS;
+}
+
+
+//------------------------------------------------------------------------------
+bool test_point_multiplier(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+//
+// (px,py) - multiplicand
+// k - multiplier
+//
+// qx, qy - expected coordinates of product
+//
+// Returns true when point (rx,ry) = k * P matches the point (qx,qy).
+//
+//------------------------------------------------------------------------------
+{
+ bool ok; // flag
+ FPGA_BUFFER rx, ry; // result
+
+ /* run the model */
+ fpga_curve_scalar_multiply(px, py, k, &rx, &ry);
+
+ /* handle result */
+ ok = compare_fpga_buffers(qx, qy, &rx, &ry);
+ if (!ok)
+ { printf("\n ERROR\n\n");
+ return false;
+ }
+ else printf("\n OK\n\n");
+
+ // everything went just fine
+ return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool abuse_point_multiplier(FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+//
+// This routine tries to abuse the curve point multiplier by triggering the
+// rarely used code path where the internal adder has to add two identical
+// points.
+//
+//------------------------------------------------------------------------------
+{
+ bool ok; // flag
+
+ // obtain quantity n + 2
+ FPGA_BUFFER two, n2;
+ fpga_modular_add(&ecdsa_one, &ecdsa_one, &two); // n1 = n + 1
+ fpga_modular_add(&ecdsa_n, &two, &n2); // n2 = n1 + 1 = n + 2
+
+ printf("Trying to abuse point multiplier...\n\n");
+
+ // we first calculate 2 * Q
+ FPGA_BUFFER q2a_x, q2a_y;
+ fpga_curve_scalar_multiply(qx, qy, &two, &q2a_x, &q2a_y);
+
+ // we now calculate (n + 2) * Q
+ FPGA_BUFFER q2b_x, q2b_y;
+ fpga_curve_scalar_multiply(qx, qy, &n2, &q2b_x, &q2b_y);
+
+ // both calculations should produce the same point (Q2a == Q2b)
+ ok = compare_fpga_buffers(&q2a_x, &q2a_y, &q2b_x, &q2b_y);
+ if (! ok)
+ { printf("\n ERROR\n\n");
+ return false;
+ }
+ else printf("\n OK\n\n");
+
+ // everything went just fine
+ return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool compare_fpga_buffers(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *bx, FPGA_BUFFER *by)
+//------------------------------------------------------------------------------
+//
+// Compare affine coordinates of two points and return true when they match.
+//
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ // print all the values
+ print_fpga_buffer(" Expected: X = ", ax);
+ print_fpga_buffer(" Calculated: X = ", bx);
+ printf("\n");
+ print_fpga_buffer(" Expected: Y = ", ay);
+ print_fpga_buffer(" Calculated: Y = ", by);
+
+ // compare values
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ {
+ // compare x
+ if (ax->words[w] != bx->words[w]) return false;
+
+ // compare y
+ if (ay->words[w] != by->words[w]) return false;
+ }
+
+ // values are the same
+ return true;
+}
+
+
+//------------------------------------------------------------------------------
+void print_fpga_buffer(const char *s, FPGA_BUFFER *buf)
+//------------------------------------------------------------------------------
+//
+// Pretty print large multi-word integer.
+//
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ // print header
+ printf("%s", s);
+
+ // print all bytes
+ for (w=OPERAND_NUM_WORDS; w>0; w--)
+ {
+ printf("%08x", buf->words[w-1]);
+
+ // insert space after every group of 4 bytes
+ if (w > 1) printf(" ");
+ }
+
+ // print footer
+ printf("\n");
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdh_fpga_model.h b/ecdh_fpga_model.h
new file mode 100644
index 0000000..6b66c99
--- /dev/null
+++ b/ecdh_fpga_model.h
@@ -0,0 +1,177 @@
+//------------------------------------------------------------------------------
+//
+// ecdh_fpga_model.h
+// --------------------------------------------
+// Base point scalar multiplier model for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2017-2018, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include "test_vectors/ecdh_test_vectors.h"
+
+
+//------------------------------------------------------------------------------
+//
+// Curve Selection
+//
+// USE_CURVE == 1 -> P-256
+// USE_CURVE == 2 -> P-384
+//
+//------------------------------------------------------------------------------
+#define USE_CURVE 2
+
+
+//------------------------------------------------------------------------------
+// Model Parameters
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+#define OPERAND_WIDTH (256) // largest supported operand width in bits
+#elif USE_CURVE == 2
+#define OPERAND_WIDTH (384) // largest supported operand width in bits
+#else
+#error USE_CURVE must be either 1 or 2!
+#endif
+
+
+//------------------------------------------------------------------------------
+// P-256 Parameters and Test Vectors
+//------------------------------------------------------------------------------
+
+/* Field Size */
+#define P_256_Q {0xffffffff, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}
+
+/* Generic Numbers */
+#define P_256_ZERO {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+#define P_256_ONE {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001}
+
+/* Division Factor */
+#define P_256_DELTA {0x7fffffff, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x80000000, 0x00000000, 0x00000000}
+
+/* Base Point */
+#define P_256_G_X {0x6b17d1f2, 0xe12c4247, 0xf8bce6e5, 0x63a440f2, 0x77037d81, 0x2deb33a0, 0xf4a13945, 0xd898c296}
+#define P_256_G_Y {0x4fe342e2, 0xfe1a7f9b, 0x8ee7eb4a, 0x7c0f9e16, 0x2bce3357, 0x6b315ece, 0xcbb64068, 0x37bf51f5}
+
+/* Base Point Order */
+#define P_256_N {0xffffffff, 0x00000000, 0xffffffff, 0xffffffff, 0xbce6faad, 0xa7179e84, 0xf3b9cac2, 0xfc632551}
+
+
+//------------------------------------------------------------------------------
+// P-384 Parameters and Test Vectors
+//------------------------------------------------------------------------------
+
+/* Field Size */
+#define P_384_Q {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff}
+
+/* Generic Numbers */
+#define P_384_ZERO {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+#define P_384_ONE {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001}
+
+/* Division Factor */
+#define P_384_DELTA {0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff, 0x80000000, 0x00000000, 0x80000000}
+
+/* Base Point */
+#define P_384_G_X {0xaa87ca22, 0xbe8b0537, 0x8eb1c71e, 0xf320ad74, 0x6e1d3b62, 0x8ba79b98, 0x59f741e0, 0x82542a38, 0x5502f25d, 0xbf55296c, 0x3a545e38, 0x72760ab7}
+#define P_384_G_Y {0x3617de4a, 0x96262c6f, 0x5d9e98bf, 0x9292dc29, 0xf8f41dbd, 0x289a147c, 0xe9da3113, 0xb5f0b8c0, 0x0a60b1ce, 0x1d7e819d, 0x7a431d7c, 0x90ea0e5f}
+
+/* Base Point Order */
+#define P_384_N {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xc7634d81, 0xf4372ddf, 0x581a0db2, 0x48b0a77a, 0xecec196a, 0xccc52973}
+
+
+//------------------------------------------------------------------------------
+// Parameter and Test Vector Selection
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define ECDSA_Q P_256_Q
+
+#define ECDSA_ZERO P_256_ZERO
+#define ECDSA_ONE P_256_ONE
+
+#define ECDSA_DELTA P_256_DELTA
+
+#define ECDSA_G_X P_256_G_X
+#define ECDSA_G_Y P_256_G_Y
+
+#define ECDH_DA P_256_DA
+#define ECDH_DB P_256_DB
+
+#define ECDH_QA_X P_256_QA_X
+#define ECDH_QA_Y P_256_QA_Y
+
+#define ECDH_QB_X P_256_QB_X
+#define ECDH_QB_Y P_256_QB_Y
+
+#define ECDH_S_X P_256_S_X
+#define ECDH_S_Y P_256_S_Y
+
+#define ECDSA_N P_256_N
+
+#elif USE_CURVE == 2
+
+#define ECDSA_Q P_384_Q
+
+#define ECDSA_ZERO P_384_ZERO
+#define ECDSA_ONE P_384_ONE
+
+#define ECDSA_DELTA P_384_DELTA
+
+#define ECDSA_G_X P_384_G_X
+#define ECDSA_G_Y P_384_G_Y
+
+#define ECDH_DA P_384_DA
+#define ECDH_DB P_384_DB
+
+#define ECDH_QA_X P_384_QA_X
+#define ECDH_QA_Y P_384_QA_Y
+
+#define ECDH_QB_X P_384_QB_X
+#define ECDH_QB_Y P_384_QB_Y
+
+#define ECDH_S_X P_384_S_X
+#define ECDH_S_Y P_384_S_Y
+
+#define ECDSA_N P_384_N
+
+#else
+
+#error USE_CURVE must be either 1 or 2!
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_curve.cpp b/fpga_curve.cpp
new file mode 100644
index 0000000..46f6f73
--- /dev/null
+++ b/fpga_curve.cpp
@@ -0,0 +1,352 @@
+//------------------------------------------------------------------------------
+//
+// fpga_curve.cpp
+// ------------------------------------
+// Elliptic curve arithmetic procedures
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdh_fpga_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_modular.h"
+#include "fpga_curve.h"
+#include "fpga_util.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ecdsa_n;
+FPGA_BUFFER ecdsa_g_x, ecdsa_g_y;
+FPGA_BUFFER ecdh_d_x, ecdh_d_y;
+FPGA_BUFFER ecdh_da, ecdh_db;
+FPGA_BUFFER ecdh_qa_x, ecdh_qa_y;
+FPGA_BUFFER ecdh_qb_x, ecdh_qb_y;
+FPGA_BUFFER ecdh_s_x, ecdh_s_y;
+
+
+//------------------------------------------------------------------------------
+void fpga_curve_init()
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ FPGA_BUFFER tmp_n = ECDSA_N;
+ FPGA_BUFFER tmp_g_x = ECDSA_G_X, tmp_g_y = ECDSA_G_Y;
+ FPGA_BUFFER tmp_da = ECDH_DA, tmp_db = ECDH_DB;
+ FPGA_BUFFER tmp_qa_x = ECDH_QA_X, tmp_qa_y = ECDH_QA_Y;
+ FPGA_BUFFER tmp_qb_x = ECDH_QB_X, tmp_qb_y = ECDH_QB_Y;
+ FPGA_BUFFER tmp_s_x = ECDH_S_X, tmp_s_y = ECDH_S_Y;
+
+ /* fill buffers for large multi-word integers */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ { ecdsa_n.words[w] = tmp_n.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdsa_g_x.words[w] = tmp_g_x.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdsa_g_y.words[w] = tmp_g_y.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_da.words[w] = tmp_da.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_db.words[w] = tmp_db.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_qa_x.words[w] = tmp_qa_x.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_qa_y.words[w] = tmp_qa_y.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_qb_x.words[w] = tmp_qb_x.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_qb_y.words[w] = tmp_qb_y.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_s_x.words[w] = tmp_s_x.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdh_s_y.words[w] = tmp_s_y.words[OPERAND_NUM_WORDS - (w+1)];
+ }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point doubling routine.
+//
+// R(rx,ry,rz) = 2 * P(px,py,pz)
+//
+// Note, that P(px,py,pz) is supposed to be in projective Jacobian coordinates,
+// R will be in projective Jacobian coordinates.
+//
+// This routine implements algorithm 3.21 from "Guide to Elliptic Curve
+// Cryptography", the only difference is that step 6. does T1 = T2 + T2 and
+// then T2 = T2 + T1 instead of T2 = 3 * T2, because our addition is much
+// faster, than multiplication.
+//
+// Note, that this routine also handles one special case, namely when P is at
+// infinity.
+//
+// Instead of actual modular division, multiplication by pre-computed constant
+// (2^-1 mod q) is done.
+//
+// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
+// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
+// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
+// then fpga_modular_mul(pz, &t1, &t1) instead.
+//
+// WARNING: Though this procedure always does doubling steps, it does not take
+// any active measures to keep run-time constant. The main purpose of this
+// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere
+// near production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+ FPGA_BUFFER t1, t2, t3; // temporary variables
+
+ // check, whether P is at infinity
+ bool pz_is_zero = fpga_buffer_is_zero(pz);
+
+ /* 2. */ fpga_buffer_copy(pz, &t1);
+ fpga_modular_mul(pz, &t1, &t1);
+ /* 3. */ fpga_modular_sub(px, &t1, &t2);
+ /* 4. */ fpga_modular_add(px, &t1, &t1);
+ /* 5. */ fpga_modular_mul(&t1, &t2, &t2);
+ /* 6. */ fpga_modular_add(&t2, &t2, &t1);
+ /* */ fpga_modular_add(&t1, &t2, &t2);
+ /* 7. */ fpga_modular_add(py, py, ry);
+ /* 8. */ fpga_modular_mul(pz, ry, rz);
+ /* 9. */ fpga_buffer_copy(ry, &t1);
+ fpga_buffer_copy(ry, &t3);
+ fpga_modular_mul(&t1, &t3, ry);
+ /* 10. */ fpga_modular_mul(px, ry, &t3);
+ /* 11. */ fpga_buffer_copy(ry, &t1);
+ fpga_modular_mul(ry, &t1, &t1);
+ /* 12. */ fpga_modular_mul(&t1, &ecdsa_delta, ry);
+ /* 13. */ fpga_buffer_copy(&t2, &t1);
+ fpga_modular_mul(&t1, &t2, rx);
+ /* 14. */ fpga_modular_add(&t3, &t3, &t1);
+ /* 15. */ fpga_modular_sub(rx, &t1, rx);
+ /* 16. */ fpga_modular_sub(&t3, rx, &t1);
+ /* 17. */ fpga_modular_mul(&t1, &t2, &t1);
+ /* 18. */ fpga_modular_sub(&t1, ry, ry);
+
+ // handle special case (input point is at infinity)
+ if (pz_is_zero)
+ { fpga_buffer_copy(&ecdsa_one, rx);
+ fpga_buffer_copy(&ecdsa_one, ry);
+ fpga_buffer_copy(&ecdsa_zero, rz);
+ }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point addition routine.
+//
+// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy)
+//
+// Note, that P(px, py, pz) is supposed to be in projective Jacobian
+// coordinates, while Q(qx,qy) is supposed to be in affine coordinates,
+// R(rx, ry, rz) will be in projective Jacobian coordinates.
+//
+// This routine implements algorithm 3.22 from "Guide to Elliptic Curve
+// Cryptography". Differences from the original algorithm:
+//
+// 1) Step 1. is omitted, because the user-supplied point Q is supposed
+// to be not at infinity.
+//
+// 2) Step 9.1 just returns the pre-computed double of the base point instead
+// of actually doubling it.
+//
+// Note, that this routine also handles three special cases:
+//
+// 1) P is at infinity
+// 2) P == Q
+// 3) P == -Q
+//
+// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
+// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
+// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
+// then fpga_modular_mul(pz, &t1, &t1) instead.
+//
+// WARNING: This procedure does not take any active measures to keep run-time
+// constant. The main purpose of this model is to help debug Verilog code for
+// FPGA, so *DO NOT* use is anywhere near production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+ FPGA_BUFFER t1, t2, t3, t4; // temporary variables
+
+ bool pz_is_zero = fpga_buffer_is_zero(pz); // Step 2.
+
+ /* 3. */ fpga_buffer_copy(pz, &t1);
+ fpga_modular_mul(pz, &t1, &t1);
+ /* 4. */ fpga_modular_mul(pz, &t1, &t2);
+ /* 5. */ fpga_modular_mul(&t1, qx, &t1);
+ /* 6. */ fpga_modular_mul(&t2, qy, &t2);
+ /* 7. */ fpga_modular_sub(&t1, px, &t1);
+ /* 8. */ fpga_modular_sub(&t2, py, &t2);
+
+ bool t1_is_zero = fpga_buffer_is_zero(&t1); // | Step 9.
+ bool t2_is_zero = fpga_buffer_is_zero(&t2); // |
+
+ /* 10. */ fpga_modular_mul(pz, &t1, rz);
+ /* 11. */ fpga_buffer_copy(&t1, &t3);
+ fpga_modular_mul(&t1, &t3, &t3);
+ /* 12. */ fpga_modular_mul(&t1, &t3, &t4);
+ /* 13. */ fpga_modular_mul(px, &t3, &t3);
+ /* 14. */ fpga_modular_add(&t3, &t3, &t1);
+ /* 15. */ fpga_buffer_copy(&t2, rx);
+ fpga_modular_mul(rx, &t2, rx);
+ /* 16. */ fpga_modular_sub(rx, &t1, rx);
+ /* 17. */ fpga_modular_sub(rx, &t4, rx);
+ /* 18. */ fpga_modular_sub(&t3, rx, &t3);
+ /* 19. */ fpga_modular_mul(&t2, &t3, &t3);
+ /* 20. */ fpga_modular_mul(py, &t4, &t4);
+ /* 21. */ fpga_modular_sub(&t3, &t4, ry);
+
+ //
+ // final selection
+ //
+ if (pz_is_zero) // P at infinity ?
+ { fpga_buffer_copy(qx, rx);
+ fpga_buffer_copy(qy, ry);
+ fpga_buffer_copy(&ecdsa_one, rz);
+ }
+ else if (t1_is_zero) // same x for P and Q ?
+ {
+ fpga_buffer_copy(t2_is_zero ? &ecdh_d_x : &ecdsa_one, rx); // | same y ? (P==Q => R=2*G) : (P==-Q => R=O)
+ fpga_buffer_copy(t2_is_zero ? &ecdh_d_y : &ecdsa_one, ry); // |
+ fpga_buffer_copy(t2_is_zero ? &ecdsa_one : &ecdsa_zero, rz); // |
+ }
+
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Conversion from projective Jacobian to affine coordinates.
+//
+// P(px,py,pz) -> Q(qx,qy)
+//
+// Note, that qx = px / Z^2 and qy = py / Z^3. Division in modular arithmetic
+// is equivalent to multiplication by the inverse value of divisor, so
+// qx = px * (pz^-1)^2 and qy = py * (pz^-1)^3.
+//
+// Note, that this procedure does *NOT* handle points at infinity correctly. It
+// can only be called from the base point multiplication routine, that
+// specifically makes sure that P is not at infinity, so pz will always be
+// non-zero value.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_point_to_affine(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+{
+ FPGA_BUFFER pz1; // inverse value of pz
+ FPGA_BUFFER t2, t3; // intermediate values
+
+ fpga_modular_inv(pz, &pz1); // pz1 = pz^-1 (mod q)
+
+ fpga_modular_mul(&pz1, &pz1, &t2); // t2 = pz1 ^ 2 (mod q)
+ fpga_modular_mul(&pz1, &t2, &t3); // t3 = tz1 ^ 3 (mod q)
+
+ fpga_modular_mul(px, &t2, qx); // qx = px * (pz^-1)^2 (mod q)
+ fpga_modular_mul(py, &t3, qy); // qy = py * (pz^-1)^3 (mod q)
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point scalar multiplication routine.
+//
+// Q(qx,qy) = k * P(px,py)
+//
+// Note, that Q is supposed to be in affine coordinates. Multiplication is done
+// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// WARNING: Though this procedure always does the addition step, it only
+// updates the result when current bit of k is set. It does not take any
+// active measures to keep run-time constant. The main purpose of this model
+// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near
+// production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_scalar_multiply(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+{
+ int word_count, bit_count; // counters
+
+ FPGA_BUFFER rx, ry, rz; // intermediate result
+ FPGA_BUFFER tx, ty, tz; // temporary variable
+
+ /* obtain quantity 2 * P */
+ fpga_curve_double_jacobian(px, py, &ecdsa_one, &tx, &ty, &tz);
+ fpga_curve_point_to_affine(&tx, &ty, &tz, &ecdh_d_x, &ecdh_d_y);
+
+ /* set initial value of R to point at infinity */
+ fpga_buffer_copy(&ecdsa_one, &rx);
+ fpga_buffer_copy(&ecdsa_one, &ry);
+ fpga_buffer_copy(&ecdsa_zero, &rz);
+
+ /* process bits of k left-to-right */
+ for (word_count=OPERAND_NUM_WORDS; word_count>0; word_count--)
+ for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
+ {
+ /* calculate T = 2 * R */
+ fpga_curve_double_jacobian(&rx, &ry, &rz, &tx, &ty, &tz);
+
+ /* always calculate R = T + P for constant-time */
+ fpga_curve_add_jacobian(&tx, &ty, &tz, px, py, &rx, &ry, &rz);
+
+ /* revert to the value of T before addition if the current bit of k is not set */
+ if (!((k->words[word_count-1] >> (bit_count-1)) & 1))
+ { fpga_buffer_copy(&tx, &rx);
+ fpga_buffer_copy(&ty, &ry);
+ fpga_buffer_copy(&tz, &rz);
+ }
+
+ }
+
+ // convert result to affine coordinates anyway
+ fpga_curve_point_to_affine(&rx, &ry, &rz, qx, qy);
+
+ // check, that rz is non-zero (not point at infinity)
+ bool rz_is_zero = fpga_buffer_is_zero(&rz);
+
+ // handle special case (result is point at infinity)
+ if (rz_is_zero)
+ { fpga_buffer_copy(&ecdsa_zero, qx);
+ fpga_buffer_copy(&ecdsa_zero, qy);
+ }
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_curve.h b/fpga_curve.h
new file mode 100644
index 0000000..db71e95
--- /dev/null
+++ b/fpga_curve.h
@@ -0,0 +1,64 @@
+//------------------------------------------------------------------------------
+//
+// fpga_curve.h
+// ------------------------------------
+// Elliptic curve arithmetic procedures
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ecdsa_n; // order of the base point
+extern FPGA_BUFFER ecdsa_g_x, ecdsa_g_y; // the base point
+extern FPGA_BUFFER ecdh_d_x, ecdh_d_y; // double of the multiplicand
+extern FPGA_BUFFER ecdh_da, ecdh_db; // private keys
+extern FPGA_BUFFER ecdh_qa_x, ecdh_qa_y; // Alice's public key
+extern FPGA_BUFFER ecdh_qb_x, ecdh_qb_y; // Bob's public key
+extern FPGA_BUFFER ecdh_s_x, ecdh_s_y; // shared secret
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_curve_init ();
+void fpga_curve_scalar_multiply (FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+void fpga_curve_add_jacobian (FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz);
+void fpga_curve_double_jacobian (FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz);
+void fpga_curve_point_to_affine (FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_lowlevel.cpp b/fpga_lowlevel.cpp
new file mode 100644
index 0000000..676d188
--- /dev/null
+++ b/fpga_lowlevel.cpp
@@ -0,0 +1,137 @@
+//------------------------------------------------------------------------------
+//
+// fpga_lowlevel.cpp
+// -----------------------------------
+// Models of Low-level FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdh_fpga_model.h"
+#include "fpga_lowlevel.h"
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 32-bit adder with carry input and carry output.
+//
+// Carries are 1 bit wide.
+//
+// {c_out, s} = x + y + c_in
+//
+//------------------------------------------------------------------------------
+void fpga_lowlevel_add32(FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out)
+//------------------------------------------------------------------------------
+{
+ // calculate sum
+ FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x + (FPGA_WORD_EXTENDED)y;
+
+ // process carry input
+ if (c_in) r++;
+
+ // store sum
+ *s = (FPGA_WORD)r;
+
+ // shift sum to obtain carry flag
+ r >>= FPGA_WORD_WIDTH;
+
+ // store carry
+ *c_out = (r != 0);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 32-bit subtractor with borrow input and borrow output.
+//
+// Borrows are 1 bit wide.
+//
+// {b_out, d} = x - y - b_in
+//
+void fpga_lowlevel_sub32(FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out)
+//------------------------------------------------------------------------------
+{
+ // calculate difference
+ FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x - (FPGA_WORD_EXTENDED)y;
+
+ // process borrow input
+ if (b_in) r--;
+
+ // store difference
+ *d = (FPGA_WORD)r;
+
+ // shift difference to obtain borrow flag
+ r >>= FPGA_WORD_WIDTH;
+
+ // store borrow
+ *b_out = (r != 0);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 16x16-bit multiplier.
+//
+// Inputs are 16-bit wide, outputs are 32-bit wide.
+//
+// p = x * y
+//
+void fpga_lowlevel_mul16(FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p)
+//------------------------------------------------------------------------------
+{
+ // multiply and store result
+ *p = (FPGA_WORD)x * (FPGA_WORD)y;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 48-bit adder without carries.
+//
+// s = (x + y)[47:0]
+//
+void fpga_lowlevel_add48(FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s)
+//------------------------------------------------------------------------------
+{
+ // add and store result truncated to 48 bits
+ *s = (x + y) & FPGA_MASK_ADDER48;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_lowlevel.h b/fpga_lowlevel.h
new file mode 100644
index 0000000..f3b6063
--- /dev/null
+++ b/fpga_lowlevel.h
@@ -0,0 +1,80 @@
+//------------------------------------------------------------------------------
+//
+// fpga_lowlevel.h
+// -----------------------------------
+// Models of Low-level FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// FPGA Pipeline Settings
+//------------------------------------------------------------------------------
+#define FPGA_WORD_WIDTH (32)
+#define OPERAND_NUM_WORDS (OPERAND_WIDTH / FPGA_WORD_WIDTH)
+
+
+//------------------------------------------------------------------------------
+// Word Types (Normal, Extended, Reduced)
+//------------------------------------------------------------------------------
+typedef uint32_t FPGA_WORD;
+typedef uint64_t FPGA_WORD_EXTENDED;
+typedef uint16_t FPGA_WORD_REDUCED;
+
+
+//------------------------------------------------------------------------------
+// Extended Adder Mask
+//------------------------------------------------------------------------------
+#define FPGA_MASK_ADDER48 ((FPGA_WORD_EXTENDED)0x0000FFFFFFFFFFFF)
+
+
+//------------------------------------------------------------------------------
+struct FPGA_BUFFER
+//------------------------------------------------------------------------------
+{
+ FPGA_WORD words[OPERAND_NUM_WORDS];
+};
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_lowlevel_add32 (FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out);
+void fpga_lowlevel_sub32 (FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out);
+void fpga_lowlevel_mul16 (FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p);
+void fpga_lowlevel_add48 (FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_modular.cpp b/fpga_modular.cpp
new file mode 100644
index 0000000..5f7f5b7
--- /dev/null
+++ b/fpga_modular.cpp
@@ -0,0 +1,838 @@
+//------------------------------------------------------------------------------
+//
+// fpga_modular.cpp
+// ---------------------------
+// Modular arithmetic routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdh_fpga_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_modular.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ecdsa_q;
+FPGA_BUFFER ecdsa_zero;
+FPGA_BUFFER ecdsa_one;
+FPGA_BUFFER ecdsa_delta;
+
+
+//------------------------------------------------------------------------------
+void fpga_modular_init()
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ FPGA_BUFFER tmp_q = ECDSA_Q;
+ FPGA_BUFFER tmp_zero = ECDSA_ZERO;
+ FPGA_BUFFER tmp_one = ECDSA_ONE;
+ FPGA_BUFFER tmp_delta = ECDSA_DELTA;
+
+ /* fill buffers for large multi-word integers */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ { ecdsa_q.words[w] = tmp_q.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdsa_zero.words[w] = tmp_zero.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdsa_one.words[w] = tmp_one.words[OPERAND_NUM_WORDS - (w+1)];
+ ecdsa_delta.words[w] = tmp_delta.words[OPERAND_NUM_WORDS - (w+1)];
+ }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular addition.
+//
+// This routine implements algorithm 3. from "Ultra High Performance ECC over
+// NIST Primes on Commercial FPGAs".
+//
+// s = (a + b) mod q
+//
+// The naive approach is like the following:
+//
+// 1. s = a + b
+// 2. if (s >= q) s -= q
+//
+// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q)
+// and then select the right variant.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_add(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ FPGA_BUFFER ab, ab_n; // intermediate buffers
+ bool c_in, c_out; // carries
+ bool b_in, b_out; // borrows
+
+ c_in = false; // first word has no carry
+ b_in = false; // first word has no borrow
+
+ // run parallel addition and subtraction
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ {
+ fpga_lowlevel_add32(a->words[w], b->words[w], c_in, &ab.words[w], &c_out);
+ fpga_lowlevel_sub32(ab.words[w], ecdsa_q.words[w], b_in, &ab_n.words[w], &b_out);
+
+ c_in = c_out; // propagate carry
+ b_in = b_out; // propagate borrow
+ }
+
+ // now select the right buffer
+
+ /*
+ * We select the right variant based on borrow and carry flags after
+ * addition and subtraction of the very last pair of words. Note, that
+ * we only need to select the first variant (a + b) when (a + b) < q.
+ * This way if we get negative number after subtraction, we discard it
+ * and use the output of the adder instead. The subtractor output is
+ * negative when borrow flag is set *and* carry flag is not set. When
+ * both borrow and carry are set, the number is non-negative, because
+ * borrow and carry cancel each other out.
+ */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ s->words[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular subtraction.
+//
+// This routine implements algorithm 3. from "Ultra High Performance ECC over
+// NIST Primes on Commercial FPGAs".
+//
+// d = (a - b) mod q
+//
+// The naive approach is like the following:
+//
+// 1. d = a - b
+// 2. if (a < b) d += q
+//
+// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q)
+// and then select the right variant.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_sub(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ FPGA_BUFFER ab, ab_n; // intermediate buffers
+ bool c_in, c_out; // carries
+ bool b_in, b_out; // borrows
+
+ c_in = false; // first word has no carry
+ b_in = false; // first word has no borrow
+
+ // run parallel subtraction and addition
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ {
+ fpga_lowlevel_sub32(a->words[w], b->words[w], b_in, &ab.words[w], &b_out);
+ fpga_lowlevel_add32(ab.words[w], ecdsa_q.words[w], c_in, &ab_n.words[w], &c_out);
+
+ b_in = b_out; // propagate borrow
+ c_in = c_out; // propagate carry
+ }
+
+ // now select the right buffer
+
+ /*
+ * We select the right variant based on borrow flag after subtraction
+ * and addition of the very last pair of words. Note, that we only
+ * need to select the second variant (a - b + q) when a < b. This way
+ * if we get negative number after subtraction, we discard it
+ * and use the output of the adder instead. The Subtractor output is
+ * negative when borrow flag is set.
+ */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ d->words[w] = b_out ? ab_n.words[w] : ab.words[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular multiplication.
+//
+// This routine implements modular multiplication algorithm from "Ultra High
+// Performance ECC over NIST Primes on Commercial FPGAs".
+//
+// p = (a * b) mod q
+//
+// The complex algorithm is split into three parts:
+//
+// 1. Calculation of partial words
+// 2. Acccumulation of partial words into full-size product
+// 3. Modular reduction of the full-size product
+//
+// See comments for corresponding helper routines for more information.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p)
+//------------------------------------------------------------------------------
+{
+ FPGA_WORD_EXTENDED si[4*OPERAND_NUM_WORDS-1]; // parts of intermediate product
+ FPGA_WORD c[2*OPERAND_NUM_WORDS]; // full-size intermediate product
+
+ /* multiply to get partial words */
+ fpga_modular_mul_helper_multiply(a, b, si);
+
+ /* accumulate partial words into full-size product */
+ fpga_modular_mul_helper_accumulate(si, c);
+
+ /* reduce full-size product using special routine */
+ fpga_modular_mul_helper_reduce(c, p);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular multiplicative inversion procedure.
+//
+// a1 = a^-1 (mod n)
+//
+// This routine implements the algorithm from "Constant Time Modular
+// Inversion" by Joppe W. Bos (http://www.joppebos.com/files/CTInversion.pdf)
+//
+// The algorithm has two phases: 1) calculation of "almost" modular inverse,
+// which is a^-1*2^k and 2) removal of redundant factor 2^k.
+//
+// The first stage has four temporary variables: r, s, u, v; they are updated
+// at every iteration. Depending on flags there can be four branches, FPGA
+// will pre-calculate all possible values in parallel and then use a multiplexor
+// to select the next value. This implementation also calculates all possible
+// outcomes. This is done for debugging purposes, *NOT* for constant run-time!
+//
+// The second part only works with s and k.
+//
+// Note, that k is a simple counter, and it can't exceed 2*OPERAND_WIDTH.
+//
+// The complex inversion algorithm uses helper routines. Note, that width of the
+// intermediate results can temporarily exceed OPERAND_WIDTH, so all the helper
+// routines process OPERAND_NUM_WORDS+1 words.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv(FPGA_BUFFER *a, FPGA_BUFFER *a1)
+{
+ int i; // round counter
+ int w; // word counter
+ int k; // redundant power of 2
+
+ /* q, 1 */
+ FPGA_WORD buf_q[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_1[OPERAND_NUM_WORDS+1];
+
+ /* r, s */
+ FPGA_WORD buf_r[OPERAND_NUM_WORDS+1], buf_s[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_r_double[OPERAND_NUM_WORDS+1], buf_s_double[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_r_new[OPERAND_NUM_WORDS+1], buf_s_new[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_r_plus_s[OPERAND_NUM_WORDS+1], buf_s_plus_r[OPERAND_NUM_WORDS+1];
+
+ /* u, v */
+ FPGA_WORD buf_u[OPERAND_NUM_WORDS+1], buf_v[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_u_half[OPERAND_NUM_WORDS+1], buf_v_half[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_u_minus_v[OPERAND_NUM_WORDS+1], buf_v_minus_u[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_u_minus_v_half[OPERAND_NUM_WORDS+1], buf_v_minus_u_half[OPERAND_NUM_WORDS+1];
+ FPGA_WORD buf_u_new[OPERAND_NUM_WORDS+1], buf_v_new[OPERAND_NUM_WORDS+1];
+
+ /* comparison */
+ int cmp_v_1, cmp_u_v;
+
+ /* clear buffers */
+ for (w=0; w<=OPERAND_NUM_WORDS; w++)
+ buf_r[w] = 0, buf_s[w] = 0,
+ buf_u[w] = 0, buf_v[w] = 0,
+ buf_q[w] = 0, buf_1[w] = 0;
+
+ /* initialize q, 1 */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ buf_q[w] = ecdsa_q.words[w], buf_1[w] = ecdsa_one.words[w];
+
+ /* initialize r, s */
+ buf_r[0] = 0, buf_s[0] = 1;
+
+ /* initialize u, v */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ buf_u[w] = ecdsa_q.words[w], buf_v[w] = a->words[w];
+
+ /* initialize k */
+ k = 0;
+
+ /* flags for the first stage */
+ bool v_is_1, u_is_greater_than_v, u_is_even, v_is_even;
+
+ /* first stage */
+ for (i=0; i<(2*OPERAND_WIDTH); i++)
+ {
+ /* pre-calculate all possible values for r and s */
+ fpga_modular_inv_helper_shl(buf_r, buf_r_double); // r_double = 2 * r
+ fpga_modular_inv_helper_shl(buf_s, buf_s_double); // s_double = 2 * s
+ fpga_modular_inv_helper_add(buf_r, buf_s, buf_r_plus_s); // r_plus_s = r + s
+ fpga_modular_inv_helper_add(buf_s, buf_r, buf_s_plus_r); // s_plus_r = s + r
+
+ /* pre-calculate all possible values for u and v */
+ fpga_modular_inv_helper_shr(buf_u, buf_u_half); // u_half = u / 2
+ fpga_modular_inv_helper_shr(buf_v, buf_v_half); // v_half = v / 2
+ fpga_modular_inv_helper_sub(buf_u, buf_v, buf_u_minus_v); // u_minus_v = u - v
+ fpga_modular_inv_helper_sub(buf_v, buf_u, buf_v_minus_u); // v_minus_u = v - u
+ fpga_modular_inv_helper_shr(buf_u_minus_v, buf_u_minus_v_half); // u_minus_v_half = u_minus_v / 2
+ fpga_modular_inv_helper_shr(buf_v_minus_u, buf_v_minus_u_half); // v_minus_u_half = v_minus_u / 2
+
+ /* compare */
+ fpga_modular_inv_helper_cmp(buf_v, buf_1, &cmp_v_1);
+ fpga_modular_inv_helper_cmp(buf_u, buf_v, &cmp_u_v);
+
+ /* assign flags */
+ v_is_1 = (cmp_v_1 == 0);
+ u_is_greater_than_v = (cmp_u_v > 0);
+ u_is_even = !(buf_u[0] & 1);
+ v_is_even = !(buf_v[0] & 1);
+
+ /* select u */
+ if ( u_is_even) fpga_modular_inv_helper_cpy(buf_u_new, buf_u_half);
+ if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_u_new, buf_u);
+ if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_u_new, u_is_greater_than_v ? buf_u_minus_v_half : buf_u);
+
+ /* select v */
+ if ( u_is_even) fpga_modular_inv_helper_cpy(buf_v_new, buf_v);
+ if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_v_new, buf_v_half);
+ if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_v_new, u_is_greater_than_v ? buf_v : buf_v_minus_u_half);
+
+ /* select r */
+ if ( u_is_even) fpga_modular_inv_helper_cpy(buf_r_new, buf_r);
+ if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_r_new, buf_r_double);
+ if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_r_new, u_is_greater_than_v ? buf_r_plus_s : buf_r_double);
+
+ /* select s */
+ if ( u_is_even) fpga_modular_inv_helper_cpy(buf_s_new, buf_s_double);
+ if (!u_is_even && v_is_even) fpga_modular_inv_helper_cpy(buf_s_new, buf_s);
+ if (!u_is_even && !v_is_even) fpga_modular_inv_helper_cpy(buf_s_new, u_is_greater_than_v ? buf_s_double : buf_s_plus_r);
+
+ /* update values */
+ if (!v_is_1)
+ { fpga_modular_inv_helper_cpy(buf_u, buf_u_new);
+ fpga_modular_inv_helper_cpy(buf_v, buf_v_new);
+ fpga_modular_inv_helper_cpy(buf_r, buf_r_new);
+ fpga_modular_inv_helper_cpy(buf_s, buf_s_new);
+ }
+
+ /* update k */
+ if (!v_is_1) k++;
+ }
+
+ //
+ // Note, that to save FPGA resources, the second stage re-uses buffers
+ // used in the first stage.
+ //
+
+ /* flags for the second stage */
+ bool k_is_0, s_is_odd;
+
+ /* second stage */
+ for (i=0; i<(2*OPERAND_WIDTH); i++)
+ {
+ /* pre-calculate all possible values */
+ fpga_modular_inv_helper_shr(buf_s, buf_u);
+ fpga_modular_inv_helper_add(buf_s, buf_q, buf_r);
+ fpga_modular_inv_helper_shr(buf_r, buf_v);
+
+ //
+ // assign flags
+ //
+ s_is_odd = buf_s[0] & 1;
+ k_is_0 = (k == 0);
+
+ //
+ // select new values based on flags
+ //
+ fpga_modular_inv_helper_cpy(buf_s_new, s_is_odd ? buf_v : buf_u);
+
+ /* update s */
+ if (! k_is_0)
+ fpga_modular_inv_helper_cpy(buf_s, buf_s_new);
+
+ /* update k */
+ if (! k_is_0) k--;
+ }
+
+ /* done, copy s into output buffer */
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ a1->words[w] = buf_s[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Parallelized multiplication.
+//
+// This routine implements the algorithm in Fig. 3. from "Ultra High
+// Performance ECC over NIST Primes on Commercial FPGAs".
+//
+// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2
+// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide
+// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS
+// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial
+// output words si[].
+//
+// The order of loading a and b into the multipliers is a bit complicated,
+// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is
+// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the
+// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul_helper_multiply(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si)
+//------------------------------------------------------------------------------
+{
+ int w; // counter
+ int t, x; // more counters
+ int j, i; // word indices
+ FPGA_WORD p; // product
+
+ // buffers for smaller words that multipliers can handle
+ FPGA_WORD_REDUCED ai[2*OPERAND_NUM_WORDS];
+ FPGA_WORD_REDUCED bj[2*OPERAND_NUM_WORDS];
+
+ // split a and b into smaller words
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ ai[2*w] = (FPGA_WORD_REDUCED)a->words[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)),
+ bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2));
+
+ // accumulators
+ FPGA_WORD_EXTENDED mac[2*OPERAND_NUM_WORDS];
+
+ // clear accumulators
+ for (w=0; w<(2*OPERAND_NUM_WORDS); w++) mac[w] = 0;
+
+ // run the crazy algorithm :)
+ for (t=0; t<(2*OPERAND_NUM_WORDS); t++)
+ {
+ // save upper half of si[] (one word per cycle)
+ if (t > 0)
+ { si[4*OPERAND_NUM_WORDS - (t+1)] = mac[t];
+ mac[t] = 0;
+ }
+
+ // update index
+ j = 2*OPERAND_NUM_WORDS - (t+1);
+
+ // parallel multiplication
+ for (x=0; x<(2*OPERAND_NUM_WORDS); x++)
+ {
+ // update index
+ i = t - x;
+ if (i < 0) i += 2*OPERAND_NUM_WORDS;
+
+ // multiply...
+ fpga_lowlevel_mul16(ai[i], bj[j], &p);
+
+ // ...accumulate
+ mac[x] += p;
+ }
+
+ }
+
+ // now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once)
+ for (w=0; w<(2*OPERAND_NUM_WORDS); w++)
+ si[w] = mac[2*OPERAND_NUM_WORDS - (w+1)];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Accumulation of partial words into full-size product.
+//
+// This routine implements the Algorithm 4. from "Ultra High Performance ECC
+// over NIST Primes on Commercial FPGAs".
+//
+// Input words si[] are accumulated into full-size product c[].
+//
+// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in
+// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are
+// summed in full, odd words are split into two parts. During every cycle the
+// upper part of the previous word and the lower part of the following word are
+// summed too.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul_helper_accumulate(FPGA_WORD_EXTENDED *si, FPGA_WORD *c)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ FPGA_WORD_EXTENDED cw0, cw1; // intermediate sums
+ FPGA_WORD_REDUCED cw_carry; // wide carry
+
+ // clear carry
+ cw_carry = 0;
+
+ // execute the algorithm
+ for (w=0; w<(2*OPERAND_NUM_WORDS); w++)
+ {
+ // handy flags
+ bool w_is_first = (w == 0);
+ bool w_is_last = (w == (2*OPERAND_NUM_WORDS-1));
+
+ // accumulate full current even word...
+ // ...and also the upper part of the previous odd word (if not the first word)
+ fpga_lowlevel_add48(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0);
+
+ // generate another word from "carry" part of the previous even word...
+ // ...and also the lower part of the following odd word (if not the last word)
+ cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2));
+ cw1 |= (FPGA_WORD_EXTENDED)cw_carry;
+
+ // accumulate once again
+ fpga_lowlevel_add48(cw0, cw1, &cw1);
+
+ // store current word...
+ c[w] = (FPGA_WORD)cw1;
+
+ // ...and carry
+ cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH);
+ }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Fast modular reduction for NIST prime P-256.
+//
+// p = c mod p256
+//
+// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
+// on the other hand is the output of the parallelized Comba multiplier, so it
+// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
+//
+// To save FPGA resources, the calculation is done using only two adders and
+// one subtractor. The algorithm is split into five steps.
+//
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+void fpga_modular_mul_helper_reduce_p256(FPGA_WORD *c, FPGA_BUFFER *p)
+{
+ // "funny" words
+ FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9;
+
+ // compose "funny" words out of input words
+ s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0];
+ s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0, s2.words[1] = 0, s2.words[0] = 0;
+ s3.words[7] = 0, s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0, s3.words[1] = 0, s3.words[0] = 0;
+ s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0, s4.words[4] = 0, s4.words[3] = 0, s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8];
+ s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9];
+ s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0, s6.words[4] = 0, s6.words[3] = 0, s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11];
+ s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0, s7.words[4] = 0, s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12];
+ s8.words[7] = c[12], s8.words[6] = 0, s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13];
+ s9.words[7] = c[13], s9.words[6] = 0, s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0, s9.words[1] = c[15], s9.words[0] = c[14];
+
+ // intermediate results
+ FPGA_BUFFER sum0, sum1, difference;
+
+ /* Step 1. */
+ fpga_modular_add(&s2, &s2, &sum0); // sum0 = 2*s2
+ fpga_modular_add(&s3, &s3, &sum1); // sum1 = 2*s3
+ fpga_modular_sub(&ecdsa_zero, &s6, &difference); // difference = -s6
+
+ /* Step 2. */
+ fpga_modular_add(&sum0, &s1, &sum0); // sum0 = s1 + 2*s2
+ fpga_modular_add(&sum1, &s4, &sum1); // sum1 = s4 + 2*s3
+ fpga_modular_sub(&difference, &s7, &difference); // difference = -(s6 + s7)
+
+ /* Step 3. */
+ fpga_modular_add(&sum0, &s5, &sum0); // sum0 = s1 + 2*s2 + s5
+ fpga_modular_add(&sum1, &ecdsa_zero, &sum1); // compulsory cycle to keep sum1 constant for next stage
+ fpga_modular_sub(&difference, &s8, &difference); // difference = -(s6 + s7 + s8)
+
+ /* Step 4. */
+ fpga_modular_add(&sum0, &sum1, &sum0); // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
+// fpga_modular_add(<dummy>, <dummy>, &sum1); // dummy cycle, result ignored
+ fpga_modular_sub(&difference, &s9, &difference); // difference = -(s6 + s7 + s8 + s9)
+
+ /* Step 5. */
+ fpga_modular_add(&sum0, &difference, p); // p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
+// fpga_modular_add(<dummy>, <dummy>, &sum1); // dummy cycle, result ignored
+// fpga_modular_add(<dummy>, <dummy>, &difference); // dummy cycle, result ignored
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+//
+// Fast modular reduction for NIST prime P-384.
+//
+// p = c mod p384
+//
+// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
+// on the other hand is the output of the parallelized Comba multiplier, so it
+// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
+//
+// To save FPGA resources, the calculation is done using only two adders and
+// one subtractor. The algorithm is split into five steps.
+//
+//------------------------------------------------------------------------------
+#if USE_CURVE == 2
+void fpga_modular_mul_helper_reduce_p384(FPGA_WORD *c, FPGA_BUFFER *p)
+{
+ // "funny" words
+ FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ // compose "funny" words
+ s1.words[11] = c[11], s1.words[10] = c[10], s1.words[ 9] = c[ 9], s1.words[ 8] = c[ 8], s1.words[ 7] = c[ 7], s1.words[ 6] = c[ 6], s1.words[ 5] = c[ 5], s1.words[ 4] = c[ 4], s1.words[ 3] = c[ 3], s1.words[ 2] = c[ 2], s1.words[ 1] = c[ 1], s1.words[ 0] = c[ 0];
+ s2.words[11] = 0, s2.words[10] = 0, s2.words[ 9] = 0, s2.words[ 8] = 0, s2.words[ 7] = 0, s2.words[ 6] = c[23], s2.words[ 5] = c[22], s2.words[ 4] = c[21], s2.words[ 3] = 0, s2.words[ 2] = 0, s2.words[ 1] = 0, s2.words[ 0] = 0;
+ s3.words[11] = c[23], s3.words[10] = c[22], s3.words[ 9] = c[21], s3.words[ 8] = c[20], s3.words[ 7] = c[19], s3.words[ 6] = c[18], s3.words[ 5] = c[17], s3.words[ 4] = c[16], s3.words[ 3] = c[15], s3.words[ 2] = c[14], s3.words[ 1] = c[13], s3.words[ 0] = c[12];
+ s4.words[11] = c[20], s4.words[10] = c[19], s4.words[ 9] = c[18], s4.words[ 8] = c[17], s4.words[ 7] = c[16], s4.words[ 6] = c[15], s4.words[ 5] = c[14], s4.words[ 4] = c[13], s4.words[ 3] = c[12], s4.words[ 2] = c[23], s4.words[ 1] = c[22], s4.words[ 0] = c[21];
+ s5.words[11] = c[19], s5.words[10] = c[18], s5.words[ 9] = c[17], s5.words[ 8] = c[16], s5.words[ 7] = c[15], s5.words[ 6] = c[14], s5.words[ 5] = c[13], s5.words[ 4] = c[12], s5.words[ 3] = c[20], s5.words[ 2] = 0, s5.words[ 1] = c[23], s5.words[ 0] = 0;
+ s6.words[11] = 0, s6.words[10] = 0, s6.words[ 9] = 0, s6.words[ 8] = 0, s6.words[ 7] = c[23], s6.words[ 6] = c[22], s6.words[ 5] = c[21], s6.words[ 4] = c[20], s6.words[ 3] = 0, s6.words[ 2] = 0, s6.words[ 1] = 0, s6.words[ 0] = 0;
+ s7.words[11] = 0, s7.words[10] = 0, s7.words[ 9] = 0, s7.words[ 8] = 0, s7.words[ 7] = 0, s7.words[ 6] = 0, s7.words[ 5] = c[23], s7.words[ 4] = c[22], s7.words[ 3] = c[21], s7.words[ 2] = 0, s7.words[ 1] = 0, s7.words[ 0] = c[20];
+ s8.words[11] = c[22], s8.words[10] = c[21], s8.words[ 9] = c[20], s8.words[ 8] = c[19], s8.words[ 7] = c[18], s8.words[ 6] = c[17], s8.words[ 5] = c[16], s8.words[ 4] = c[15], s8.words[ 3] = c[14], s8.words[ 2] = c[13], s8.words[ 1] = c[12], s8.words[ 0] = c[23];
+ s9.words[11] = 0, s9.words[10] = 0, s9.words[ 9] = 0, s9.words[ 8] = 0, s9.words[ 7] = 0, s9.words[ 6] = 0, s9.words[ 5] = 0, s9.words[ 4] = c[23], s9.words[ 3] = c[22], s9.words[ 2] = c[21], s9.words[ 1] = c[20], s9.words[ 0] = 0;
+ s10.words[11] = 0, s10.words[10] = 0, s10.words[ 9] = 0, s10.words[ 8] = 0, s10.words[ 7] = 0, s10.words[ 6] = 0, s10.words[ 5] = 0, s10.words[ 4] = c[23], s10.words[ 3] = c[23], s10.words[ 2] = 0, s10.words[ 1] = 0, s10.words[ 0] = 0;
+
+ // intermediate results
+ FPGA_BUFFER sum0, sum1, difference;
+
+ /* Step 1. */
+ fpga_modular_add(&s1, &s3, &sum0); // sum0 = s1 + s3
+ fpga_modular_add(&s2, &s2, &sum1); // sum1 = 2*s2
+ fpga_modular_sub(&ecdsa_zero, &s8, &difference); // difference = -s8
+
+ /* Step 2. */
+ fpga_modular_add(&sum0, &s4, &sum0); // sum0 = s1 + s3 + s4
+ fpga_modular_add(&sum1, &s5, &sum1); // sum1 = 2*s2 + s5
+ fpga_modular_sub(&difference, &s9, &difference); // difference = -(s8 + s9)
+
+ /* Step 3. */
+ fpga_modular_add(&sum0, &s6, &sum0); // sum0 = s1 + s3 + s4 + s6
+ fpga_modular_add(&sum1, &s7, &sum1); // sum1 = 2*s2 + s5 + s7
+ fpga_modular_sub(&difference, &s10, &difference); // difference = -(s8 + s9 + s10)
+
+ /* Step 4. */
+ fpga_modular_add(&sum0, &sum1, &sum0); // sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
+// fpga_modular_add(<dummy>, <dummy>, &sum1); // dummy cycle, result ignored
+ fpga_modular_sub(&difference, &ecdsa_zero, &difference); // compulsory cycle to keep difference constant for next stage
+
+ /* Step 5. */
+ fpga_modular_add(&sum0, &difference, p); // p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
+// fpga_modular_add(<dummy>, <dummy>, &sum1); // dummy cycle, result ignored
+// fpga_modular_add(<dummy>, <dummy>, &difference); // dummy cycle, result ignored
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word shift to the left by 1 bit.
+//
+// y = x << 1
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_shl(FPGA_WORD *x, FPGA_WORD *y)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ FPGA_WORD carry_in, carry_out; // carries
+
+ carry_in = 0; // first word has no carry
+
+ // shift word-by-word
+ for (w=0; w<=OPERAND_NUM_WORDS; w++)
+ carry_out = x[w] >> (FPGA_WORD_WIDTH - 1), // store next carry
+ y[w] = x[w] << 1, // shift
+ y[w] |= carry_in, // process carry
+ carry_in = carry_out; // propagate carry
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word shift to the right by 1 bit.
+//
+// y = x >> 1
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_shr(FPGA_WORD *x, FPGA_WORD *y)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ FPGA_WORD carry_in, carry_out; // carries
+
+ carry_in = 0; // first word has no carry
+
+ // shift word-by-word
+ for (w=OPERAND_NUM_WORDS; w>=0; w--)
+ carry_out = x[w], // store next carry
+ y[w] = x[w] >> 1, // shift
+ y[w] |= carry_in << (FPGA_WORD_WIDTH - 1), // process carry
+ carry_in = carry_out; // propagate carry
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word addition.
+//
+// s = x + y
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_add(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ bool carry_in, carry_out; // carries
+
+ // lowest word has no carry
+ carry_in = false;
+
+ // sum a and b word-by-word
+ for (w=0; w<=OPERAND_NUM_WORDS; w++)
+ {
+ // low-level addition
+ fpga_lowlevel_add32(x[w], y[w], carry_in, &s[w], &carry_out);
+
+ // propagate carry bit
+ carry_in = carry_out;
+ }
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word subtraction .
+//
+// d = x - y
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_sub(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ bool borrow_in, borrow_out; // borrows
+
+ // lowest word has no borrow
+ borrow_in = false;
+
+ // subtract b from a word-by-word
+ for (w=0; w<=OPERAND_NUM_WORDS; w++)
+ {
+ // low-level subtraction
+ fpga_lowlevel_sub32(x[w], y[w], borrow_in, &d[w], &borrow_out);
+
+ // propagate borrow bit
+ borrow_in = borrow_out;
+ }
+
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word copy.
+//
+// dst = src
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_cpy(FPGA_WORD *dst, FPGA_WORD *src)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ // copy all the words from src into dst
+ for (w=0; w<=OPERAND_NUM_WORDS; w++)
+ dst[w] = src[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word comparison.
+//
+// The return value is -1 when a<b, 0 when a=b and 1 when a>b.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_cmp(FPGA_WORD *a, FPGA_WORD *b, int *c)
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+ int r, r0, ra, rb; // result
+ bool borrow; // borrow
+ FPGA_WORD d; // partial difference
+
+ // result is unknown so far
+ r = 0;
+
+ // clear borrow for the very first word
+ borrow = false;
+
+ // compare a and b word-by-word
+ for (w=OPERAND_NUM_WORDS; w>=0; w--)
+ {
+ // save result
+ r0 = r;
+
+ // subtract current words
+ fpga_lowlevel_sub32(a[w], b[w], false, &d, &borrow);
+
+ // analyze flags
+ rb = borrow ? -1 : 0; // a[w] < b[w]
+ ra = (!borrow && (d != 0)) ? 1 : 0; // a[w] > b[w]
+
+ //
+ // Note, that ra is either 0 or 1, rb is either 0 or -1 and they
+ // can never be non-zero at the same time.
+ //
+ // Note, that r can only be updated if comparison result has not
+ // been resolved yet. Even if we already know comparison result,
+ // we continue doing dummy subtractions to keep run-time constant.
+ //
+
+ // update result
+ r = (r == 0) ? ra + rb : r0;
+ }
+
+ // done
+ *c = r;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_modular.h b/fpga_modular.h
new file mode 100644
index 0000000..871c8d3
--- /dev/null
+++ b/fpga_modular.h
@@ -0,0 +1,87 @@
+//------------------------------------------------------------------------------
+//
+// fpga_modular.h
+// ---------------------------
+// Modular arithmetic routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ecdsa_q;
+extern FPGA_BUFFER ecdsa_zero;
+extern FPGA_BUFFER ecdsa_one;
+extern FPGA_BUFFER ecdsa_delta;
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void fpga_modular_init ();
+
+void fpga_modular_add (FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s);
+void fpga_modular_sub (FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d);
+void fpga_modular_mul (FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p);
+void fpga_modular_inv (FPGA_BUFFER *a, FPGA_BUFFER *a1);
+
+void fpga_modular_mul_helper_multiply (FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si);
+void fpga_modular_mul_helper_accumulate (FPGA_WORD_EXTENDED *si, FPGA_WORD *c);
+void fpga_modular_mul_helper_reduce_p256 (FPGA_WORD *c, FPGA_BUFFER *p);
+void fpga_modular_mul_helper_reduce_p384 (FPGA_WORD *c, FPGA_BUFFER *p);
+
+void fpga_modular_inv_helper_shl (FPGA_WORD *x, FPGA_WORD *y);
+void fpga_modular_inv_helper_shr (FPGA_WORD *x, FPGA_WORD *y);
+void fpga_modular_inv_helper_add (FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s);
+void fpga_modular_inv_helper_sub (FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d);
+void fpga_modular_inv_helper_cpy (FPGA_WORD *dst, FPGA_WORD *src);
+void fpga_modular_inv_helper_cmp (FPGA_WORD *a, FPGA_WORD *b, int *c);
+
+
+//------------------------------------------------------------------------------
+// Reduction Routine Selection
+//------------------------------------------------------------------------------
+
+#if USE_CURVE == 1
+#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p256
+#elif USE_CURVE == 2
+#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p384
+#else
+#error USE_CURVE must be either 1 or 2!
+#endif
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_util.cpp b/fpga_util.cpp
new file mode 100644
index 0000000..d5e4708
--- /dev/null
+++ b/fpga_util.cpp
@@ -0,0 +1,86 @@
+//------------------------------------------------------------------------------
+//
+// fpga_util.cpp
+// ---------------------
+// Utility FPGA routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, 2018 NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdh_fpga_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_util.h"
+
+
+//------------------------------------------------------------------------------
+bool fpga_buffer_is_zero(FPGA_BUFFER *x)
+//------------------------------------------------------------------------------
+//
+// Returns true when large multi-word integer x is zero.
+//
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ // try to find non-zero words
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ if (x->words[w]) return false;
+
+ // no non-zero words found
+ return true;
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_buffer_copy(FPGA_BUFFER *src, FPGA_BUFFER *dst)
+//------------------------------------------------------------------------------
+//
+// Copies large multi-word integer from src into dst.
+//
+//------------------------------------------------------------------------------
+{
+ int w; // word counter
+
+ // copy all the words from src into dst
+ for (w=0; w<OPERAND_NUM_WORDS; w++)
+ dst->words[w] = src->words[w];
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_util.h b/fpga_util.h
new file mode 100644
index 0000000..7ac647c
--- /dev/null
+++ b/fpga_util.h
@@ -0,0 +1,49 @@
+//------------------------------------------------------------------------------
+//
+// fpga_util.h
+// ---------------------
+// Utility FPGA routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+bool fpga_buffer_is_zero (FPGA_BUFFER *x);
+void fpga_buffer_copy (FPGA_BUFFER *src, FPGA_BUFFER *dst);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/test_vectors/alice_p256.key b/test_vectors/alice_p256.key
new file mode 100644
index 0000000..ae4ebb1
--- /dev/null
+++ b/test_vectors/alice_p256.key
@@ -0,0 +1,8 @@
+-----BEGIN EC PARAMETERS-----
+BggqhkjOPQMBBw==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MHcCAQEEIEBNSvo4ZaPW+SHMtHzepOknbD1FboTRlmMyTa+MXi9EoAoGCCqGSM49
+AwEHoUQDQgAEbzthrz150bZ8EoNpH+joct6B4XoGtdThru3NVwm/HRvRNFvMoCLq
+iVOwTC0R/CT4Czt/hHt53u7ZLsQw2Ow8mA==
+-----END EC PRIVATE KEY-----
diff --git a/test_vectors/alice_p384.key b/test_vectors/alice_p384.key
new file mode 100644
index 0000000..04aba40
--- /dev/null
+++ b/test_vectors/alice_p384.key
@@ -0,0 +1,9 @@
+-----BEGIN EC PARAMETERS-----
+BgUrgQQAIg==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MIGkAgEBBDDnM9nbuIZ7Vzy7wL2JnIjbZpMi6AQ1waTisN2xXnVzcfaEpZUF2SPI
+v5bcEcOuUFqgBwYFK4EEACKhZANiAASLhScIGDEffx7y8E2zjmiyPACL+xTiDLA3
+76Qhw8PfbaN8SE6FXamB2oZlgHID/zbXSPUV738mcgzopw4ggnKWjuaomq7M2Pth
+s1Nkxw37SOtcaFyBC9nLLRhPsQlqsw8=
+-----END EC PRIVATE KEY-----
diff --git a/test_vectors/bob_p256.key b/test_vectors/bob_p256.key
new file mode 100644
index 0000000..f1e17bc
--- /dev/null
+++ b/test_vectors/bob_p256.key
@@ -0,0 +1,8 @@
+-----BEGIN EC PARAMETERS-----
+BggqhkjOPQMBBw==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MHcCAQEEIHFZpDvoMiRxGf6usnqSRm4rB8jfKbvX6tMjKvhEmVqVoAoGCCqGSM49
+AwEHoUQDQgAEBRRgjcLcaiF0sITWFoqtE0rNP1JuSdwyv5hyqqS+mdlySvp1TGcr
+ceh8m9rh4rFfeE9ID+tiBA4oGVO96jgpRg==
+-----END EC PRIVATE KEY-----
diff --git a/test_vectors/bob_p384.key b/test_vectors/bob_p384.key
new file mode 100644
index 0000000..7e5b303
--- /dev/null
+++ b/test_vectors/bob_p384.key
@@ -0,0 +1,9 @@
+-----BEGIN EC PARAMETERS-----
+BgUrgQQAIg==
+-----END EC PARAMETERS-----
+-----BEGIN EC PRIVATE KEY-----
+MIGkAgEBBDBWAYINcFIkpd1t2xPgoV52hp5qvje6IjV5Kvn2qb8RStH9MZ3YGB4G
+RPFUSE5zp1qgBwYFK4EEACKhZANiAAQ3q1VtBlLGueNSxkNFAr4Zn7nFDyreBJsG
+5QwwzasGc2nv4MBuEUp28TOBddtPSYIhzdrg8Re2VnydR3vB/F0kOyZlHhQG8e6z
+QYVSc5yTlWd0yE0gztwVkv1d4Eu/mK0=
+-----END EC PRIVATE KEY-----
diff --git a/test_vectors/ecdh_test_vectors.h b/test_vectors/ecdh_test_vectors.h
new file mode 100644
index 0000000..7fbb746
--- /dev/null
+++ b/test_vectors/ecdh_test_vectors.h
@@ -0,0 +1,74 @@
+/* Generated automatically, do not edit. */
+
+#define P_256_DA \
+ {0x404d4afa, 0x3865a3d6, 0xf921ccb4, 0x7cdea4e9, \
+ 0x276c3d45, 0x6e84d196, 0x63324daf, 0x8c5e2f44}
+
+#define P_256_QA_X \
+ {0x6f3b61af, 0x3d79d1b6, 0x7c128369, 0x1fe8e872, \
+ 0xde81e17a, 0x06b5d4e1, 0xaeedcd57, 0x09bf1d1b}
+
+#define P_256_QA_Y \
+ {0xd1345bcc, 0xa022ea89, 0x53b04c2d, 0x11fc24f8, \
+ 0x0b3b7f84, 0x7b79deee, 0xd92ec430, 0xd8ec3c98}
+
+#define P_256_DB \
+ {0x7159a43b, 0xe8322471, 0x19feaeb2, 0x7a92466e, \
+ 0x2b07c8df, 0x29bbd7ea, 0xd3232af8, 0x44995a95}
+
+#define P_256_QB_X \
+ {0x0514608d, 0xc2dc6a21, 0x74b084d6, 0x168aad13, \
+ 0x4acd3f52, 0x6e49dc32, 0xbf9872aa, 0xa4be99d9}
+
+#define P_256_QB_Y \
+ {0x724afa75, 0x4c672b71, 0xe87c9bda, 0xe1e2b15f, \
+ 0x784f480f, 0xeb62040e, 0x281953bd, 0xea382946}
+
+#define P_256_S_X \
+ {0xa001c11b, 0x0d04b6c3, 0xbe99551e, 0x9115b811, \
+ 0x0a41a0b7, 0x59c3e3f2, 0xfb636df1, 0xeb0e9a42}
+
+#define P_256_S_Y \
+ {0x14ed5674, 0x62b6ba27, 0x2ba0e01b, 0x2647d725, \
+ 0x5919bf5e, 0xcbb542f7, 0x659d40de, 0x324524ac}
+
+#define P_384_DA \
+ {0xe733d9db, 0xb8867b57, 0x3cbbc0bd, 0x899c88db, \
+ 0x669322e8, 0x0435c1a4, 0xe2b0ddb1, 0x5e757371, \
+ 0xf684a595, 0x05d923c8, 0xbf96dc11, 0xc3ae505a}
+
+#define P_384_QA_X \
+ {0x8b852708, 0x18311f7f, 0x1ef2f04d, 0xb38e68b2, \
+ 0x3c008bfb, 0x14e20cb0, 0x37efa421, 0xc3c3df6d, \
+ 0xa37c484e, 0x855da981, 0xda866580, 0x7203ff36}
+
+#define P_384_QA_Y \
+ {0xd748f515, 0xef7f2672, 0x0ce8a70e, 0x20827296, \
+ 0x8ee6a89a, 0xaeccd8fb, 0x61b35364, 0xc70dfb48, \
+ 0xeb5c685c, 0x810bd9cb, 0x2d184fb1, 0x096ab30f}
+
+#define P_384_DB \
+ {0x5601820d, 0x705224a5, 0xdd6ddb13, 0xe0a15e76, \
+ 0x869e6abe, 0x37ba2235, 0x792af9f6, 0xa9bf114a, \
+ 0xd1fd319d, 0xd8181e06, 0x44f15448, 0x4e73a75a}
+
+#define P_384_QB_X \
+ {0x37ab556d, 0x0652c6b9, 0xe352c643, 0x4502be19, \
+ 0x9fb9c50f, 0x2ade049b, 0x06e50c30, 0xcdab0673, \
+ 0x69efe0c0, 0x6e114a76, 0xf1338175, 0xdb4f4982}
+
+#define P_384_QB_Y \
+ {0x21cddae0, 0xf117b656, 0x7c9d477b, 0xc1fc5d24, \
+ 0x3b26651e, 0x1406f1ee, 0xb3418552, 0x739c9395, \
+ 0x6774c84d, 0x20cedc15, 0x92fd5de0, 0x4bbf98ad}
+
+#define P_384_S_X \
+ {0x15ac62cb, 0xbb51e1ed, 0xd41d489f, 0xdfa05d45, \
+ 0x115f4ef2, 0x269fbd26, 0x3f6c7364, 0x673f0b19, \
+ 0x489e8a7b, 0xdfad6d40, 0x277edf9f, 0x62220c51}
+
+#define P_384_S_Y \
+ {0xa0b846fe, 0xa76973b4, 0x12dfae76, 0x2b3b6587, \
+ 0xf62be0a3, 0x73da36ef, 0x8992e7c9, 0x6cf7619d, \
+ 0xa2d6c0a2, 0xd31ad05d, 0xb3a16a95, 0x0cb7055f}
+
diff --git a/test_vectors/format_test_vectors.py b/test_vectors/format_test_vectors.py
new file mode 100644
index 0000000..a49b34b
--- /dev/null
+++ b/test_vectors/format_test_vectors.py
@@ -0,0 +1,450 @@
+#
+# format_test_vectors.py
+# ------------------------------------------
+# Formats test vectors for ecdsa_fpga_model
+#
+# Author: Pavel Shatov
+# Copyright (c) 2017, NORDUnet A/S
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# - Neither the name of the NORDUnet nor the names of its contributors may
+# be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+#
+# This script reads the test vectors generated by regenerate_test_vectors.py
+# and writes nicely formatted C header file and Verilog include file.
+#
+
+#
+# imports
+#
+import sys
+import subprocess
+from fastecdsa.curve import P256
+from fastecdsa.curve import P384
+from fastecdsa.point import Point
+
+# list of curve names of interest
+CURVE_P256 = "p256"
+CURVE_P384 = "p384"
+
+# the base point for p-256
+P256_GX = 0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296
+P256_GY = 0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5
+
+# the base point for p-384
+P384_GX = 0xaa87ca22be8b05378eb1c71ef320ad746e1d3b628ba79b9859f741e082542a385502f25dbf55296c3a545e3872760ab7
+P384_GY = 0x3617de4a96262c6f5d9e98bf9292dc29f8f41dbd289a147ce9da3113b5f0b8c00a60b1ce1d7e819d7a431d7c90ea0e5f
+
+
+#
+# get part of string between two markers
+#
+#def string_between(s, s_left, s_right):
+# s_begin = s.index(s_left) + len(s_left)
+# s_end = s.index(s_right, s_begin)
+# return s[s_begin:s_end]
+
+#
+# load message from file
+#
+#def read_message(key):
+# with open(key + ".txt", "r") as f:
+# return f.readlines()[0]
+#
+#
+# read modulus from file
+#
+#def read_modulus(key):
+# openssl_command = ["openssl", "rsa", "-in", key + ".key", "-noout", "-modulus"]
+# openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+# return openssl_stdout.strip().split("=")[1]
+
+#
+# read private exponent from file
+#
+#def read_secret(key):
+# openssl_command = ["openssl", "rsa", "-in", key + ".key", "-noout", "-text"]
+# openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+# openssl_secret = string_between(openssl_stdout, "privateExponent", "prime1")
+# openssl_secret = openssl_secret.replace(":", "")
+# openssl_secret = openssl_secret.replace("\n", "")
+# openssl_secret = openssl_secret.replace(" ", "")
+# return openssl_secret
+
+#
+# read part of private key from file
+#
+#def read_prime1(key):
+# openssl_command = ["openssl", "rsa", "-in", key + ".key", "-noout", "-text"]
+# openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+# openssl_secret = string_between(openssl_stdout, "prime1", "prime2")
+# openssl_secret = openssl_secret.replace(":", "")
+# openssl_secret = openssl_secret.replace("\n", "")
+# openssl_secret = openssl_secret.replace(" ", "")
+# return openssl_secret
+#def read_prime2(key):
+# openssl_command = ["openssl", "rsa", "-in", key + ".key", "-noout", "-text"]
+# openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+# openssl_secret = string_between(openssl_stdout, "prime2", "exponent1")
+# openssl_secret = openssl_secret.replace(":", "")
+# openssl_secret = openssl_secret.replace("\n", "")
+# openssl_secret = openssl_secret.replace(" ", "")
+# return openssl_secret
+
+#
+# read prive exponent from file
+#
+#def read_exponent1(key):
+# openssl_command = ["openssl", "rsa", "-in", key + ".key", "-noout", "-text"]
+# openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+# openssl_secret = string_between(openssl_stdout, "exponent1", "exponent2")
+# openssl_secret = openssl_secret.replace(":", "")
+# openssl_secret = openssl_secret.replace("\n", "")
+# openssl_secret = openssl_secret.replace(" ", "")
+# return openssl_secret
+#def read_exponent2(key):
+# openssl_command = ["openssl", "rsa", "-in", key + ".key", "-noout", "-text"]
+# openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+# openssl_secret = string_between(openssl_stdout, "exponent2", "coefficient")
+# openssl_secret = openssl_secret.replace(":", "")
+# openssl_secret = openssl_secret.replace("\n", "")
+# openssl_secret = openssl_secret.replace(" ", "")
+# return openssl_secret
+
+#
+# format one test vector
+#
+def format_c_header(f, curve, da, qax, qay, db, qbx, qby, sx, sy):
+
+ if curve == CURVE_P256: curve_str = "P_256"
+ if curve == CURVE_P384: curve_str = "P_384"
+
+ # write all numbers in vector
+ format_c_array(f, da, "#define " + curve_str + "_DA" + " \\\n")
+ format_c_array(f, qax, "#define " + curve_str + "_QA_X" + " \\\n")
+ format_c_array(f, qay, "#define " + curve_str + "_QA_Y" + " \\\n")
+
+ format_c_array(f, db, "#define " + curve_str + "_DB" + " \\\n")
+ format_c_array(f, qbx, "#define " + curve_str + "_QB_X" + " \\\n")
+ format_c_array(f, qby, "#define " + curve_str + "_QB_Y" + " \\\n")
+
+ format_c_array(f, sx, "#define " + curve_str + "_S_X" + " \\\n")
+ format_c_array(f, sy, "#define " + curve_str + "_S_Y" + " \\\n")
+
+
+#
+# format one test vector
+#
+#def format_verilog_include(f, key, n, m, d, s, p, q, dp, dq, mp, mq):
+#
+# # calculate factor to bring message into Montgomery domain
+# factor = calc_montgomery_factor(int(key), n)
+# factor_p = calc_montgomery_factor(int(key)//2, p);
+# factor_q = calc_montgomery_factor(int(key)//2, q);
+#
+# # calculate helper coefficients for Montgomery multiplication
+# n_coeff = calc_montgomery_n_coeff(int(key), n)
+# p_coeff = calc_montgomery_n_coeff(int(key)//2, p)
+# q_coeff = calc_montgomery_n_coeff(int(key)//2, q)
+#
+# # calculate the extra coefficient Montgomery multiplication brings in
+# coeff = modinv(1 << int(key), n)
+#
+# # convert m into Montgomery representation
+# m_factor = (m * factor * coeff) % n
+#
+# # write all numbers
+# format_verilog_concatenation(f, m, "localparam [" + str(int(key)-1) + ":0] M_" + key + " =\n")
+# format_verilog_concatenation(f, n, "localparam [" + str(int(key)-1) + ":0] N_" + key + " =\n")
+# format_verilog_concatenation(f, n_coeff, "localparam [" + str(int(key)-1) + ":0] N_COEFF_" + key + " =\n")
+# format_verilog_concatenation(f, factor, "localparam [" + str(int(key)-1) + ":0] FACTOR_" + key + " =\n")
+# format_verilog_concatenation(f, coeff, "localparam [" + str(int(key)-1) + ":0] COEFF_" + key + " =\n")
+# format_verilog_concatenation(f, m_factor, "localparam [" + str(int(key)-1) + ":0] M_FACTOR_" + key + " =\n")
+# format_verilog_concatenation(f, d, "localparam [" + str(int(key)-1) + ":0] D_" + key + " =\n")
+# format_verilog_concatenation(f, s, "localparam [" + str(int(key)-1) + ":0] S_" + key + " =\n")
+#
+# format_verilog_concatenation(f, p, "localparam [" + str(int(key)//2-1) + ":0] P_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, q, "localparam [" + str(int(key)//2-1) + ":0] Q_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, p_coeff, "localparam [" + str(int(key)//2-1) + ":0] P_COEFF_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, q_coeff, "localparam [" + str(int(key)//2-1) + ":0] Q_COEFF_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, factor_p, "localparam [" + str(int(key)//2-1) + ":0] FACTOR_P_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, factor_q, "localparam [" + str(int(key)//2-1) + ":0] FACTOR_Q_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, dp, "localparam [" + str(int(key)//2-1) + ":0] DP_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, dq, "localparam [" + str(int(key)//2-1) + ":0] DQ_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, mp, "localparam [" + str(int(key)//2-1) + ":0] MP_" + str(int(key)//2) + " =\n")
+# format_verilog_concatenation(f, mq, "localparam [" + str(int(key)//2-1) + ":0] MQ_" + str(int(key)//2) + " =\n")
+
+
+#
+# nicely format multi-word integer into C array initializer
+#
+def format_c_array(f, n, s):
+
+ # print '#define XYZ \'
+ f.write(s)
+
+ # convert number to hex string and prepend it with zeroes if necessary
+ n_hex = hex(n).lstrip("0x").rstrip("L")
+ while (len(n_hex) % 8) > 0:
+ n_hex = "0" + n_hex
+
+ # get number of 32-bit words
+ num_words = len(n_hex) // 8
+
+ # print all words in n
+ w = 0
+ while w < num_words:
+
+ n_part = ""
+
+ # add tab for every new line
+ if w == 0:
+ n_part += "\t{"
+ elif (w % 4) == 0:
+ n_part += "\t "
+
+ # add current word
+ n_part += "0x" + n_hex[8 * w : 8 * (w + 1)]
+
+ # add separator or newline
+ if (w + 1) == num_words:
+ n_part += "}\n"
+ else:
+ n_part += ", "
+ if (w % 4) == 3:
+ n_part += "\\\n"
+
+ w += 1
+
+ # write current part
+ f.write(n_part)
+
+ # write final newline
+ f.write("\n")
+
+
+#def format_verilog_concatenation(f, n, s):
+#
+# # print 'localparam ZZZ ='
+# f.write(s)
+#
+# # convert number to hex string and prepend it with zeroes if necessary
+# n_hex = hex(n).split("0x")[1]
+# while (len(n_hex) % 8) > 0:
+# n_hex = "0" + n_hex
+#
+# # get number of 32-bit words
+# num_words = len(n_hex) // 8
+#
+# # print all words in n
+# w = 0
+# while w < num_words:
+#
+# n_part = ""
+#
+# if w == 0:
+# n_part += "\t{"
+# elif (w % 4) == 0:
+# n_part += "\t "
+#
+# n_part += "32'h" + n_hex[8 * w : 8 * (w + 1)]
+#
+# if (w + 1) == num_words:
+# n_part += "};\n"
+# else:
+# n_part += ", "
+# if (w % 4) == 3:
+# n_part += "\n"
+# w += 1
+#
+# f.write(n_part)
+#
+# f.write("\n")
+
+
+ #
+ # returns d, qx, qy, where
+ # d is private key and qx, qy is the corresponding public key
+ #
+def get_key(party, curve):
+
+ # generate private key filename
+ key_file = party + "_" + curve + ".key"
+
+ # retrieve key components using openssl
+ openssl_command = ["openssl", "ec", "-in", key_file, "-noout", "-text"]
+ openssl_stdout = subprocess.check_output(openssl_command).decode("utf-8")
+ stdout_lines = openssl_stdout.splitlines()
+
+ found_priv = False
+ found_pub = False
+
+ key_priv = ""
+ key_pub = ""
+
+ # process lines looking for "priv:" and "pub:" markers
+ for line in stdout_lines:
+
+ # found private key marker?
+ if line.strip() == "priv:":
+ found_priv = True
+ found_pub = False
+ continue
+
+ # found public key marker?
+ if line.strip() == "pub:": # openssl 1.0.2g prints 'pub: ' (extra space before newline),
+ found_pub = True # so we need compare against line.strip(), not just line
+ found_priv = False
+ continue
+
+ # found part of private key?
+ if found_priv:
+ if not line.startswith(" "):
+ found_priv = False
+ continue
+ else:
+ key_priv += line.strip()
+
+ # found part of public key?
+ if found_pub:
+ if not line.startswith(" "):
+ found_pub = False
+ continue
+ else:
+ key_pub += line.strip()
+
+ # do some cleanup and sanity checking on private key
+ # * remove extra leading zero byte if present
+ # * remove colons
+ # * check length (256 bits or 384 bits)
+ while key_priv.startswith("00"):
+ key_priv = key_priv[2:]
+
+ key_priv = key_priv.replace(":", "")
+
+ if curve == CURVE_P256 and len(key_priv) != 256 / 4: sys.exit()
+ if curve == CURVE_P384 and len(key_priv) != 384 / 4: sys.exit()
+
+ # do some cleanup and sanity checking on public key
+ # * make sure, that uncompressed form marker (0x04) is present and
+ # then remove it
+ # * remove colons
+ # * check length (2x256 or 2x384 bits)
+ if not key_pub.startswith("04"): sys.exit()
+
+ key_pub = key_pub[2:]
+ key_pub = key_pub.replace(":", "")
+
+ if curve == CURVE_P256 and len(key_pub) != 2 * 256 / 4: sys.exit()
+ if curve == CURVE_P384 and len(key_pub) != 2 * 384 / 4: sys.exit()
+
+ # split public key into parts
+ if curve == CURVE_P256:
+ key_pub_x = key_pub[ 0: 64]
+ key_pub_y = key_pub[ 64:128]
+
+ if curve == CURVE_P384:
+ key_pub_x = key_pub[ 0: 96]
+ key_pub_y = key_pub[ 96:192]
+
+ # convert from strings to integers
+ key_priv = int(key_priv, 16)
+ key_pub_x = int(key_pub_x, 16)
+ key_pub_y = int(key_pub_y, 16)
+
+ # another sanity check (make sure, that Q is actually d * G)
+ if curve == CURVE_P256:
+ G = Point(P256_GX, P256_GY, curve=P256)
+ Q = Point(key_pub_x, key_pub_y, curve=P256)
+
+ if curve == CURVE_P384:
+ G = Point(P384_GX, P384_GY, curve=P384)
+ Q = Point(key_pub_x, key_pub_y, curve=P384)
+
+ # multiply using fastecdsa
+ R = key_priv * G
+
+ if R.x != Q.x: sys.exit()
+ if R.y != Q.y: sys.exit()
+
+ # done
+ return key_priv, key_pub_x, key_pub_y
+
+
+if __name__ == "__main__":
+
+ # list of curves to process
+ curves = [CURVE_P256, CURVE_P384]
+
+ # open output files
+ file_h = open('ecdsa_fpga_model_ecdh_vectors.h', 'w')
+# file_v = open('modexp_fpga_model_vectors.v', 'w')
+
+ # write headers
+ file_h.write("/* Generated automatically, do not edit. */\n\n")
+# file_v.write("/* Generated automatically, do not edit. */\n\n")
+
+ # process all the keys
+ for curve in curves:
+
+ # load keys
+ da, qax, qay = get_key("alice", curve)
+ db, qbx, qby = get_key("bob", curve)
+
+ # Alice's public key
+ if (curve == CURVE_P256): QA = Point(qax, qay, curve=P256)
+ if (curve == CURVE_P384): QA = Point(qax, qay, curve=P384)
+
+ # Bob's public key
+ if (curve == CURVE_P256): QB = Point(qbx, qby, curve=P256)
+ if (curve == CURVE_P384): QB = Point(qbx, qby, curve=P384)
+
+ # we derive the shared secret two different ways (from Alice's and
+ # from Bob's perspective, they must be identical of course
+ QAB = da * QB # Alice's secret
+ QBA = db * QA # Bob's secret
+
+ if (QAB.x != QBA.x): sys.exit()
+ if (QBA.y != QAB.y): sys.exit()
+
+ print("Derived shared secret.");
+
+ # format numbers and write to file
+ format_c_header(file_h, curve, da, qax, qay, db, qbx, qby, QAB.x, QBA.y)
+# format_verilog_include(file_v, key, modulus, message, secret, signature, prime1, prime2, exponent1, exponent2, message1, message2)
+
+ # done
+ file_h.close()
+# file_v.close()
+
+ # everything went just fine
+ print("Test vectors formatted.")
+
+#
+# End of file
+#
diff --git a/test_vectors/regenerate_test_vectors.py b/test_vectors/regenerate_test_vectors.py
new file mode 100644
index 0000000..f734854
--- /dev/null
+++ b/test_vectors/regenerate_test_vectors.py
@@ -0,0 +1,71 @@
+#
+# regenerate_test_vectors.py
+# -------------------------------------------------------------------
+# Generates a new set of randomized test vectors for ecdsa_fpga_model
+#
+# Author: Pavel Shatov
+# Copyright (c) 2017, NORDUnet A/S
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# - Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# - Neither the name of the NORDUnet nor the names of its contributors may
+# be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+#
+# This script generates a pair of test vectors. Each test vector contains two
+# private keys. One test vector is for P-256, the other one is for P-384.
+#
+
+#
+# imports
+#
+import subprocess
+
+CURVE_P256 = "prime256v1"
+CURVE_P384 = "secp384r1"
+
+SECRET_ALICE_256 = "alice_p256.key"
+SECRET_ALICE_384 = "alice_p384.key"
+
+SECRET_BOB_256 = "bob_p256.key"
+SECRET_BOB_384 = "bob_p384.key"
+
+def openssl_ecparam_genkey(curve, file):
+ subprocess.call(["openssl", "ecparam", "-genkey", "-name", curve, "-out", file])
+
+if __name__ == "__main__":
+
+ # generate two new private keys for P-256 curve
+ openssl_ecparam_genkey(CURVE_P256, SECRET_ALICE_256)
+ openssl_ecparam_genkey(CURVE_P256, SECRET_BOB_256)
+
+ # generate two new private keys for P-384 curve
+ openssl_ecparam_genkey(CURVE_P384, SECRET_ALICE_384)
+ openssl_ecparam_genkey(CURVE_P384, SECRET_BOB_384)
+
+#
+# End of file
+#