From 634344f3a359576981277755d53aaf31002c2595 Mon Sep 17 00:00:00 2001
From: "Pavel V. Shatov (Meister)" <meisterpaul1@yandex.ru>
Date: Mon, 31 Oct 2016 00:00:32 +0300
Subject: Initial commit of FPGA base point multiplier reference model for
 ECDSA curves P-256 and P-384.

---
 Makefile             |  24 ++
 README.md            |  17 ++
 ecdsa_model.h        | 205 +++++++++++++
 ecdsa_model_fpga.cpp | 414 +++++++++++++++++++++++++
 fpga_curve.cpp       | 340 +++++++++++++++++++++
 fpga_curve.h         |  61 ++++
 fpga_lowlevel.cpp    | 137 +++++++++
 fpga_lowlevel.h      |  80 +++++
 fpga_modular.cpp     | 831 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fpga_modular.h       |  87 ++++++
 fpga_util.cpp        |  86 ++++++
 fpga_util.h          |  49 +++
 12 files changed, 2331 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 ecdsa_model.h
 create mode 100644 ecdsa_model_fpga.cpp
 create mode 100644 fpga_curve.cpp
 create mode 100644 fpga_curve.h
 create mode 100644 fpga_lowlevel.cpp
 create mode 100644 fpga_lowlevel.h
 create mode 100644 fpga_modular.cpp
 create mode 100644 fpga_modular.h
 create mode 100644 fpga_util.cpp
 create mode 100644 fpga_util.h

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..01bf790
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,24 @@
+CC=gcc
+CFLAGS=-c
+LDFLAGS=
+ 
+ecdsa_model_fpga: ecdsa_model_fpga.o fpga_curve.o fpga_modular.o fpga_lowlevel.o fpga_util.o
+	$(CC) $(LDFLAGS) ecdsa_model_fpga.o fpga_curve.o fpga_modular.o fpga_lowlevel.o fpga_util.o -o ecdsa_model_fpga
+
+ecdsa_model_fpga.o: ecdsa_model_fpga.cpp ecdsa_model.h fpga_curve.h fpga_modular.h fpga_lowlevel.h fpga_util.h
+	$(CC) $(CFLAGS) ecdsa_model_fpga.cpp
+	
+fpga_curve.o: fpga_curve.cpp ecdsa_model.h fpga_lowlevel.h fpga_modular.h fpga_curve.h fpga_util.h
+	$(CC) $(CFLAGS) fpga_curve.cpp
+
+fpga_modular.o: fpga_modular.cpp ecdsa_model.h fpga_lowlevel.h fpga_modular.h
+	$(CC) $(CFLAGS) fpga_modular.cpp
+	
+fpga_lowlevel.o: fpga_lowlevel.cpp ecdsa_model.h fpga_lowlevel.h
+	$(CC) $(CFLAGS) fpga_lowlevel.cpp
+	
+fpga_util.o: fpga_util.cpp ecdsa_model.h fpga_lowlevel.h fpga_util.h
+	$(CC) $(CFLAGS) fpga_util.cpp
+
+clean:
+	rm -f ecdsa_model_fpga *.o
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c78c462
--- /dev/null
+++ b/README.md
@@ -0,0 +1,17 @@
+# ecdsa_model_fpga
+
+This reference model was written to help debug Verilog code, it mimics how an FPGA would do elliptic curve base point scalar multiplication for ECDSA curves P-256 and P-384. Note, that the model may do weird (from CPU point of view, of course) things at times. Another important thing is that while FPGA modules are actually written to operate in constant-time manner, this model itself doesn't take any active measures to keep run-time constant. Do **NOT** use it in production as-is!
+
+The model is split into 4 layers:
+
+ * Low-level primitives (32- and 48-bit adders, 32-bit subtractor, 16x16-bit multiplier, 48-bit accumulator)
+ * Utility routines (copier, comparator)
+ * Modular arithmetic (adder, subtractor, multiplier, invertor)
+ * EC arithmetic (adder, doubler, multiplier)
+
+Modular multiplier and invertor use complex algorithms and are thus further split into "helper" sub-routines.
+
+This model uses tips and tricks from the following sources:
+1. [Guide to Elliptic Curve Cryptography](http://diamond.boisestate.edu/~liljanab/MATH308/GuideToECC.pdf)
+2. [Ultra High Performance ECC over NIST Primes
+on Commercial FPGAs](https://www.iacr.org/archive/ches2008/51540064/51540064.pdf)
diff --git a/ecdsa_model.h b/ecdsa_model.h
new file mode 100644
index 0000000..1e6a04c
--- /dev/null
+++ b/ecdsa_model.h
@@ -0,0 +1,205 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_model.h
+// --------------------------------------------
+// Base point scalar multiplier model for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+//
+// Curve Selection
+//
+// USE_CURVE == 1  -> P-256
+// USE_CURVE == 2  -> P-384
+//
+//------------------------------------------------------------------------------
+#define USE_CURVE	1
+
+
+//------------------------------------------------------------------------------
+// Model Parameters
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+#define OPERAND_WIDTH	(256)	// largest supported operand width in bits
+#elif USE_CURVE == 2
+#define OPERAND_WIDTH	(384)	// largest supported operand width in bits
+#else
+#error USE_CURVE must be either 1 or 2!
+#endif
+
+
+//------------------------------------------------------------------------------
+// P-256 Parameters and Test Vectors
+//------------------------------------------------------------------------------
+
+/* Field Size */
+#define P_256_Q			{0xffffffff, 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff}
+
+/* Generic Numbers */
+#define P_256_ZERO		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+#define P_256_ONE		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001}
+
+/* Division Factor  */
+#define P_256_DELTA		{0x7fffffff, 0x80000000, 0x80000000, 0x00000000, 0x00000000, 0x80000000, 0x00000000, 0x00000000}
+
+/* Base Point */
+#define P_256_G_X		{0x6b17d1f2, 0xe12c4247, 0xf8bce6e5, 0x63a440f2, 0x77037d81, 0x2deb33a0, 0xf4a13945, 0xd898c296}
+#define P_256_G_Y		{0x4fe342e2, 0xfe1a7f9b, 0x8ee7eb4a, 0x7c0f9e16, 0x2bce3357, 0x6b315ece, 0xcbb64068, 0x37bf51f5}
+
+/* Doubled Base Point */
+#define P_256_H_X		{0x29d05c19, 0x3da77b71, 0x0e863235, 0x38b77e1b, 0x11f904fe, 0xa42998be, 0x16bd8d74, 0x4ece7ad0}
+#define P_256_H_Y		{0xb01cbd1c, 0x01e58065, 0x711814b5, 0x83f061e9, 0xd431cca9, 0x94cea131, 0x3449bf97, 0xc840ae07}
+
+/* Base Point Order */
+#define P_256_N			{0xffffffff, 0x00000000, 0xffffffff, 0xffffffff, 0xbce6faad, 0xa7179e84, 0xf3b9cac2, 0xfc632551}
+
+/* Private Key */
+#define P_256_D			{0x70a12c2d, 0xb16845ed, 0x56ff68cf, 0xc21a472b, 0x3f04d7d6, 0x851bf634, 0x9f2d7d5b, 0x3452b38a}
+
+/* Per-message Random Number */
+#define P_256_K			{0x580ec00d, 0x85643433, 0x4cef3f71, 0xecaed496, 0x5b12ae37, 0xfa47055b, 0x1965c7b1, 0x34ee45d0}
+
+/* Public Key */
+#define P_256_Q_X		{0x8101ece4, 0x7464a6ea, 0xd70cf69a, 0x6e2bd3d8, 0x8691a326, 0x2d22cba4, 0xf7635eaf, 0xf26680a8}
+#define P_256_Q_Y		{0xd8a12ba6, 0x1d599235, 0xf67d9cb4, 0xd58f1783, 0xd3ca43e7, 0x8f0a5aba, 0xa6240799, 0x36c0c3a9}
+
+/* Part of Signature */
+#define P_256_R_X		{0x7214bc96, 0x47160bbd, 0x39ff2f80, 0x533f5dc6, 0xddd70ddf, 0x86bb8156, 0x61e805d5, 0xd4e6f27c}
+#define P_256_R_Y		{0x8b81e3e9, 0x77597110, 0xc7cf2633, 0x435b2294, 0xb7264298, 0x7defd3d4, 0x007e1cfc, 0x5df84541}
+
+
+//------------------------------------------------------------------------------
+// P-384 Parameters and Test Vectors
+//------------------------------------------------------------------------------
+
+/* Field Size */
+#define P_384_Q			{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff}
+
+/* Generic Numbers */
+#define P_384_ZERO		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
+#define P_384_ONE		{0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001}
+
+/* Division Factor  */
+#define P_384_DELTA		{0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff, 0x80000000, 0x0000000, 0x080000000}
+
+/* Base Point */
+#define P_384_G_X		{0xaa87ca22, 0xbe8b0537, 0x8eb1c71e, 0xf320ad74, 0x6e1d3b62, 0x8ba79b98, 0x59f741e0, 0x82542a38, 0x5502f25d, 0xbf55296c, 0x3a545e38, 0x72760ab7}
+#define P_384_G_Y		{0x3617de4a, 0x96262c6f, 0x5d9e98bf, 0x9292dc29, 0xf8f41dbd, 0x289a147c, 0xe9da3113, 0xb5f0b8c0, 0x0a60b1ce, 0x1d7e819d, 0x7a431d7c, 0x90ea0e5f}
+
+/* Doubled Base Point */
+#define P_384_H_X		{0xaaf06bba, 0x82e9f590, 0xe29c71c2, 0x19bea517, 0x23c5893a, 0xe8b0c8cf, 0x4c117c3e, 0xfb57ab8d, 0x55fa1b42, 0x8155ad27, 0x8b574391, 0x1b13ea8a}
+#define P_384_H_Y		{0xc9e821b5, 0x69d9d390, 0xa2616740, 0x6d6d23d6, 0x070be242, 0xd765eb83, 0x1625ceec, 0x4a0f473e, 0xf59f4e30, 0xe2817e62, 0x85bce284, 0x6f15f19d}
+
+/* Base Point Order */
+#define P_384_N			{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xc7634d81, 0xf4372ddf, 0x581a0db2, 0x48b0a77a, 0xecec196a, 0xccc52973}
+
+/* Private Key */
+#define P_384_D			{0xc838b852, 0x53ef8dc7, 0x394fa580, 0x8a518398, 0x1c7deef5, 0xa69ba8f4, 0xf2117ffe, 0xa39cfcd9, 0x0e95f6cb, 0xc854abac, 0xab701d50, 0xc1f3cf24}
+
+/* Per-message Random Number */
+#define P_384_K			{0xdc6b4403, 0x6989a196, 0xe39d1cda, 0xc000812f, 0x4bdd8b2d, 0xb41bb33a, 0xf5137258, 0x5ebd1db6, 0x3f0ce827, 0x5aa1fd45, 0xe2d2a735, 0xf8749359}
+
+/* Public Key */
+#define P_384_Q_X		{0x1fbac8ee, 0xbd0cbf35, 0x640b39ef, 0xe0808dd7, 0x74debff2, 0x0a2a329e, 0x91713baf, 0x7d7f3c3e, 0x81546d88, 0x3730bee7, 0xe48678f8, 0x57b02ca0}
+#define P_384_Q_Y		{0xeb213103, 0xbd68ce34, 0x3365a8a4, 0xc3d4555f, 0xa385f533, 0x0203bdd7, 0x6ffad1f3, 0xaffb9575, 0x1c132007, 0xe1b24035, 0x3cb0a4cf, 0x1693bdf9}
+
+/* Part of Signature */
+#define P_384_R_X		{0xa0c27ec8, 0x93092dea, 0x1e1bd2cc, 0xfed3cf94, 0x5c8134ed, 0x0c9f8131, 0x1a0f4a05, 0x942db8db, 0xed8dd59f, 0x267471d5, 0x462aa14f, 0xe72de856}
+#define P_384_R_Y		{0x85564940, 0x9815bb91, 0x424eaca5, 0xfd76c973, 0x75d575d1, 0x422ec53d, 0x343bd33b, 0x847fdf0c, 0x11569685, 0xb528ab25, 0x49301542, 0x8d7cf72b}
+
+
+//------------------------------------------------------------------------------
+// Parameter and Test Vector Selection
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+
+#define ECDSA_Q			P_256_Q
+
+#define ECDSA_ZERO		P_256_ZERO
+#define ECDSA_ONE		P_256_ONE
+
+#define ECDSA_DELTA		P_256_DELTA
+
+#define ECDSA_G_X		P_256_G_X
+#define ECDSA_G_Y		P_256_G_Y
+
+#define ECDSA_H_X		P_256_H_X
+#define ECDSA_H_Y		P_256_H_Y
+
+#define ECDSA_N			P_256_N
+#define ECDSA_D			P_256_D
+#define ECDSA_K			P_256_K
+
+#define ECDSA_Q_X		P_256_Q_X
+#define ECDSA_Q_Y		P_256_Q_Y
+
+#define ECDSA_R_X		P_256_R_X
+#define ECDSA_R_Y		P_256_R_Y
+
+#elif USE_CURVE == 2
+
+#define ECDSA_Q			P_384_Q
+
+#define ECDSA_ZERO		P_384_ZERO
+#define ECDSA_ONE		P_384_ONE
+
+#define ECDSA_DELTA		P_384_DELTA
+
+#define ECDSA_G_X		P_384_G_X
+#define ECDSA_G_Y		P_384_G_Y
+
+#define ECDSA_H_X		P_384_H_X
+#define ECDSA_H_Y		P_384_H_Y
+
+#define ECDSA_N			P_384_N
+#define ECDSA_D			P_384_D
+#define ECDSA_K			P_384_K
+
+#define ECDSA_Q_X		P_384_Q_X
+#define ECDSA_Q_Y		P_384_Q_Y
+
+#define ECDSA_R_X		P_384_R_X
+#define ECDSA_R_Y		P_384_R_Y
+
+#else
+
+#error USE_CURVE must be either 1 or 2!
+
+#endif
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/ecdsa_model_fpga.cpp b/ecdsa_model_fpga.cpp
new file mode 100644
index 0000000..8ad0962
--- /dev/null
+++ b/ecdsa_model_fpga.cpp
@@ -0,0 +1,414 @@
+//------------------------------------------------------------------------------
+//
+// ecdsa_model_fpga.cpp
+// --------------------------------------------
+// Base point scalar multiplier model for ECDSA
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "ecdsa_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_modular.h"
+#include "fpga_curve.h"
+#include "fpga_util.h"
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void	fpga_model_init					();
+bool	test_base_point_multiplier		(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+
+bool	abuse_internal_point_adder		();
+bool	abuse_internal_point_doubler	();
+
+void	print_fpga_buffer				(const char *s, FPGA_BUFFER *v);
+
+bool	compare_fpga_buffers			(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *bx, FPGA_BUFFER *by);
+bool	compare_fpga_buffers			(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *az, FPGA_BUFFER *bx, FPGA_BUFFER *by, FPGA_BUFFER *bz);
+
+
+//------------------------------------------------------------------------------
+// Locals
+//------------------------------------------------------------------------------
+static FPGA_BUFFER ecdsa_d;
+static FPGA_BUFFER ecdsa_k;
+static FPGA_BUFFER ecdsa_n;
+
+
+//------------------------------------------------------------------------------
+int main()
+//------------------------------------------------------------------------------
+{
+	bool	ok;		// flag
+
+		//
+		// initialize buffers
+		//
+	fpga_model_init();
+	fpga_modular_init();
+	fpga_curve_init();
+
+	
+		//
+		// test base point multiplier: Q = d * G
+		//
+	printf("Trying to derive public key from private key...\n\n");
+	ok = test_base_point_multiplier(&ecdsa_d, &ecdsa_q_x, &ecdsa_q_y);
+	if (!ok) return EXIT_FAILURE;
+
+
+		//
+		// test base point multiplier: R = k * G
+		//
+	printf("Trying to sign something...\n\n");
+	ok = test_base_point_multiplier(&ecdsa_k, &ecdsa_r_x, &ecdsa_r_y);
+	if (!ok) return EXIT_FAILURE;
+
+
+		//
+		// test base point multiplier: O = n * G
+		//
+	printf("Trying to multiply the base point by its order...\n\n");
+	ok = test_base_point_multiplier(&ecdsa_n, &ecdsa_zero, &ecdsa_zero);
+	if (!ok) return EXIT_FAILURE;
+
+
+		//
+		// try to abuse internal point doubler
+		//
+	ok = abuse_internal_point_doubler();
+	if (!ok) return EXIT_FAILURE;
+	
+
+		//
+		// try to abuse internal point adder
+		//
+	ok = abuse_internal_point_adder();
+	if (!ok) return EXIT_FAILURE;
+	
+
+		//
+		// everything went just fine
+		//
+	return EXIT_SUCCESS;
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_model_init()
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+	FPGA_BUFFER tmp_d = ECDSA_D;
+	FPGA_BUFFER tmp_k = ECDSA_K;
+	FPGA_BUFFER tmp_n = ECDSA_N;
+
+		/* fill buffers for large multi-word integers */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{	ecdsa_d.words[w] = tmp_d.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_k.words[w] = tmp_k.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_n.words[w] = tmp_n.words[OPERAND_NUM_WORDS - (w+1)];
+	}
+}
+
+
+//------------------------------------------------------------------------------
+bool test_base_point_multiplier(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+//
+// k - multiplier
+//
+// qx, qy - expected coordinates of product
+//
+// Returns true when point (rx,ry) = k * G matches the point (qx,qy).
+//
+//------------------------------------------------------------------------------
+{
+	bool ok;				// flag
+	FPGA_BUFFER rx, ry;		// result
+
+		/* run the model */
+	fpga_curve_scalar_multiply(k, &rx, &ry);
+
+		/* handle result */
+	ok = compare_fpga_buffers(qx, qy, &rx, &ry);
+	if (!ok)
+	{	printf("\n    ERROR\n\n");
+		return false;
+	}
+	else printf("\n    OK\n\n");
+
+		// everything went just fine
+	return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool abuse_internal_point_doubler()
+//------------------------------------------------------------------------------
+//
+// This routine tries to abuse the internal curve point doubler by forcing it
+// to double point at infinity.
+//
+//------------------------------------------------------------------------------
+{
+	int w;		// word counter
+	bool ok;	// flag
+	
+	FPGA_BUFFER px, py, pz;		// input
+	FPGA_BUFFER qx, qy, qz;		// output
+
+		// set P.X and P.Y to some "random" garbage and P.Z to zero
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{	px.words[w] = ecdsa_g_x.words[w] ^ ecdsa_h_x.words[w];
+		py.words[w] = ecdsa_g_y.words[w] ^ ecdsa_h_y.words[w];
+	}
+	fpga_buffer_copy(&ecdsa_zero, &pz);
+
+		// try to double point at infinity (should produce point at infinity)
+	printf("Trying to double something at infinity...\n\n");
+	fpga_curve_double_jacobian(&px, &py, &pz, &qx, &qy, &qz);
+
+		// handle result
+	ok = compare_fpga_buffers(&ecdsa_one, &ecdsa_one, &ecdsa_zero, &qx, &qy, &qz);
+	if (! ok)
+	{	printf("\n    ERROR\n\n");
+		return false;
+	}
+	else printf("\n    OK\n\n");
+
+		// everything went just fine
+	return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool abuse_internal_point_adder()
+//------------------------------------------------------------------------------
+//
+// This routine tries to abuse the internal curve point adder by forcing it to
+// go throgh all the possible "corner cases".
+//
+//------------------------------------------------------------------------------
+{
+	int w;		// word counter	
+	bool ok;	// flag
+
+	FPGA_BUFFER px, py, pz;		// input
+	FPGA_BUFFER rx, ry, rz;		// output
+
+	//
+	// try to add point at infinity to the base point
+	//
+	{
+			// set P.X and P.Y to some "random" garbage and P.Z to zero
+		for (w=0; w<OPERAND_NUM_WORDS; w++)
+		{	px.words[w] = ecdsa_g_x.words[w] ^ ecdsa_h_x.words[w];
+			py.words[w] = ecdsa_g_y.words[w] ^ ecdsa_h_y.words[w];
+		}
+		fpga_buffer_copy(&ecdsa_zero, &pz);
+
+			// run addition proceduce
+		printf("Trying to add something at infinity to the base point...\n\n");
+		fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
+
+			// handle result
+		ok = compare_fpga_buffers(&ecdsa_g_x, &ecdsa_g_y, &ecdsa_one, &rx, &ry, &rz);
+		if (! ok)
+		{	printf("\n    ERROR\n\n");
+			return false;
+		}
+		else printf("\n    OK\n\n");
+	}
+
+	//
+	// try to add the base point to itself
+	//
+	{
+			// set P to G
+		fpga_buffer_copy(&ecdsa_g_x, &px);
+		fpga_buffer_copy(&ecdsa_g_y, &py);
+		fpga_buffer_copy(&ecdsa_one, &pz);
+
+			// run addition proceduce
+		printf("Trying to add the base point to itself...\n\n");
+		fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
+
+			// handle result
+		ok = compare_fpga_buffers(&ecdsa_h_x, &ecdsa_h_y, &ecdsa_one, &rx, &ry, &rz);
+		if (! ok)
+		{	printf("\n    ERROR\n\n");
+			return false;
+		}
+		else printf("\n    OK\n\n");
+	}
+
+	//
+	// try to add the base point to its opposite
+	//
+	{
+			// set P to (G.X, -G.Y, 1)
+		fpga_buffer_copy(&ecdsa_zero, &px);
+		fpga_buffer_copy(&ecdsa_zero, &py);
+		fpga_buffer_copy(&ecdsa_one,  &pz);
+
+		fpga_modular_add(&ecdsa_zero, &ecdsa_g_x, &px);
+		fpga_modular_sub(&ecdsa_zero, &ecdsa_g_y, &py);
+
+			// run addition proceduce
+		printf("Trying to add the base point to its opposite...\n\n");
+		fpga_curve_add_jacobian(&px, &py, &pz, &rx, &ry, &rz);
+
+			// handle result
+		ok = compare_fpga_buffers(&ecdsa_one, &ecdsa_one, &ecdsa_zero, &rx, &ry, &rz);
+		if (! ok)
+		{	printf("\n    ERROR\n\n");
+			return false;
+		}
+		else printf("\n    OK\n\n");
+	}
+
+		// everything went just fine
+	return true;
+}
+
+
+//------------------------------------------------------------------------------
+void print_fpga_buffer(const char *s, FPGA_BUFFER *buf)
+//------------------------------------------------------------------------------
+//
+// Pretty print large multi-word integer.
+//
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+		// print header
+	printf("%s", s);
+
+		// print all bytes
+	for (w=OPERAND_NUM_WORDS; w>0; w--)
+	{	
+		printf("%08x", buf->words[w-1]);
+
+			// insert space after every group of 4 bytes
+		if (w > 1) printf(" ");
+	}
+
+		// print footer
+	printf("\n");
+}
+
+
+//------------------------------------------------------------------------------
+bool compare_fpga_buffers(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *bx, FPGA_BUFFER *by)
+//------------------------------------------------------------------------------
+//
+// Compare affine coordinates of two points and return true when they match.
+//
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+		// print all the values
+	print_fpga_buffer("  Expected:   X = ", ax);
+	print_fpga_buffer("  Calculated: X = ", bx);
+	printf("\n");
+	print_fpga_buffer("  Expected:   Y = ", ay);
+	print_fpga_buffer("  Calculated: Y = ", by);
+
+		// compare values
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{
+			// compare x
+		if (ax->words[w] != bx->words[w]) return false;
+
+			// compare y
+		if (ay->words[w] != by->words[w]) return false;
+	}
+
+		// values are the same
+	return true;
+}
+
+
+//------------------------------------------------------------------------------
+bool compare_fpga_buffers(FPGA_BUFFER *ax, FPGA_BUFFER *ay, FPGA_BUFFER *az, FPGA_BUFFER *bx, FPGA_BUFFER *by, FPGA_BUFFER *bz)
+//------------------------------------------------------------------------------
+//
+// Compare projective coordinates of two points and return true when they match.
+//
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+		// print all the values
+	print_fpga_buffer("  Expected:   X = ", ax);
+	print_fpga_buffer("  Calculated: X = ", bx);
+	printf("\n");
+	print_fpga_buffer("  Expected:   Y = ", ay);
+	print_fpga_buffer("  Calculated: Y = ", by);
+	printf("\n");
+	print_fpga_buffer("  Expected:   Z = ", az);
+	print_fpga_buffer("  Calculated: Z = ", bz);
+
+		// compare values
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{
+			// compare x
+		if (ax->words[w] != bx->words[w]) return false;
+
+			// compare y
+		if (ay->words[w] != by->words[w]) return false;
+
+			// compare z
+		if (az->words[w] != bz->words[w]) return false;
+	}
+
+		// values are the same
+	return true;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_curve.cpp b/fpga_curve.cpp
new file mode 100644
index 0000000..9cc8ec0
--- /dev/null
+++ b/fpga_curve.cpp
@@ -0,0 +1,340 @@
+//------------------------------------------------------------------------------
+//
+// fpga_curve.cpp
+// ------------------------------------
+// Elliptic curve arithmetic procedures
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdsa_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_modular.h"
+#include "fpga_curve.h"
+#include "fpga_util.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ecdsa_g_x, ecdsa_g_y;
+FPGA_BUFFER ecdsa_h_x, ecdsa_h_y;
+FPGA_BUFFER ecdsa_q_x, ecdsa_q_y;
+FPGA_BUFFER ecdsa_r_x, ecdsa_r_y;
+
+
+//------------------------------------------------------------------------------
+void fpga_curve_init()
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+	FPGA_BUFFER tmp_g_x = ECDSA_G_X, tmp_g_y = ECDSA_G_Y;
+	FPGA_BUFFER tmp_h_x = ECDSA_H_X, tmp_h_y = ECDSA_H_Y;
+	FPGA_BUFFER tmp_q_x = ECDSA_Q_X, tmp_q_y = ECDSA_Q_Y;
+	FPGA_BUFFER tmp_r_x = ECDSA_R_X, tmp_r_y = ECDSA_R_Y;
+
+		/* fill buffers for large multi-word integers */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{	ecdsa_g_x.words[w]	= tmp_g_x.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_g_y.words[w]	= tmp_g_y.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_h_x.words[w]	= tmp_h_x.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_h_y.words[w]	= tmp_h_y.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_r_x.words[w]	= tmp_r_x.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_r_y.words[w]	= tmp_r_y.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_q_x.words[w]	= tmp_q_x.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_q_y.words[w]	= tmp_q_y.words[OPERAND_NUM_WORDS - (w+1)];
+	}
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point doubling routine.
+//
+// R(rx,ry,rz) = 2 * P(px,py,pz)
+//
+// Note, that P(px,py,pz) is supposed to be in projective Jacobian coordinates,
+// R will be in projective Jacobian coordinates.
+//
+// This routine implements algorithm 3.21 from "Guide to Elliptic Curve
+// Cryptography", the only difference is that step 6. does T1 = T2 + T2 and
+// then T2 = T2 + T1 instead of T2 = 3 * T2, because our addition is much
+// faster, than multiplication.
+//
+// Note, that this routine also handles one special case, namely when P is at
+// infinity.
+//
+// Instead of actual modular division, multiplication by pre-computed constant
+// (2^-1 mod q) is done.
+//
+// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
+// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
+// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
+// then fpga_modular_mul(pz, &t1, &t1) instead.
+//
+// WARNING: Though this procedure always does doubling steps, it does not take
+// any active measures to keep run-time constant. The main purpose of this
+// model is to help debug Verilog code for FPGA, so *DO NOT* use is anywhere
+// near production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_double_jacobian(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER t1, t2, t3;	// temporary variables
+
+		// check, whether P is at infinity
+	bool pz_is_zero = fpga_buffer_is_zero(pz);
+
+	/*  2. */ fpga_buffer_copy(pz,  &t1);
+              fpga_modular_mul(pz,  &t1,          &t1);
+	/*  3. */ fpga_modular_sub(px,  &t1,          &t2);
+	/*  4. */ fpga_modular_add(px,  &t1,          &t1);
+	/*  5. */ fpga_modular_mul(&t1, &t2,          &t2);
+	/*  6. */ fpga_modular_add(&t2, &t2,          &t1);
+	/*     */ fpga_modular_add(&t1, &t2,          &t2);
+	/*  7. */ fpga_modular_add(py,  py,           ry);
+	/*  8. */ fpga_modular_mul(pz,  ry,           rz);
+	/*  9. */ fpga_buffer_copy(ry,  &t1);
+	          fpga_buffer_copy(ry,  &t3);
+	          fpga_modular_mul(&t1, &t3,          ry);
+	/* 10. */ fpga_modular_mul(px,  ry,           &t3);
+	/* 11. */ fpga_buffer_copy(ry,  &t1);
+	          fpga_modular_mul(ry,  &t1,          &t1);
+	/* 12. */ fpga_modular_mul(&t1, &ecdsa_delta, ry);
+	/* 13. */ fpga_buffer_copy(&t2, &t1);
+	          fpga_modular_mul(&t1, &t2,          rx);
+	/* 14. */ fpga_modular_add(&t3, &t3,          &t1);
+	/* 15. */ fpga_modular_sub(rx,  &t1,          rx);
+	/* 16. */ fpga_modular_sub(&t3, rx,           &t1);	
+	/* 17. */ fpga_modular_mul(&t1, &t2,          &t1);
+	/* 18. */ fpga_modular_sub(&t1, ry,           ry);	
+
+		// handle special case (input point is at infinity)
+	if (pz_is_zero)
+	{	fpga_buffer_copy(&ecdsa_one,  rx);
+		fpga_buffer_copy(&ecdsa_one,  ry);
+		fpga_buffer_copy(&ecdsa_zero, rz);
+	}
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve point addition routine.
+//
+// R(rx,ry,rz) = P(px,py,pz) + Q(qx,qy)
+//
+// Note, that P(px, py, pz) is supposed to be in projective Jacobian
+// coordinates, while Q(qx,qy) is supposed to be in affine coordinates,
+// R(rx, ry, rz) will be in projective Jacobian coordinates. Moreover, in this
+// particular implementation Q is always the base point G.
+//
+// This routine implements algorithm 3.22 from "Guide to Elliptic Curve
+// Cryptography". Differences from the original algorithm:
+// 
+// 1) Step 1. is omitted, because point Q is always the base point, which is
+//    not at infinity by definition.
+//
+// 2) Step 9.1 just returns the pre-computed double of the base point instead
+// of actually doubling it.
+//
+// Note, that this routine also handles three special cases:
+//
+// 1) P is at infinity
+// 2) P == Q
+// 3) P == -Q
+//
+// Note, that FPGA modular multiplier can't multiply a given buffer by itself,
+// this way it's impossible to do eg. fpga_modular_mul(pz, pz, &t1). To overcome
+// the problem the algorithm was modified to do fpga_buffer_copy(pz, &t1) and
+// then fpga_modular_mul(pz, &t1, &t1) instead.
+//
+// WARNING: This procedure does not take any active measures to keep run-time
+// constant. The main purpose of this model is to help debug Verilog code for
+// FPGA, so *DO NOT* use is anywhere near production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_add_jacobian(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER t1, t2, t3, t4;		// temporary variables
+	
+	bool pz_is_zero = fpga_buffer_is_zero(pz);		// Step 2.
+
+	/*  3. */ fpga_buffer_copy(pz,  &t1);
+	          fpga_modular_mul(pz,  &t1,        &t1);
+	/*  4. */ fpga_modular_mul(pz,  &t1,        &t2);
+	/*  5. */ fpga_modular_mul(&t1, &ecdsa_g_x, &t1);
+	/*  6. */ fpga_modular_mul(&t2, &ecdsa_g_y, &t2);
+	/*  7. */ fpga_modular_sub(&t1, px,         &t1);
+	/*  8. */ fpga_modular_sub(&t2, py,         &t2);
+
+	bool t1_is_zero = fpga_buffer_is_zero(&t1);		// | Step 9.
+	bool t2_is_zero = fpga_buffer_is_zero(&t2);		// |
+
+	/* 10. */ fpga_modular_mul(pz,  &t1,        rz);
+	/* 11. */ fpga_buffer_copy(&t1, &t3);
+	          fpga_modular_mul(&t1, &t3,        &t3);
+	/* 12. */ fpga_modular_mul(&t1, &t3,        &t4);
+	/* 13. */ fpga_modular_mul(px,  &t3,        &t3);
+	/* 14. */ fpga_modular_add(&t3, &t3,        &t1);
+	/* 15. */ fpga_buffer_copy(&t2, rx);
+	          fpga_modular_mul(rx,  &t2,        rx);
+	/* 16. */ fpga_modular_sub(rx,  &t1,        rx);
+	/* 17. */ fpga_modular_sub(rx,  &t4,        rx);
+	/* 18. */ fpga_modular_sub(&t3, rx,         &t3);
+	/* 19. */ fpga_modular_mul(&t2, &t3,        &t3);
+	/* 20. */ fpga_modular_mul(py,  &t4,        &t4);
+	/* 21. */ fpga_modular_sub(&t3, &t4,        ry);
+
+		//
+		// final selection
+		//
+	if (pz_is_zero)	// P at infinity ?
+	{	fpga_buffer_copy(&ecdsa_g_x, rx);
+		fpga_buffer_copy(&ecdsa_g_y, ry);
+		fpga_buffer_copy(&ecdsa_one, rz);
+	}
+	else if (t1_is_zero) // same x for P and Q ?
+	{
+		fpga_buffer_copy(t2_is_zero ? &ecdsa_h_x : &ecdsa_one,  rx);	// | same y ? (P==Q => R=2*G) : (P==-Q => R=O)
+		fpga_buffer_copy(t2_is_zero ? &ecdsa_h_y : &ecdsa_one,  ry);	// |
+		fpga_buffer_copy(t2_is_zero ? &ecdsa_one : &ecdsa_zero, rz);	// |
+	}
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Conversion from projective Jacobian to affine coordinates.
+//
+// P(px,py,pz) -> Q(qx,qy)
+//
+// Note, that qx = px / Z^2 and qy = py / Z^3. Division in modular arithmetic
+// is equivalent to multiplication by the inverse value of divisor, so
+// qx = px * (pz^-1)^2 and qy = py * (pz^-1)^3.
+//
+// Note, that this procedure does *NOT* handle points at infinity correctly. It
+// can only be called from the base point multiplication routine, that
+// specifically makes sure that P is not at infinity, so pz will always be
+// non-zero value.
+//
+//------------------------------------------------------------------------------
+void fpga_curve_point_to_affine(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+{
+	FPGA_BUFFER pz1;					// inverse value of pz
+	FPGA_BUFFER t2, t3;					// intermediate values
+
+	fpga_modular_inv(pz, &pz1);			// pz1 = pz^-1 (mod q)
+
+	fpga_modular_mul(&pz1, &pz1, &t2);	// t2 = pz1 ^ 2 (mod q)
+	fpga_modular_mul(&pz1, &t2,  &t3);	// t3 = tz1 ^ 3 (mod q)
+
+	fpga_modular_mul(px, &t2, qx);		// qx = px * (pz^-1)^2 (mod q)
+	fpga_modular_mul(py, &t3, qy);		// qy = py * (pz^-1)^3 (mod q)
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Elliptic curve base point scalar multiplication routine.
+//
+// Q(qx,qy) = k * G(px,py)
+//
+// Note, that Q is supposed to be in affine coordinates. Multiplication is done
+// using the double-and-add algorithm 3.27 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// WARNING: Though this procedure always does the addition step, it only
+// updates the result when current bit of k is set. It does not take any
+// active measures to keep run-time constant. The main purpose of this model
+// is to help debug Verilog code for FPGA, so *DO NOT* use it anywhere near
+// production!
+//
+//------------------------------------------------------------------------------
+void fpga_curve_scalar_multiply(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy)
+//------------------------------------------------------------------------------
+{
+	int word_count, bit_count;	// counters
+
+	FPGA_BUFFER rx, ry, rz;		// intermediate result
+	FPGA_BUFFER tx, ty, tz;		// temporary variable
+
+		/* set initial value of R to point at infinity */
+	fpga_buffer_copy(&ecdsa_one,  &rx);
+	fpga_buffer_copy(&ecdsa_one,  &ry);
+	fpga_buffer_copy(&ecdsa_zero, &rz);
+
+		/* process bits of k left-to-right */
+	for (word_count=OPERAND_NUM_WORDS; word_count>0; word_count--)
+		for (bit_count=FPGA_WORD_WIDTH; bit_count>0; bit_count--)
+		{
+				/* calculate T = 2 * R */
+			fpga_curve_double_jacobian(&rx, &ry, &rz, &tx, &ty, &tz);
+
+				/* always calculate R = T + P for constant-time */
+			fpga_curve_add_jacobian(&tx, &ty, &tz, &rx, &ry, &rz);
+
+				/* revert to the value of T before addition if the current bit of k is not set */
+			if (!((k->words[word_count-1] >> (bit_count-1)) & 1))
+			{	fpga_buffer_copy(&tx, &rx);
+				fpga_buffer_copy(&ty, &ry);
+				fpga_buffer_copy(&tz, &rz);
+			}
+
+		}
+
+		// convert result to affine coordinates anyway
+	fpga_curve_point_to_affine(&rx, &ry, &rz, qx, qy);
+
+		// check, that rz is non-zero (not point at infinity)
+	bool rz_is_zero = fpga_buffer_is_zero(&rz);
+
+		// handle special case (result is point at infinity)
+	if (rz_is_zero)
+	{	fpga_buffer_copy(&ecdsa_zero, qx);
+		fpga_buffer_copy(&ecdsa_zero, qy);
+	}
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_curve.h b/fpga_curve.h
new file mode 100644
index 0000000..c90cc16
--- /dev/null
+++ b/fpga_curve.h
@@ -0,0 +1,61 @@
+//------------------------------------------------------------------------------
+//
+// fpga_curve.h
+// ------------------------------------
+// Elliptic curve arithmetic procedures
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ecdsa_g_x, ecdsa_g_y;
+extern FPGA_BUFFER ecdsa_h_x, ecdsa_h_y;
+extern FPGA_BUFFER ecdsa_q_x, ecdsa_q_y;
+extern FPGA_BUFFER ecdsa_r_x, ecdsa_r_y;
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void	fpga_curve_init					();
+void	fpga_curve_scalar_multiply		(FPGA_BUFFER *k, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+void	fpga_curve_add_jacobian			(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz);
+void	fpga_curve_double_jacobian		(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *rx, FPGA_BUFFER *ry, FPGA_BUFFER *rz);
+void	fpga_curve_point_to_affine		(FPGA_BUFFER *px, FPGA_BUFFER *py, FPGA_BUFFER *pz, FPGA_BUFFER *qx, FPGA_BUFFER *qy);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_lowlevel.cpp b/fpga_lowlevel.cpp
new file mode 100644
index 0000000..511856a
--- /dev/null
+++ b/fpga_lowlevel.cpp
@@ -0,0 +1,137 @@
+//------------------------------------------------------------------------------
+//
+// fpga_lowlevel.cpp
+// -----------------------------------
+// Models of Low-level FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdsa_model.h"
+#include "fpga_lowlevel.h"
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 32-bit adder with carry input and carry output.
+//
+// Carries are 1 bit wide.
+//
+// {c_out, s} = x + y + c_in
+//
+//------------------------------------------------------------------------------
+void fpga_lowlevel_add32(FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out)
+//------------------------------------------------------------------------------
+{
+		// calculate sum
+	FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x + (FPGA_WORD_EXTENDED)y;
+
+		// process carry input
+	if (c_in) r++;
+
+		// store sum
+	*s = (FPGA_WORD)r;
+
+		// shift sum to obtain carry flag
+	r >>= FPGA_WORD_WIDTH;
+
+		// store carry
+	*c_out = (r != 0);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 32-bit subtractor with borrow input and borrow output.
+//
+// Borrows are 1 bit wide.
+//
+// {b_out, d} = x - y - b_in
+//
+void fpga_lowlevel_sub32(FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out)
+//------------------------------------------------------------------------------
+{
+		// calculate difference
+	FPGA_WORD_EXTENDED r = (FPGA_WORD_EXTENDED)x - (FPGA_WORD_EXTENDED)y;
+
+		// process borrow input
+	if (b_in) r--;
+
+		// store difference
+	*d = (FPGA_WORD)r;
+
+		// shift difference to obtain borrow flag
+	r >>= FPGA_WORD_WIDTH;
+
+		// store borrow
+	*b_out = (r != 0);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 16x16-bit multiplier.
+//
+// Inputs are 16-bit wide, outputs are 32-bit wide.
+//
+// p = x * y
+//
+void fpga_lowlevel_mul16(FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p)
+//------------------------------------------------------------------------------
+{
+		// multiply and store result
+	*p = (FPGA_WORD)x * (FPGA_WORD)y;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Low-level 48-bit adder without carries.
+//
+// s = (x + y)[47:0]
+//
+void fpga_lowlevel_add48(FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s)
+//------------------------------------------------------------------------------
+{
+		// add and store result truncated to 48 bits
+	*s = (x + y) & FPGA_MASK_ADDER48;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_lowlevel.h b/fpga_lowlevel.h
new file mode 100644
index 0000000..eeb5c4f
--- /dev/null
+++ b/fpga_lowlevel.h
@@ -0,0 +1,80 @@
+//------------------------------------------------------------------------------
+//
+// fpga_lowlevel.h
+// -----------------------------------
+// Models of Low-level FPGA primitives
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// FPGA Pipeline Settings
+//------------------------------------------------------------------------------
+#define FPGA_WORD_WIDTH		(32)
+#define OPERAND_NUM_WORDS	(OPERAND_WIDTH / FPGA_WORD_WIDTH)
+
+
+//------------------------------------------------------------------------------
+// Word Types (Normal, Extended, Reduced)
+//------------------------------------------------------------------------------
+typedef uint32_t FPGA_WORD;
+typedef uint64_t FPGA_WORD_EXTENDED;
+typedef uint16_t FPGA_WORD_REDUCED;
+
+
+//------------------------------------------------------------------------------
+// Extended Adder Mask
+//------------------------------------------------------------------------------
+#define FPGA_MASK_ADDER48	((FPGA_WORD_EXTENDED)0x0000FFFFFFFFFFFF)
+
+
+//------------------------------------------------------------------------------
+struct FPGA_BUFFER
+//------------------------------------------------------------------------------
+{
+	FPGA_WORD	words[OPERAND_NUM_WORDS];
+};
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void	fpga_lowlevel_add32		(FPGA_WORD x, FPGA_WORD y, bool c_in, FPGA_WORD *s, bool *c_out);
+void	fpga_lowlevel_sub32		(FPGA_WORD x, FPGA_WORD y, bool b_in, FPGA_WORD *d, bool *b_out);
+void	fpga_lowlevel_mul16		(FPGA_WORD_REDUCED x, FPGA_WORD_REDUCED y, FPGA_WORD *p);
+void	fpga_lowlevel_add48		(FPGA_WORD_EXTENDED x, FPGA_WORD_EXTENDED y, FPGA_WORD_EXTENDED *s);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_modular.cpp b/fpga_modular.cpp
new file mode 100644
index 0000000..af485a0
--- /dev/null
+++ b/fpga_modular.cpp
@@ -0,0 +1,831 @@
+//------------------------------------------------------------------------------
+//
+// fpga_modular.cpp
+// ---------------------------
+// Modular arithmetic routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdsa_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_modular.h"
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+FPGA_BUFFER ecdsa_q;
+FPGA_BUFFER ecdsa_zero;
+FPGA_BUFFER ecdsa_one;
+FPGA_BUFFER ecdsa_delta;
+
+
+//------------------------------------------------------------------------------
+void fpga_modular_init()
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+	FPGA_BUFFER tmp_q		= ECDSA_Q;
+	FPGA_BUFFER tmp_zero	= ECDSA_ZERO;
+	FPGA_BUFFER tmp_one		= ECDSA_ONE;
+	FPGA_BUFFER tmp_delta	= ECDSA_DELTA;
+
+		/* fill buffers for large multi-word integers */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{	ecdsa_q.words[w]		= tmp_q.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_zero.words[w]		= tmp_zero.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_one.words[w]		= tmp_one.words[OPERAND_NUM_WORDS - (w+1)];
+		ecdsa_delta.words[w]	= tmp_delta.words[OPERAND_NUM_WORDS - (w+1)];
+	}
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular addition.
+//
+// This routine implements algorithm 3. from "Ultra High Performance ECC over
+// NIST Primes on Commercial FPGAs".
+//
+// s = (a + b) mod q
+//
+// The naive approach is like the following:
+//
+// 1. s = a + b
+// 2. if (s >= q) s -= q
+//
+// The speed-up trick is to simultaneously calculate (a + b) and (a + b - q)
+// and then select the right variant.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_add(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s)
+//------------------------------------------------------------------------------
+{
+	int w;					// word counter
+	FPGA_BUFFER ab, ab_n;	// intermediate buffers
+	bool c_in, c_out;		// carries
+	bool b_in, b_out;		// borrows
+
+	c_in = false;			// first word has no carry
+	b_in = false;			// first word has no borrow
+	
+		// run parallel addition and subtraction
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{
+		fpga_lowlevel_add32(a->words[w], b->words[w],      c_in, &ab.words[w],   &c_out);
+		fpga_lowlevel_sub32(ab.words[w], ecdsa_q.words[w], b_in, &ab_n.words[w], &b_out);
+
+		c_in = c_out;	// propagate carry
+		b_in = b_out;	// propagate borrow
+	}
+
+		// now select the right buffer
+
+		/*
+		 * We select the right variant based on borrow and carry flags after
+		 * addition and subtraction of the very last pair of words. Note, that
+		 * we only need to select the first variant (a + b) when (a + b) < q.
+		 * This way if we get negative number after subtraction, we discard it
+		 * and use the output of the adder instead. The subtractor output is
+		 * negative when borrow flag is set *and* carry flag is not set. When
+		 * both borrow and carry are set, the number is non-negative, because
+		 * borrow and carry cancel each other out.
+		 */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		s->words[w] = (b_out && !c_out) ? ab.words[w] : ab_n.words[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular subtraction.
+//
+// This routine implements algorithm 3. from "Ultra High Performance ECC over
+// NIST Primes on Commercial FPGAs".
+//
+// d = (a - b) mod q
+//
+// The naive approach is like the following:
+//
+// 1. d = a - b
+// 2. if (a < b) d += q
+//
+// The speed-up trick is to simultaneously calculate (a - b) and (a - b + q)
+// and then select the right variant.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_sub(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d)
+//------------------------------------------------------------------------------
+{
+	int w;					// word counter
+	FPGA_BUFFER ab, ab_n;	// intermediate buffers
+	bool c_in, c_out;		// carries
+	bool b_in, b_out;		// borrows
+
+	c_in = false;			// first word has no carry
+	b_in = false;			// first word has no borrow
+	
+		// run parallel subtraction and addition
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+	{
+		fpga_lowlevel_sub32(a->words[w], b->words[w],          b_in, &ab.words[w],   &b_out);
+		fpga_lowlevel_add32(ab.words[w], ecdsa_q.words[w], c_in, &ab_n.words[w], &c_out);
+
+		b_in = b_out;	// propagate borrow
+		c_in = c_out;	// propagate carry
+	}
+
+		// now select the right buffer
+
+		/*
+		 * We select the right variant based on borrow flag after subtraction
+		 * and addition of the very last pair of words. Note, that we only
+		 * need to select the second variant (a - b + q) when a < b. This way
+		 * if we get negative number after subtraction, we discard it
+		 * and use the output of the adder instead. The Subtractor output is
+		 * negative when borrow flag is set.
+		 */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		d->words[w] = b_out ? ab_n.words[w] : ab.words[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular multiplication.
+//
+// This routine implements modular multiplication algorithm from "Ultra High
+// Performance ECC over NIST Primes on Commercial FPGAs".
+//
+// p = (a * b) mod q
+//
+// The complex algorithm is split into three parts:
+//
+// 1. Calculation of partial words
+// 2. Acccumulation of partial words into full-size product
+// 3. Modular reduction of the full-size product
+//
+// See comments for corresponding helper routines for more information.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p)
+//------------------------------------------------------------------------------
+{
+	FPGA_WORD_EXTENDED si[4*OPERAND_NUM_WORDS-1];	// parts of intermediate product
+	FPGA_WORD c[2*OPERAND_NUM_WORDS];				// full-size intermediate product
+
+		/* multiply to get partial words */
+	fpga_modular_mul_helper_multiply(a, b, si);
+
+		/* accumulate partial words into full-size product */
+	fpga_modular_mul_helper_accumulate(si, c);
+
+		/* reduce full-size product using special routine */
+	fpga_modular_mul_helper_reduce(c, p);
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Modular multiplicative inversion procedure.
+//
+// a1 = a^-1 (mod n)
+//
+// This routine implements the algorithm from "Constant Time Modular
+// Inversion" by Joppe W. Bos (http://www.joppebos.com/files/CTInversion.pdf)
+//
+// The algorithm has two phases: 1) calculation of "almost" modular inverse,
+// which is a^-1*2^k and 2) removal of redundant factor 2^k.
+//
+// The first stage has four temporary variables: r, s, u, v; they are updated
+// at every iteration. Depending on flags there can be four branches, FPGA
+// will pre-calculate all possible values in parallel and then use a multiplexor
+// to select the next value. This implementation also calculates all possible
+// outcomes. This is done for debugging purposes, *NOT* for constant run-time!
+//
+// The second part only works with s and k.
+//
+// Note, that k is a simple counter, and it can't exceed 2*OPERAND_WIDTH.
+//
+// The complex inversion algorithm uses helper routines. Note, that width of the
+// intermediate results can temporarily exceed OPERAND_WIDTH, so all the helper
+// routines process OPERAND_NUM_WORDS+1 words.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv(FPGA_BUFFER *a, FPGA_BUFFER *a1)
+{
+	int i;	// round counter
+	int w;	// word counter
+	int k;	// redundant power of 2
+
+		/* q, 1 */
+	FPGA_WORD buf_q[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_1[OPERAND_NUM_WORDS+1];
+
+		/* r, s */
+	FPGA_WORD buf_r[OPERAND_NUM_WORDS+1],        buf_s[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_r_double[OPERAND_NUM_WORDS+1], buf_s_double[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_r_new[OPERAND_NUM_WORDS+1],    buf_s_new[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_r_plus_s[OPERAND_NUM_WORDS+1], buf_s_plus_r[OPERAND_NUM_WORDS+1];
+
+		/* u, v */
+	FPGA_WORD buf_u[OPERAND_NUM_WORDS+1],              buf_v[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_u_half[OPERAND_NUM_WORDS+1],         buf_v_half[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_u_minus_v[OPERAND_NUM_WORDS+1],      buf_v_minus_u[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_u_minus_v_half[OPERAND_NUM_WORDS+1], buf_v_minus_u_half[OPERAND_NUM_WORDS+1];
+	FPGA_WORD buf_u_new[OPERAND_NUM_WORDS+1],          buf_v_new[OPERAND_NUM_WORDS+1];
+
+		/* comparison */
+	int cmp_v_1, cmp_u_v;
+
+		/* clear buffers */
+	for (w=0; w<=OPERAND_NUM_WORDS; w++)
+		buf_r[w] = 0, buf_s[w] = 0,
+		buf_u[w] = 0, buf_v[w] = 0,
+		buf_q[w] = 0, buf_1[w] = 0;
+
+		/* initialize q, 1 */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		buf_q[w] = ecdsa_q.words[w], buf_1[w] = ecdsa_one.words[w];
+
+		/* initialize r, s */
+	buf_r[0] = 0, buf_s[0] = 1;
+
+		/* initialize u, v */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		buf_u[w] = ecdsa_q.words[w], buf_v[w] = a->words[w];
+
+		/* initialize k */
+	k = 0;
+
+		/* flags for the first stage */
+	bool v_is_1, u_is_greater_than_v, u_is_even, v_is_even;			
+
+		/* first stage */
+	for (i=0; i<(2*OPERAND_WIDTH); i++)
+	{
+			/* pre-calculate all possible values for r and s */
+		fpga_modular_inv_helper_shl(buf_r, buf_r_double);			// r_double = 2 * r
+		fpga_modular_inv_helper_shl(buf_s, buf_s_double);			// s_double = 2 * s
+		fpga_modular_inv_helper_add(buf_r, buf_s, buf_r_plus_s);	// r_plus_s = r + s
+		fpga_modular_inv_helper_add(buf_s, buf_r, buf_s_plus_r);	// s_plus_r = s + r
+
+			/* pre-calculate all possible values for u and v */
+		fpga_modular_inv_helper_shr(buf_u, buf_u_half);						// u_half = u / 2
+		fpga_modular_inv_helper_shr(buf_v, buf_v_half);						// v_half = v / 2
+		fpga_modular_inv_helper_sub(buf_u, buf_v, buf_u_minus_v);			// u_minus_v = u - v
+		fpga_modular_inv_helper_sub(buf_v, buf_u, buf_v_minus_u);			// v_minus_u = v - u
+		fpga_modular_inv_helper_shr(buf_u_minus_v, buf_u_minus_v_half);		// u_minus_v_half = u_minus_v / 2
+		fpga_modular_inv_helper_shr(buf_v_minus_u, buf_v_minus_u_half);		// v_minus_u_half = v_minus_u / 2
+
+			/* compare */
+		fpga_modular_inv_helper_cmp(buf_v, buf_1, &cmp_v_1);
+		fpga_modular_inv_helper_cmp(buf_u, buf_v, &cmp_u_v);
+
+			/* assign flags */
+		v_is_1				= (cmp_v_1 == 0);
+		u_is_greater_than_v	= (cmp_u_v  > 0);
+		u_is_even			= !(buf_u[0] & 1);
+		v_is_even			= !(buf_v[0] & 1);
+
+			/* select u */
+		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_u_new, buf_u_half);
+		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_u_new, buf_u);
+		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_u_new, u_is_greater_than_v ? buf_u_minus_v_half : buf_u);
+
+			/* select v */
+		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_v_new, buf_v);
+		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_v_new, buf_v_half);
+		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_v_new, u_is_greater_than_v ? buf_v : buf_v_minus_u_half);
+
+			/* select r */
+		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_r_new, buf_r);
+		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_r_new, buf_r_double);
+		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_r_new, u_is_greater_than_v ? buf_r_plus_s : buf_r_double);
+
+			/* select s */
+		if ( u_is_even)					fpga_modular_inv_helper_cpy(buf_s_new, buf_s_double);
+		if (!u_is_even &&  v_is_even)	fpga_modular_inv_helper_cpy(buf_s_new, buf_s);
+		if (!u_is_even && !v_is_even)	fpga_modular_inv_helper_cpy(buf_s_new, u_is_greater_than_v ? buf_s_double : buf_s_plus_r);
+
+			/* update values */
+		if (!v_is_1)
+		{	fpga_modular_inv_helper_cpy(buf_u, buf_u_new);
+			fpga_modular_inv_helper_cpy(buf_v, buf_v_new);
+			fpga_modular_inv_helper_cpy(buf_r, buf_r_new);
+			fpga_modular_inv_helper_cpy(buf_s, buf_s_new);
+		}
+
+			/* update k */
+		if (!v_is_1) k++;
+	}
+
+		//
+		// Note, that to save FPGA resources, the second stage re-uses buffers
+		// used in the first stage.
+		//
+
+		/* flags for the second stage */
+	bool k_is_0, s_is_odd;
+
+		/* second stage */
+	for (i=0; i<(2*OPERAND_WIDTH); i++)
+	{
+			/* pre-calculate all possible values */
+		fpga_modular_inv_helper_shr(buf_s, buf_u);
+		fpga_modular_inv_helper_add(buf_s, buf_q, buf_r);
+		fpga_modular_inv_helper_shr(buf_r, buf_v);
+
+			//
+			// assign flags
+			//
+		s_is_odd = buf_s[0] & 1;
+		k_is_0   = (k == 0);
+
+			//
+			// select new values based on flags
+			//
+		fpga_modular_inv_helper_cpy(buf_s_new, s_is_odd ? buf_v : buf_u);
+
+			/* update s */
+		if (! k_is_0)
+			fpga_modular_inv_helper_cpy(buf_s, buf_s_new);
+
+			/* update k */
+		if (! k_is_0) k--;
+	}
+
+		/* done, copy s into output buffer */
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		a1->words[w] = buf_s[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Parallelized multiplication.
+//
+// This routine implements the algorithm in Fig. 3. from "Ultra High
+// Performance ECC over NIST Primes on Commercial FPGAs".
+//
+// Inputs a and b are split into 2*OPERAND_NUM_WORDS words of FPGA_WORD_WIDTH/2
+// bits each, because FPGA multipliers can't handle full FPGA_WORD_WIDTH-wide
+// inputs. These smaller words are multiplied by an array of 2*OPERAND_NUM_WORDS
+// multiplers and accumulated into an array of 4*OPERAND_NUM_WORDS-1 partial
+// output words si[].
+//
+// The order of loading a and b into the multipliers is a bit complicated,
+// during the first 2*OPERAND_NUM_WORDS-1 cycles one si word per cycle is
+// obtained, during the very last 2*OPERAND_NUM_WORDS'th cycle all the
+// remaining 2*OPERAND_NUM_WORDS partial words are obtained simultaneously.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul_helper_multiply(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si)
+//------------------------------------------------------------------------------
+{
+	int w;			// counter
+	int t, x;		// more counters
+	int j, i;		// word indices
+	FPGA_WORD p;	// product
+
+		// buffers for smaller words that multipliers can handle
+	FPGA_WORD_REDUCED ai[2*OPERAND_NUM_WORDS];
+	FPGA_WORD_REDUCED bj[2*OPERAND_NUM_WORDS];
+	
+		// split a and b into smaller words
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		ai[2*w] = (FPGA_WORD_REDUCED)a->words[w], ai[2*w + 1] = (FPGA_WORD_REDUCED)(a->words[w] >> (FPGA_WORD_WIDTH / 2)),
+		bj[2*w] = (FPGA_WORD_REDUCED)b->words[w], bj[2*w + 1] = (FPGA_WORD_REDUCED)(b->words[w] >> (FPGA_WORD_WIDTH / 2));
+
+		// accumulators
+	FPGA_WORD_EXTENDED mac[2*OPERAND_NUM_WORDS];
+	
+		// clear accumulators
+	for (w=0; w<(2*OPERAND_NUM_WORDS); w++) mac[w] = 0;
+
+		// run the crazy algorithm :)
+	for (t=0; t<(2*OPERAND_NUM_WORDS); t++)
+	{
+			// save upper half of si[] (one word per cycle)
+		if (t > 0)
+		{	si[4*OPERAND_NUM_WORDS - (t+1)] = mac[t];
+			mac[t] = 0;
+		}
+
+			// update index
+		j = 2*OPERAND_NUM_WORDS - (t+1);
+
+			// parallel multiplication
+		for (x=0; x<(2*OPERAND_NUM_WORDS); x++)
+		{
+				// update index
+			i = t - x;
+			if (i < 0) i += 2*OPERAND_NUM_WORDS;
+
+				// multiply...
+			fpga_lowlevel_mul16(ai[i], bj[j], &p);
+
+				// ...accumulate
+			mac[x] += p;
+		}
+
+	}
+
+		// now finally save lower half of si[] (2*OPERAND_NUM_WORDS words at once)
+	for (w=0; w<(2*OPERAND_NUM_WORDS); w++)
+		si[w] = mac[2*OPERAND_NUM_WORDS - (w+1)];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Accumulation of partial words into full-size product.
+//
+// This routine implements the Algorithm 4. from "Ultra High Performance ECC
+// over NIST Primes on Commercial FPGAs".
+//
+// Input words si[] are accumulated into full-size product c[].
+//
+// The algorithm is a bit tricky, there are 4*OPERAND_NUM_WORDS-1 words in
+// si[]. Complete operation takes 2*OPERAND_NUM_WORDS cycles, even words are
+// summed in full, odd words are split into two parts. During every cycle the
+// upper part of the previous word and the lower part of the following word are
+// summed too.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_mul_helper_accumulate(FPGA_WORD_EXTENDED *si, FPGA_WORD *c)
+//------------------------------------------------------------------------------
+{
+	int w;							// word counter
+	FPGA_WORD_EXTENDED cw0, cw1;	// intermediate sums
+	FPGA_WORD_REDUCED  cw_carry;	// wide carry
+
+		// clear carry
+	cw_carry = 0;
+
+		// execute the algorithm
+	for (w=0; w<(2*OPERAND_NUM_WORDS); w++)
+	{
+			// handy flags
+		bool w_is_first = (w == 0);
+		bool w_is_last  = (w == (2*OPERAND_NUM_WORDS-1));
+
+			// accumulate full current even word...
+			// ...and also the upper part of the previous odd word (if not the first word)
+		fpga_lowlevel_add48(si[2*w], w_is_first ? 0 : si[2*w-1] >> (FPGA_WORD_WIDTH / 2), &cw0);
+
+			// generate another word from "carry" part of the previous even word...
+			// ...and also the lower part of the following odd word (if not the last word)
+		cw1 = w_is_last ? 0 : (FPGA_WORD)(si[2*w+1] << (FPGA_WORD_WIDTH / 2));
+		cw1 |= (FPGA_WORD_EXTENDED)cw_carry;
+
+			// accumulate once again
+		fpga_lowlevel_add48(cw0, cw1, &cw1);
+
+			// store current word...
+		c[w] = (FPGA_WORD)cw1;
+
+			// ...and carry
+		cw_carry = (FPGA_WORD_REDUCED) (cw1 >> FPGA_WORD_WIDTH);
+	}
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Fast modular reduction for NIST prime P-256.
+//
+// p = c mod p256
+//
+// This routine implements the algorithm 2.29 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
+// on the other hand is the output of the parallelized Comba multiplier, so it
+// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
+//
+// To save FPGA resources, the calculation is done using only two adders and
+// one subtractor. The algorithm is split into five steps.
+//
+//------------------------------------------------------------------------------
+#if USE_CURVE == 1
+void fpga_modular_mul_helper_reduce_p256(FPGA_WORD *c, FPGA_BUFFER *p)
+{
+		// "funny" words
+	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9;
+
+		// compose "funny" words out of input words
+	s1.words[7] = c[ 7], s1.words[6] = c[ 6], s1.words[5] = c[ 5], s1.words[4] = c[ 4], s1.words[3] = c[ 3], s1.words[2] = c[ 2], s1.words[1] = c[ 1], s1.words[0] = c[ 0];
+	s2.words[7] = c[15], s2.words[6] = c[14], s2.words[5] = c[13], s2.words[4] = c[12], s2.words[3] = c[11], s2.words[2] = 0,     s2.words[1] = 0,     s2.words[0] = 0;
+	s3.words[7] = 0,     s3.words[6] = c[15], s3.words[5] = c[14], s3.words[4] = c[13], s3.words[3] = c[12], s3.words[2] = 0,     s3.words[1] = 0,     s3.words[0] = 0;
+	s4.words[7] = c[15], s4.words[6] = c[14], s4.words[5] = 0,     s4.words[4] = 0,     s4.words[3] = 0,     s4.words[2] = c[10], s4.words[1] = c[ 9], s4.words[0] = c[ 8];
+	s5.words[7] = c[ 8], s5.words[6] = c[13], s5.words[5] = c[15], s5.words[4] = c[14], s5.words[3] = c[13], s5.words[2] = c[11], s5.words[1] = c[10], s5.words[0] = c[ 9];
+	s6.words[7] = c[10], s6.words[6] = c[ 8], s6.words[5] = 0,     s6.words[4] = 0,     s6.words[3] = 0,     s6.words[2] = c[13], s6.words[1] = c[12], s6.words[0] = c[11];
+	s7.words[7] = c[11], s7.words[6] = c[ 9], s7.words[5] = 0,     s7.words[4] = 0,     s7.words[3] = c[15], s7.words[2] = c[14], s7.words[1] = c[13], s7.words[0] = c[12];
+	s8.words[7] = c[12], s8.words[6] = 0,     s8.words[5] = c[10], s8.words[4] = c[ 9], s8.words[3] = c[ 8], s8.words[2] = c[15], s8.words[1] = c[14], s8.words[0] = c[13];
+	s9.words[7] = c[13], s9.words[6] = 0,     s9.words[5] = c[11], s9.words[4] = c[10], s9.words[3] = c[ 9], s9.words[2] = 0,     s9.words[1] = c[15], s9.words[0] = c[14];
+	
+		// intermediate results
+	FPGA_BUFFER sum0, sum1, difference;
+
+		/* Step 1. */
+	fpga_modular_add(&s2,         &s2,         &sum0);			// sum0 = 2*s2
+	fpga_modular_add(&s3,         &s3,         &sum1);			// sum1 = 2*s3
+	fpga_modular_sub(&ecdsa_zero, &s6,         &difference);	// difference = -s6
+
+		/* Step 2. */
+	fpga_modular_add(&sum0,       &s1,         &sum0);			// sum0 = s1 + 2*s2
+	fpga_modular_add(&sum1,       &s4,         &sum1);			// sum1 = s4 + 2*s3
+	fpga_modular_sub(&difference, &s7,         &difference);	// difference = -(s6 + s7)
+
+		/* Step 3. */
+	fpga_modular_add(&sum0,       &s5,         &sum0);			// sum0 = s1 + 2*s2 + s5
+	fpga_modular_add(&sum1,       &ecdsa_zero, &sum1);			// compulsory cycle to keep sum1 constant for next stage
+	fpga_modular_sub(&difference, &s8,         &difference);	// difference = -(s6 + s7 + s8)
+
+		/* Step 4. */
+	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + 2*s3 + s4 + s5
+//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
+	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s6 + s7 + s8 + s9)
+
+		/* Step 5. */
+	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + 2*s3 + s4 + s5 - s6 - s7 - s8 - s9
+//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
+//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+//
+// Fast modular reduction for NIST prime P-384.
+//
+// p = c mod p384
+//
+// This routine implements the algorithm 2.30 from "Guide to Elliptic Curve
+// Cryptography".
+//
+// Output p is OPERAND_WIDTH wide (contains OPERAND_NUM_WORDS words), input c
+// on the other hand is the output of the parallelized Comba multiplier, so it
+// is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
+//
+// ...
+//
+//------------------------------------------------------------------------------
+#if USE_CURVE == 2
+void fpga_modular_mul_helper_reduce_p384(FPGA_WORD *c, FPGA_BUFFER *p)
+{
+		// "funny" words
+	FPGA_BUFFER s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+		// compose "funny" words
+	 s1.words[11] = c[11],   s1.words[10] = c[10],   s1.words[ 9] = c[ 9],   s1.words[ 8] = c[ 8],   s1.words[ 7] = c[ 7],   s1.words[ 6] = c[ 6],   s1.words[ 5] = c[ 5],   s1.words[ 4] = c[ 4],   s1.words[ 3] = c[ 3],   s1.words[ 2] = c[ 2],   s1.words[ 1] = c[ 1],   s1.words[ 0] = c[ 0];
+	 s2.words[11] = 0,       s2.words[10] = 0,       s2.words[ 9] = 0,       s2.words[ 8] = 0,       s2.words[ 7] = 0,       s2.words[ 6] = c[23],   s2.words[ 5] = c[22],   s2.words[ 4] = c[21],   s2.words[ 3] = 0,       s2.words[ 2] = 0,       s2.words[ 1] = 0,       s2.words[ 0] = 0;
+	 s3.words[11] = c[23],   s3.words[10] = c[22],   s3.words[ 9] = c[21],   s3.words[ 8] = c[20],   s3.words[ 7] = c[19],   s3.words[ 6] = c[18],   s3.words[ 5] = c[17],   s3.words[ 4] = c[16],   s3.words[ 3] = c[15],   s3.words[ 2] = c[14],   s3.words[ 1] = c[13],   s3.words[ 0] = c[12];
+	 s4.words[11] = c[20],   s4.words[10] = c[19],   s4.words[ 9] = c[18],   s4.words[ 8] = c[17],   s4.words[ 7] = c[16],   s4.words[ 6] = c[15],   s4.words[ 5] = c[14],   s4.words[ 4] = c[13],   s4.words[ 3] = c[12],   s4.words[ 2] = c[23],   s4.words[ 1] = c[22],   s4.words[ 0] = c[21];
+	 s5.words[11] = c[19],   s5.words[10] = c[18],   s5.words[ 9] = c[17],   s5.words[ 8] = c[16],   s5.words[ 7] = c[15],   s5.words[ 6] = c[14],   s5.words[ 5] = c[13],   s5.words[ 4] = c[12],   s5.words[ 3] = c[20],   s5.words[ 2] = 0,       s5.words[ 1] = c[23],   s5.words[ 0] = 0;
+	 s6.words[11] = 0,       s6.words[10] = 0,       s6.words[ 9] = 0,       s6.words[ 8] = 0,       s6.words[ 7] = c[23],   s6.words[ 6] = c[22],   s6.words[ 5] = c[21],   s6.words[ 4] = c[20],   s6.words[ 3] = 0,       s6.words[ 2] = 0,       s6.words[ 1] = 0,       s6.words[ 0] = 0;
+	 s7.words[11] = 0,       s7.words[10] = 0,       s7.words[ 9] = 0,       s7.words[ 8] = 0,       s7.words[ 7] = 0,       s7.words[ 6] = 0,       s7.words[ 5] = c[23],   s7.words[ 4] = c[22],   s7.words[ 3] = c[21],   s7.words[ 2] = 0,       s7.words[ 1] = 0,       s7.words[ 0] = c[20];
+	 s8.words[11] = c[22],   s8.words[10] = c[21],   s8.words[ 9] = c[20],   s8.words[ 8] = c[19],   s8.words[ 7] = c[18],   s8.words[ 6] = c[17],   s8.words[ 5] = c[16],   s8.words[ 4] = c[15],   s8.words[ 3] = c[14],   s8.words[ 2] = c[13],   s8.words[ 1] = c[12],   s8.words[ 0] = c[23];
+	 s9.words[11] = 0,       s9.words[10] = 0,       s9.words[ 9] = 0,       s9.words[ 8] = 0,       s9.words[ 7] = 0,       s9.words[ 6] = 0,       s9.words[ 5] = 0,       s9.words[ 4] = c[23],   s9.words[ 3] = c[22],   s9.words[ 2] = c[21],   s9.words[ 1] = c[20],   s9.words[ 0] = 0;
+	s10.words[11] = 0,      s10.words[10] = 0,      s10.words[ 9] = 0,      s10.words[ 8] = 0,      s10.words[ 7] = 0,      s10.words[ 6] = 0,      s10.words[ 5] = 0,      s10.words[ 4] = c[23],  s10.words[ 3] = c[23],  s10.words[ 2] = 0,      s10.words[ 1] = 0,      s10.words[ 0] = 0;
+
+
+		// intermediate results
+	FPGA_BUFFER t1, t2, t3, t4;
+
+		/* Step 1. */
+	fpga_modular_add(&s1,  &s3, &t1);	// t1 = s1 + s3
+	fpga_modular_add(&s2,  &s2, &t2);	// t2 = 2*s2
+	fpga_modular_add(&s4,  &s5, &t3);	// t3 = s4 + s5
+	fpga_modular_add(&s6,  &s7, &t4);	// t4 = s6 + s7
+
+		/* Step 2. */
+	fpga_modular_add(&t1,  &t2, &t1);	// t1 = t1 + t2 = s1 + 2*s2 + 2*s3
+	fpga_modular_add(&t3,  &t4, &t2);	// t2 = t3 + t4 = s4 + s5 + s6 + s7
+	fpga_modular_add(&s8,  &s9, &t3);	// t3 = s8 + s9
+
+		/* Step 3. */
+	fpga_modular_add(&t1,  &t2, &t1);	// t1 = t1 + t2 = s1 + 2*s2 + 2*s3 + s4 + s5 + s6 + s7
+	fpga_modular_add(&s10, &t3, &t2);	// t2 = s10 + t3 = s8 + s9 + s10
+
+		/* Step 4. */
+	fpga_modular_sub(&t1,  &t2, p);		// p = t1 - t2 = s1 + 2*s2 + 2*s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
+}
+#endif
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word shift to the left by 1 bit.
+//
+// y = x << 1
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_shl(FPGA_WORD *x, FPGA_WORD *y)
+//------------------------------------------------------------------------------
+{
+	int w;							// word counter
+	FPGA_WORD carry_in, carry_out;	// carries
+
+	carry_in = 0;	// first word has no carry
+
+		// shift word-by-word
+	for (w=0; w<=OPERAND_NUM_WORDS; w++)
+		carry_out = x[w] >> (FPGA_WORD_WIDTH - 1),	// store next carry
+		y[w]  = x[w] << 1,							// shift
+		y[w] |= carry_in,							// process carry
+		carry_in = carry_out;						// propagate carry
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word shift to the right by 1 bit.
+//
+// y = x >> 1
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_shr(FPGA_WORD *x, FPGA_WORD *y)
+//------------------------------------------------------------------------------
+{
+	int w;							// word counter
+	FPGA_WORD carry_in, carry_out;	// carries
+
+	carry_in = 0;	// first word has no carry
+
+		// shift word-by-word
+	for (w=OPERAND_NUM_WORDS; w>=0; w--)
+		carry_out = x[w],							// store next carry
+		y[w] = x[w] >> 1,							// shift
+		y[w] |= carry_in << (FPGA_WORD_WIDTH - 1),	// process carry
+		carry_in = carry_out;						// propagate carry
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word addition.
+//
+// s = x + y
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_add(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s)
+//------------------------------------------------------------------------------
+{
+	int w;						// word counter
+	bool carry_in, carry_out;	// carries
+
+		// lowest word has no carry
+	carry_in = false;
+
+		// sum a and b word-by-word
+	for (w=0; w<=OPERAND_NUM_WORDS; w++)
+	{
+			// low-level addition
+		fpga_lowlevel_add32(x[w], y[w], carry_in, &s[w], &carry_out);
+
+			// propagate carry bit
+		carry_in = carry_out;
+	}
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word subtraction .
+//
+// d = x - y
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_sub(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d)
+//------------------------------------------------------------------------------
+{
+	int w;							// word counter
+	bool borrow_in, borrow_out;		// borrows
+
+		// lowest word has no borrow
+	borrow_in = false;				
+
+		// subtract b from a word-by-word
+	for (w=0; w<=OPERAND_NUM_WORDS; w++)
+	{
+			// low-level subtraction
+		fpga_lowlevel_sub32(x[w], y[w], borrow_in, &d[w], &borrow_out);
+
+			// propagate borrow bit
+		borrow_in = borrow_out;
+	}
+
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word copy.
+//
+// dst = src
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_cpy(FPGA_WORD *dst, FPGA_WORD *src)
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+		// copy all the words from src into dst
+	for (w=0; w<=OPERAND_NUM_WORDS; w++)
+		dst[w] = src[w];
+}
+
+
+//------------------------------------------------------------------------------
+//
+// Multi-word comparison.
+//
+// The return value is -1 when a<b, 0 when a=b and 1 when a>b.
+//
+//------------------------------------------------------------------------------
+void fpga_modular_inv_helper_cmp(FPGA_WORD *a, FPGA_WORD *b, int *c)
+//------------------------------------------------------------------------------
+{
+	int w;					// word counter
+	int r, r0, ra, rb;		// result
+	bool borrow;			// borrow
+	FPGA_WORD d;			// partial difference
+
+		// result is unknown so far
+	r = 0;	
+
+		// clear borrow for the very first word
+	borrow = false;
+
+		// compare a and b word-by-word
+	for (w=OPERAND_NUM_WORDS; w>=0; w--)
+	{
+			// save result
+		r0 = r;
+
+			// subtract current words
+		fpga_lowlevel_sub32(a[w], b[w], false, &d, &borrow);
+
+			// analyze flags
+		rb = borrow ? -1 : 0;					// a[w] < b[w]
+		ra = (!borrow && (d != 0)) ? 1 : 0;		// a[w] > b[w]
+		
+			//
+			// Note, that ra is either 0 or 1, rb is either 0 or -1 and they
+			// can never be non-zero at the same time.
+			//
+			// Note, that r can only be updated if comparison result has not
+			// been resolved yet. Even if we already know comparison result,
+			// we continue doing dummy subtractions to keep run-time constant.
+			//
+
+			// update result
+		r = (r == 0) ? ra + rb : r0;
+	}
+
+		// done
+	*c = r;
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_modular.h b/fpga_modular.h
new file mode 100644
index 0000000..4d31725
--- /dev/null
+++ b/fpga_modular.h
@@ -0,0 +1,87 @@
+//------------------------------------------------------------------------------
+//
+// fpga_modular.h
+// ---------------------------
+// Modular arithmetic routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Globals
+//------------------------------------------------------------------------------
+extern FPGA_BUFFER ecdsa_q;
+extern FPGA_BUFFER ecdsa_zero;
+extern FPGA_BUFFER ecdsa_one;
+extern FPGA_BUFFER ecdsa_delta;
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+void	fpga_modular_init						();
+
+void	fpga_modular_add						(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *s);
+void	fpga_modular_sub						(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *d);
+void	fpga_modular_mul						(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_BUFFER *p);
+void	fpga_modular_inv						(FPGA_BUFFER *a, FPGA_BUFFER *a1);
+
+void	fpga_modular_mul_helper_multiply		(FPGA_BUFFER *a, FPGA_BUFFER *b, FPGA_WORD_EXTENDED *si);
+void	fpga_modular_mul_helper_accumulate		(FPGA_WORD_EXTENDED *si, FPGA_WORD *c);
+void	fpga_modular_mul_helper_reduce_p256		(FPGA_WORD *c, FPGA_BUFFER *p);
+void	fpga_modular_mul_helper_reduce_p384		(FPGA_WORD *c, FPGA_BUFFER *p);
+
+void	fpga_modular_inv_helper_shl				(FPGA_WORD *x, FPGA_WORD *y);
+void	fpga_modular_inv_helper_shr				(FPGA_WORD *x, FPGA_WORD *y);
+void	fpga_modular_inv_helper_add				(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *s);
+void	fpga_modular_inv_helper_sub				(FPGA_WORD *x, FPGA_WORD *y, FPGA_WORD *d);
+void	fpga_modular_inv_helper_cpy				(FPGA_WORD *dst, FPGA_WORD *src);
+void	fpga_modular_inv_helper_cmp				(FPGA_WORD *a, FPGA_WORD *b, int *c);
+
+
+//------------------------------------------------------------------------------
+// Reduction Routine Selection
+//------------------------------------------------------------------------------
+
+#if USE_CURVE == 1
+#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p256
+#elif USE_CURVE == 2
+#define fpga_modular_mul_helper_reduce fpga_modular_mul_helper_reduce_p384
+#else
+#error USE_CURVE must be either 1 or 2!
+#endif
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_util.cpp b/fpga_util.cpp
new file mode 100644
index 0000000..6f7a834
--- /dev/null
+++ b/fpga_util.cpp
@@ -0,0 +1,86 @@
+//------------------------------------------------------------------------------
+//
+// fpga_util.cpp
+// ---------------------
+// Utility FPGA routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Headers
+//------------------------------------------------------------------------------
+#include <stdint.h>
+#include "ecdsa_model.h"
+#include "fpga_lowlevel.h"
+#include "fpga_util.h"
+
+
+//------------------------------------------------------------------------------
+bool fpga_buffer_is_zero(FPGA_BUFFER *x)
+//------------------------------------------------------------------------------
+//
+// Returns true when large multi-word integer x is zero.
+//
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+		// try to find non-zero words
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		if (x->words[w]) return false;
+
+		// no non-zero words found
+	return true;
+}
+
+
+//------------------------------------------------------------------------------
+void fpga_buffer_copy(FPGA_BUFFER *src, FPGA_BUFFER *dst)
+//------------------------------------------------------------------------------
+//
+// Copies large multi-word integer from src into dst.
+//
+//------------------------------------------------------------------------------
+{
+	int w;	// word counter
+
+		// copy all the words from src into dst
+	for (w=0; w<OPERAND_NUM_WORDS; w++)
+		dst->words[w] = src->words[w];
+}
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/fpga_util.h b/fpga_util.h
new file mode 100644
index 0000000..24b2aa2
--- /dev/null
+++ b/fpga_util.h
@@ -0,0 +1,49 @@
+//------------------------------------------------------------------------------
+//
+// fpga_util.h
+// ---------------------
+// Utility FPGA routines
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+
+//------------------------------------------------------------------------------
+// Prototypes
+//------------------------------------------------------------------------------
+bool	fpga_buffer_is_zero		(FPGA_BUFFER *x);
+void	fpga_buffer_copy		(FPGA_BUFFER *src, FPGA_BUFFER *dst);
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
-- 
cgit v1.2.3