From 4e0581c98e289e79af09d95b747f9932a14c89fd Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Tue, 17 Apr 2018 15:05:22 +0300 Subject: Initial commit of P-256 point multiplier suitable for ECDH. --- README.md | 98 ++++ bench/tb_point_multiplier_256.v | 341 ++++++++++++++ rtl/curve/point_dbl_add_256.v | 864 ++++++++++++++++++++++++++++++++++ rtl/curve/point_mul_256.v | 849 +++++++++++++++++++++++++++++++++ rtl/ecdhp256.v | 192 ++++++++ rtl/ecdhp256_wrapper.v | 177 +++++++ stm32_driver/ecdhp256_driver_sample.c | 261 ++++++++++ 7 files changed, 2782 insertions(+) create mode 100644 README.md create mode 100644 bench/tb_point_multiplier_256.v create mode 100644 rtl/curve/point_dbl_add_256.v create mode 100644 rtl/curve/point_mul_256.v create mode 100644 rtl/ecdhp256.v create mode 100644 rtl/ecdhp256_wrapper.v create mode 100644 stm32_driver/ecdhp256_driver_sample.c diff --git a/README.md b/README.md new file mode 100644 index 0000000..755f822 --- /dev/null +++ b/README.md @@ -0,0 +1,98 @@ +# ecdhp256 + +## Core Description + +This core implements the scalar point multiplier for ECDSA curve P-256. It can be used to generate public keys, as part of the signing operation, and to perform ECDH key exchange. + +## API Specification + +The core interface is similar to other Cryptech cores. The FMC memory map looks like the following: + +`0x0000 | NAME0` +`0x0004 | NAME1` +`0x0008 | VERSION` + +`0x0020 | CONTROL` +`0x0024 | STATUS` + +`0x0100 | K0` +`0x0104 | K1` +`...` +`0x011C | K7` + +`0x0120 | XIN0` +`0x0124 | XIN1` +`...` +`0x013C | XIN7` + +`0x0140 | YIN0` +`0x0144 | YIN1` +`...` +`0x015C | YIN7` + +`0x0160 | XOUT0` +`0x0164 | XOUT1` +`...` +`0x017C | XOUT7` + +`0x0180 | YOUT0` +`0x0184 | YOUT1` +`...` +`0x019C | YOUT7` + +The core has the following registers: + + * **NAME0**, **NAME1** +Read-only core name ("ecdhp256"). + + * **VERSION** +Read-only core version, currently "0.10".
+ + * **CONTROL** +Control register bits: +[31:2] Don't care, always read as 0 +[1] "next" control bit +[0] Don't care, always read as 0 +The core starts multiplication when the "next" control bit changes from 0 to 1. This way when the bit is set, the core will only perform one multiplication and then stop. To start another operation, the bit must first be cleared and then set to 1 again. + + * **STATUS** +Read-only status register bits: +[31:2] Don't care, always read as 0 +[1] "valid" control bit +[0] "ready" control bit (always read as 1) +The "valid" control bit is cleared as soon as the core starts operation, and gets set after the multiplication operation is complete. Note that, unlike some other Cryptech cores, this core doesn't need any special initialization, so the "ready" control bit is simply hardwired to always read as 1. This is to keep general core interface consistency. + + * **K0**-**K7** +Buffer for the 256-bit multiplication factor (multiplier) K. The core will compute R(XOUT, YOUT) = K * P(XIN, YIN). K0 is the least significant 32-bit word of K, i.e. bits [31:0], while K7 is the most significant 32-bit word of K, i.e. bits [255:224]. + + * **XIN0**-**XIN7**, **YIN0**-**YIN7** +Writeable buffers for the 256-bit coordinates X and Y of the input multiplicand P(XIN, YIN). Values should be in affine coordinates. XIN0 and YIN0 contain the least significant 32-bit words, i.e. bits [31:0], while XIN7 and YIN7 contain the most significant 32-bit words, i.e. bits [255:224]. Fill the buffers with coordinates of the base point during public key generation and during multiplication by the per-message (random) number. Fill the buffers with coordinates of Bob's public key to derive Alice's copy of the shared secret key. + + * **XOUT0**-**XOUT7**, **YOUT0**-**YOUT7** +Read-only buffers for the 256-bit coordinates X and Y of the product R(XOUT, YOUT). Values are returned in affine coordinates. XOUT0 and YOUT0 contain the least significant 32-bit words, i.e.
bits [31:0], while XOUT7 and YOUT7 contain the most significant 32-bit words, i.e. bits [255:224]. + +## Implementation Details + +The top-level core module contains block memory buffers for input and output operands and the base point multiplier, which reads from the input buffers and writes to the output buffers. + +The base point multiplier itself consists of the following: + * Buffers for storage of temporary values + * Configurable "worker" unit + * Microprograms for the worker unit + * Multi-word mover unit + * Modular inversion unit + +The "worker" unit can execute five basic operations: + * comparison + * copying + * modular addition + * modular subtraction + * modular multiplication + +There are two primary microprograms that the worker runs: curve point doubling and addition of curve point to the base point. Those microprograms use projective Jacobian coordinates, so one more microprogram is used to convert the product into affine coordinates with the help of the modular inversion unit. + +Note that the core is supplemented by a reference model written in C, which has extensive comments describing tricky corners of the underlying math. + +## Vendor-specific Primitives + +The Cryptech Alpha platform is based on the Xilinx Artix-7 200T FPGA, so this core takes advantage of Xilinx-specific DSP slices to carry out math-intensive operations. All vendor-specific math primitives are placed under /rtl/lowlevel/artix7; the core also offers generic replacements under /rtl/lowlevel/generic, which can be used for simulation with third-party tools that are not aware of Xilinx-specific primitives. Selection of vendor/generic primitives is done in ecdsa_lowlevel_settings.v. When porting to other architectures, only those four low-level modules need to be ported.
diff --git a/bench/tb_point_multiplier_256.v b/bench/tb_point_multiplier_256.v new file mode 100644 index 0000000..3647a6a --- /dev/null +++ b/bench/tb_point_multiplier_256.v @@ -0,0 +1,341 @@ +//------------------------------------------------------------------------------ +// +// tb_point_multiplier_256.v +// ----------------------------------------------------------------------------- +// Testbench for P-256 point scalar multiplier. +// +// Authors: Pavel Shatov +// +// Copyright (c) 2018, NORDUnet A/S +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// - Neither the name of the NORDUnet nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +`timescale 1ns / 1ps +//------------------------------------------------------------------------------ + +module tb_point_multiplier_256; + + + // + // Test Vectors + // + `include "../../../user/shatov/ecdh_fpga_model/test_vectors/ecdh_test_vectors.v" + + + + // + // Core Parameters + // + localparam WORD_COUNTER_WIDTH = 3; + localparam OPERAND_NUM_WORDS = 8; + + + // + // Clock (100 MHz) + // + reg clk = 1'b0; + always #5 clk = ~clk; + + + // + // Inputs, Outputs + // + reg rst_n; + reg ena; + wire rdy; + + + // + // Buffers (K, PX, PY, QX, QY) + // + wire [WORD_COUNTER_WIDTH-1:0] core_k_addr; + wire [WORD_COUNTER_WIDTH-1:0] core_px_addr; + wire [WORD_COUNTER_WIDTH-1:0] core_py_addr; + wire [WORD_COUNTER_WIDTH-1:0] core_qx_addr; + wire [WORD_COUNTER_WIDTH-1:0] core_qy_addr; + + wire core_qx_wren; + wire core_qy_wren; + + wire [ 32-1:0] core_k_data; + wire [ 32-1:0] core_px_data; + wire [ 32-1:0] core_py_data; + wire [ 32-1:0] core_qx_data; + wire [ 32-1:0] core_qy_data; + + reg [WORD_COUNTER_WIDTH-1:0] tb_k_addr; + reg [WORD_COUNTER_WIDTH-1:0] tb_pxy_addr; + reg [WORD_COUNTER_WIDTH-1:0] tb_qxy_addr; + + reg tb_k_wren; + reg tb_pxy_wren; + + reg [ 31:0] tb_k_data; + reg [ 31:0] tb_px_data; + reg [ 31:0] tb_py_data; + wire [ 31:0] tb_qx_data; + wire [ 31:0] 
tb_qy_data; + + bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)) + bram_k + ( .clk(clk), + .a_addr(tb_k_addr), .a_wr(tb_k_wren), .a_in(tb_k_data), .a_out(), + .b_addr(core_k_addr), .b_out(core_k_data) + ); + + bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)) + bram_px + ( .clk(clk), + .a_addr(tb_pxy_addr), .a_wr(tb_pxy_wren), .a_in(tb_px_data), .a_out(), + .b_addr(core_px_addr), .b_out(core_px_data) + ); + + bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)) + bram_py + ( .clk(clk), + .a_addr(tb_pxy_addr), .a_wr(tb_pxy_wren), .a_in(tb_py_data), .a_out(), + .b_addr(core_py_addr), .b_out(core_py_data) + ); + + bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)) + bram_qx + ( .clk(clk), + .a_addr(core_qx_addr), .a_wr(core_qx_wren), .a_in(core_qx_data), .a_out(), + .b_addr(tb_qxy_addr), .b_out(tb_qx_data) + ); + + bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH)) + bram_qy + ( .clk(clk), + .a_addr(core_qy_addr), .a_wr(core_qy_wren), .a_in(core_qy_data), .a_out(), + .b_addr(tb_qxy_addr), .b_out(tb_qy_data) + ); + + + // + // UUT + // + point_mul_256 uut + ( + .clk (clk), + .rst_n (rst_n), + + .ena (ena), + .rdy (rdy), + + .k_addr (core_k_addr), + .qx_addr (core_px_addr), + .qy_addr (core_py_addr), + .rx_addr (core_qx_addr), + .ry_addr (core_qy_addr), + + .rx_wren (core_qx_wren), + .ry_wren (core_qy_wren), + + .k_din (core_k_data), + .qx_din (core_px_data), + .qy_din (core_py_data), + .rx_dout (core_qx_data), + .ry_dout (core_qy_data) + ); + + + // + // Testbench Routine + // + reg ok = 1; + initial begin + + /* initialize control inputs */ + rst_n = 0; + ena = 0; + + /* wait for some time */ + #200; + + /* de-assert reset */ + rst_n = 1; + + /* wait for some time */ + #100; + + /* run tests */ + + $display(" 1. H = 2 * G..."); + test_point_multiplier(256'd2, P_256_G_X, P_256_G_Y, P_256_H_X, P_256_H_Y); + + $display(" 2. 
H = (n + 2) * G..."); + test_point_multiplier(P_256_N + 256'd2, P_256_G_X, P_256_G_Y, P_256_H_X, P_256_H_Y); + + $display(" 3. QA = dA * G..."); + test_point_multiplier(P_256_DA, P_256_G_X, P_256_G_Y, P_256_QA_X, P_256_QA_Y); + + $display(" 4. QB = dB * G..."); + test_point_multiplier(P_256_DB, P_256_G_X, P_256_G_Y, P_256_QB_X, P_256_QB_Y); + + $display(" 5. S = dB * QA..."); + test_point_multiplier(P_256_DB, P_256_QA_X, P_256_QA_Y, P_256_S_X, P_256_S_Y); + + $display(" 6. S = dA * QB..."); + test_point_multiplier(P_256_DA, P_256_QB_X, P_256_QB_Y, P_256_S_X, P_256_S_Y); + + $display(" 7. QA2 = 2 * QA..."); + test_point_multiplier(256'd2, P_256_QA_X, P_256_QA_Y, P_256_QA2_X, P_256_QA2_Y); + + $display(" 8. QA2 = (n + 2) * QA..."); + test_point_multiplier(P_256_N + 256'd2, P_256_QA_X, P_256_QA_Y, P_256_QA2_X, P_256_QA2_Y); + + $display(" 9. QB2 = 2 * QB..."); + test_point_multiplier(256'd2, P_256_QB_X, P_256_QB_Y, P_256_QB2_X, P_256_QB2_Y); + + $display("10. QB2 = (n + 2) * QB..."); + test_point_multiplier(P_256_N + 256'd2, P_256_QB_X, P_256_QB_Y, P_256_QB2_X, P_256_QB2_Y); + + + /* print result */ + if (ok) $display("tb_point_multiplier_256: SUCCESS"); + else $display("tb_point_multiplier_256: FAILURE"); + // + //$finish; + // + end + + + // + // Test Task + // + reg q_ok; + + integer w; + + task test_point_multiplier; + + input [255:0] k; + input [255:0] px; + input [255:0] py; + input [255:0] qx; + input [255:0] qy; + + reg [255:0] k_shreg; + reg [255:0] px_shreg; + reg [255:0] py_shreg; + reg [255:0] qx_shreg; + reg [255:0] qy_shreg; + + begin + + /* start filling memories */ + tb_k_wren = 1; + tb_pxy_wren = 1; + + /* initialize shift registers */ + k_shreg = k; + px_shreg = px; + py_shreg = py; + + /* write all the words */ + for (w=0; w