From cd8f45d313fe760d7f71a425bdbb567afac219d1 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov" Date: Thu, 28 May 2015 01:51:26 +0400 Subject: Initial version of GOST 34.11-2012 (aka Streebog) hash core --- streebog_hash/streebog_core_lps.v | 405 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 405 insertions(+) create mode 100644 streebog_hash/streebog_core_lps.v (limited to 'streebog_hash/streebog_core_lps.v') diff --git a/streebog_hash/streebog_core_lps.v b/streebog_hash/streebog_core_lps.v new file mode 100644 index 0000000..a668f16 --- /dev/null +++ b/streebog_hash/streebog_core_lps.v @@ -0,0 +1,405 @@ +`timescale 1ns / 1ps + +module streebog_core_lps + ( + clk, + ena, rdy, last, + din, dout + ); + + + // + // Parameters + // + parameter PS_PIPELINE_STAGES = 8; // 2, 4, 8 + parameter L_PIPELINE_STAGES = 8; // 2, 4, 8, 16, 32, 64 + + + // + // Ports + // + input wire clk; // core clock + input wire ena; // start transformation flag + output wire rdy; // transformation done flag (dout is valid) + output wire last; // transformation about to complete (rdy flag will be asserted during the next cycle) + input wire [511:0] din; // input data to transform + output wire [511:0] dout; // output data (result of transformation) + + + /* + * This LPS core has parametrized internal pipeline. P and S transformations are combined into one PS transformation and + * have common pipeline. L transformation has its own separate pipeline. The total latency of this core is thus + * PS_PIPELINE_STAGES*L_PIPELINE_STAGES. The fastest version completes the transformation in 2*2=4 cycles, the slowest + * version requires 8*64=512 cycles. S transformation substitutes bytes according to a lookup table. P transformation does + * permutation of input bytes. L transformation multiplies input data by a special predefined matrix. If you don't understand + * how matrices are multiplied, you should not try to understand how the following code works. This may damage your brain. 
+ * You've been warned. Seriously. + * + */ + + + // + // Constants + // + + /* + * PS transformation operates on 64-bit words. Input data contains 512/64=8 such words. + * Depending on PS pipeline stage count we can transform 1, 2 or 4 words at a time. + * + * L transformation operates on 64-bit words. Depending on L pipeline stage count we + * can transform 1, 2, 4, 8, 16 or 32 bits of a word at a time. + * + */ + + localparam PS_WORDS_AT_ONCE = 8 / PS_PIPELINE_STAGES; + localparam L_BITS_AT_ONCE = 64 / L_PIPELINE_STAGES; + + /* + * These functions return number of bits needed to store pipeline stage counters. They will + * also prevent users from specifying illegal pipeline widths. This module will not synthesize + * with invalid pipeline stage count, because counter width will not be explicitly defined. + * + */ + + function integer PS_NUM_COUNT_BITS; + input integer x; + begin + case (x) + 2: PS_NUM_COUNT_BITS = 1; + 4: PS_NUM_COUNT_BITS = 2; + 8: PS_NUM_COUNT_BITS = 3; + endcase + end + endfunction + + function integer L_NUM_COUNT_BITS; + input integer y; + begin + case (y) + 2: L_NUM_COUNT_BITS = 1; + 4: L_NUM_COUNT_BITS = 2; + 8: L_NUM_COUNT_BITS = 3; + 16: L_NUM_COUNT_BITS = 4; + 32: L_NUM_COUNT_BITS = 5; + 64: L_NUM_COUNT_BITS = 6; + endcase + end + endfunction + + + // + // Counter Widths + // + localparam L_CNT_BITS = L_NUM_COUNT_BITS(L_PIPELINE_STAGES); // width of L counter + localparam PS_CNT_BITS = PS_NUM_COUNT_BITS(PS_PIPELINE_STAGES); // width of PS counter + + + // + // Input Multiplexor + // + wire [63: 0] din_mux[0:7]; // eight 64-bit words + + /* + * This multiplexor does the P transformation. P transformation is effectively a matrix + * transposition. Input 512-bit word is treated as an 8x8 byte matrix. Multiplexor outputs + * a set of 8 64-bit words. These words are columns of the original matrix (transposition + * turns rows into columns). 
+ * + */ + + genvar i, j; + generate for (i=0; i<8; i=i+1) + begin: gen_din_mux_i + for (j=0; j<8; j=j+1) begin: gen_din_mux_j + assign din_mux[i][8*j + 7 : 8*j] = din[64*j + 8*i + 7 : 64*j + 8*i]; + end + end + endgenerate + + + // + // Output Multiplexor + // + reg [63: 0] dout_mux[0:7]; // eight 64-bit words + + /* + * Output 64-bit subwords are concatenated to form output 512-bit word. + * + */ + + genvar k; + generate for (k=0; k<8; k=k+1) + begin: gen_dout_mux + assign dout[64*k+63:64*k] = dout_mux[k]; + end + endgenerate + + + // + // PS and L Counters + // + + /* + * These counters control internal data flow of this core. For example, if PS has 2 stages and + * L has 4 stages, then the count will look like this: + * ____ + * ENA \\\________________________________ + * _____ _ + * RDY ^ \_______________________________/ + * | | | | | | | | | | + * +----+---+---+---+---+---+---+---+---+---+- + * | PS | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | + * +----+---+---+---+---+---+---+---+---+---+- + * | L | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 | + * +----+---+---+---+---+---+---+---+---+---+- + * ^ ^ | + * | | +--> both counters will be zero during the last cycle + * | | + * +---------------+------------------> preloading of new word(s) into S lookup table(s) + * + */ + + reg [ L_CNT_BITS-1:0] l_count = { L_CNT_BITS{1'b0}}; + reg [PS_CNT_BITS-1:0] ps_count = {PS_CNT_BITS{1'b0}}; + + + // + // Handy Flags + // + + /* + * These flags are used instead of lengthy (z_count == {Z_CNT_BITS{1'bZ}}) comparisons. + * + */ + + wire l_count_done = ( l_count == { L_CNT_BITS{1'b1}}) ? 1 : 0; + wire ps_count_done = (ps_count == {PS_CNT_BITS{1'b1}}) ? 1 : 0; + + wire l_count_zero = ( l_count == { L_CNT_BITS{1'b0}}) ? 1 : 0; + wire ps_count_zero = (ps_count == {PS_CNT_BITS{1'b0}}) ? 1 : 0; + + + // + // Preload Flags + // + + /* + * These flags are used as clock enables for S lookup table. 
+ * + */ + + wire ps_preload_first = (rdy && ena); + wire ps_preload_next = (!rdy && !ps_count_zero && l_count_zero); + + + // + // Last Flag + // + + /* + * This flag indicates that core operation is about to complete. + * + */ + assign last = !rdy && ps_count_zero && l_count_zero; + + + // + // Counter Logic + // + always @(posedge clk) begin + // + if (!rdy && l_count_done) ps_count <= ps_count + 1'b1; // next word(s) + // + if (rdy && ena) l_count <= l_count + 1'b1; // start of transformation + // + if (!rdy && !(ps_count_zero && l_count_zero)) l_count <= l_count + 1'b1; // next part of word(s) + // + end + + + // + // Ready Output Register + // + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; + + + // + // Ready Set and Clear Logic + // + always @(posedge clk) begin + // + if (rdy && ena) rdy_reg <= 0; // start of transformation + // + if (!rdy && l_count_zero && ps_count_zero) rdy_reg <= 1; // end of transformation + // + end + + + // + // S Table Indices + // + + /* + * To transform several words at once a set of indices is required. + * + */ + + wire [ 2: 0] s_in_offset [0:PS_WORDS_AT_ONCE-1]; // indices of words being transformed + wire [63: 0] s_out [0:PS_WORDS_AT_ONCE-1]; // output words of S transformation + + assign s_in_offset[0] = ps_count * PS_WORDS_AT_ONCE; // the first index is defined by PS counter, + // following indices are linearly increasing + + genvar sw, sb; // word and byte counter + generate for (sw=1; sw