`timescale 1ns / 1ps

module streebog_core_lps
    (
        clk,
        ena, rdy, last,
        din, dout
    );

//
// Parameters
//
parameter PS_PIPELINE_STAGES = 8;   // 2, 4, 8
parameter L_PIPELINE_STAGES  = 8;   // 2, 4, 8, 16, 32, 64

//
// Ports
//
input  wire         clk;    // core clock
input  wire         ena;    // start transformation flag
output wire         rdy;    // transformation done flag (dout is valid)
output wire         last;   // transformation about to complete (rdy flag will be asserted during the next cycle)
input  wire [511:0] din;    // input data to transform
output wire [511:0] dout;   // output data (result of transformation)

/*
 * This LPS core has a parametrized internal pipeline. P and S transformations are combined into
 * one PS transformation and have a common pipeline. L transformation has its own separate
 * pipeline. The total latency of this core is thus PS_PIPELINE_STAGES*L_PIPELINE_STAGES. The
 * fastest version completes the transformation in 2*2=4 cycles, the slowest version requires
 * 8*64=512 cycles. S transformation substitutes bytes according to a lookup table.
 * P transformation does permutation of input bytes. L transformation multiplies input data by
 * a special predefined matrix. If you don't understand how matrices are multiplied, you should
 * not try to understand how the following code works. This may damage your brain.
 * You've been warned. Seriously.
 *
 */

//
// Constants
//

/*
 * PS transformation operates on 64-bit words. Input data contains 512/64=8 such words.
 * Depending on PS pipeline stage count we can transform 1, 2 or 4 words at a time.
 *
 * L transformation operates on 64-bit words. Depending on L pipeline stage count we
 * can transform 1, 2, 4, 8, 16 or 32 bits of a word at a time.
 *
 */
localparam PS_WORDS_AT_ONCE = 8 / PS_PIPELINE_STAGES;
localparam L_BITS_AT_ONCE   = 64 / L_PIPELINE_STAGES;

/*
 * These functions return the number of bits needed to store the pipeline stage counters. They
 * also prevent users from specifying illegal pipeline widths.
* This module will not synthesize with an invalid pipeline stage count, because the counter
 * width will not be explicitly defined.
 *
 */

// Width in bits of the PS stage counter; only 2, 4 and 8 stages are legal.
// There is deliberately no default branch: an illegal stage count leaves the
// result unassigned instead of silently picking a width.
function integer PS_NUM_COUNT_BITS;
    input integer x;
    begin
        case (x)
            2: PS_NUM_COUNT_BITS = 1;
            4: PS_NUM_COUNT_BITS = 2;
            8: PS_NUM_COUNT_BITS = 3;
        endcase
    end
endfunction

// Width in bits of the L stage counter; legal stage counts are the powers of
// two from 2 to 64. As above, there is deliberately no default branch.
function integer L_NUM_COUNT_BITS;
    input integer y;
    begin
        case (y)
             2: L_NUM_COUNT_BITS = 1;
             4: L_NUM_COUNT_BITS = 2;
             8: L_NUM_COUNT_BITS = 3;
            16: L_NUM_COUNT_BITS = 4;
            32: L_NUM_COUNT_BITS = 5;
            64: L_NUM_COUNT_BITS = 6;
        endcase
    end
endfunction

//
// Counter Widths
//
localparam L_CNT_BITS  = L_NUM_COUNT_BITS(L_PIPELINE_STAGES);      // width of L counter
localparam PS_CNT_BITS = PS_NUM_COUNT_BITS(PS_PIPELINE_STAGES);    // width of PS counter

//
// Input Multiplexor
//
wire [63: 0] din_mux[0:7];  // eight 64-bit words

/*
 * This multiplexor does the P transformation. P transformation is effectively a matrix
 * transposition. Input 512-bit word is treated as an 8x8 byte matrix. Multiplexor outputs
 * a set of 8 64-bit words. These words are columns of the original matrix (transposition
 * turns rows into columns).
 *
 */
genvar i, j;
generate
    for (i=0; i<8; i=i+1)
        begin: gen_din_mux_i
            for (j=0; j<8; j=j+1)
                begin: gen_din_mux_j
                    // byte j of output word i is byte i of input word j
                    assign din_mux[i][8*j + 7 : 8*j] = din[64*j + 8*i + 7 : 64*j + 8*i];
                end
        end
endgenerate

//
// Output Multiplexor
//
reg [63: 0] dout_mux[0:7];  // eight 64-bit words

/*
 * Output 64-bit subwords are concatenated to form output 512-bit word.
 *
 */
genvar k;
generate
    for (k=0; k<8; k=k+1)
        begin: gen_dout_mux
            assign dout[64*k+63:64*k] = dout_mux[k];
        end
endgenerate

//
// PS and L Counters
//

/*
 * These counters control internal data flow of this core.
* For example, if PS has 2 stages and L has 4 stages, then the count will look like this:
 *
 *       ____
 *  ENA      \________________________________
 *      _____                                 _
 *  RDY      \_______________________________/
 *       |   |   |   |   |   |   |   |   |   |
 *  +----+---+---+---+---+---+---+---+---+---+-
 *  | PS | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 |
 *  +----+---+---+---+---+---+---+---+---+---+-
 *  | L  | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 |
 *  +----+---+---+---+---+---+---+---+---+---+-
 *         ^               ^               |
 *         |               |               +--> both counters will be zero during the last cycle
 *         |               |
 *         +---------------+------------------> preloading of new word(s) into S lookup table(s)
 *
 */
reg [ L_CNT_BITS-1:0] l_count  = { L_CNT_BITS{1'b0}};
reg [PS_CNT_BITS-1:0] ps_count = {PS_CNT_BITS{1'b0}};

//
// Handy Flags
//

/*
 * These flags are used instead of lengthy (z_count == {Z_CNT_BITS{1'bZ}}) comparisons.
 * "done" means the counter holds its maximum value (all ones), "zero" means all zeroes.
 * An equality comparison is already one bit wide, so no ?: selection is needed.
 *
 */
wire l_count_done  = ( l_count == { L_CNT_BITS{1'b1}});
wire ps_count_done = (ps_count == {PS_CNT_BITS{1'b1}});
wire l_count_zero  = ( l_count == { L_CNT_BITS{1'b0}});
wire ps_count_zero = (ps_count == {PS_CNT_BITS{1'b0}});

//
// Preload Flags
//

/*
 * These flags are used as clock enables for S lookup table.
 *
 */
wire ps_preload_first = (rdy && ena);                                   // idle core is being started
wire ps_preload_next  = (!rdy && !ps_count_zero && l_count_zero);       // busy core moved on to the next word(s)

//
// Last Flag
//

/*
 * This flag indicates that core operation is about to complete.
* */
assign last = !rdy && ps_count_zero && l_count_zero;

//
// Counter Logic
//
// The two l_count conditions are mutually exclusive (rdy vs. !rdy), so at most
// one of them fires on any clock edge. Both counters rely on natural wrap-around
// to return to zero.
//
always @(posedge clk) begin
    //
    if (!rdy && l_count_done) ps_count <= ps_count + 1'b1;                      // next word(s)
    //
    if (rdy && ena) l_count <= l_count + 1'b1;                                  // start of transformation
    //
    if (!rdy && !(ps_count_zero && l_count_zero)) l_count <= l_count + 1'b1;    // next part of word(s)
    //
end

//
// Ready Output Register
//
reg rdy_reg = 1'b1;     // the core comes up idle
assign rdy = rdy_reg;

//
// Ready Set and Clear Logic
//
always @(posedge clk) begin
    //
    if (rdy && ena) rdy_reg <= 1'b0;                            // start of transformation
    //
    if (!rdy && l_count_zero && ps_count_zero) rdy_reg <= 1'b1; // end of transformation
    //
end

//
// S Table Indices
//

/*
 * To transform several words at once a set of indices is required.
 *
 */
wire [ 2: 0] s_in_offset [0:PS_WORDS_AT_ONCE-1];    // indices of words being transformed
wire [63: 0] s_out       [0:PS_WORDS_AT_ONCE-1];    // output words of S transformation

assign s_in_offset[0] = ps_count * PS_WORDS_AT_ONCE;    // the first index is defined by PS counter,
                                                        // following indices are linearly increasing

genvar sw, sb;  // word and byte counter
generate
    for (sw=1; sw