`timescale 1ns / 1ps
module streebog_core_adder_s6
ena, rdy,
x, y, sum
// Ports
// NOTE(review): the adders are clock-enabled on different cycles, so x and y
// presumably must stay stable from ena until rdy returns high -- confirm
// against adder_s6, which is not visible here.
input wire clk; // core clock; the always blocks in this module trigger on its rising edge
input wire ena; // start addition flag; only acted upon while rdy is high
output wire rdy; // addition done flag (sum is valid); driven by ce_shreg[17]
input wire [511:0] x; // item x (first 512-bit operand)
input wire [511:0] y; // item y (second 512-bit operand)
output wire [511:0] sum; // x+y; driven by the sum_reg output register
* ISE cannot synthesize fabric-based adders that are more than 256 bits wide. Items X and Y are 512 bits wide, so
* Spartan-6 DSP blocks are used to overcome this issue. Every DSP block is configured to add 32 bits at a time,
* so a total of 512/32=16 DSP blocks is required to implement the addition. Every DSP block is configured to expose
* carry input and output ports. Overflow at the 512-bit boundary should be ignored according to the specification,
* which is why only 15 intermediate carry lines are required.
* +-------------------+-------------------+- -+-------------------+
* [X] | 511 : 480 | 479 : 448 | ... | 31 : 0 |
* +------*------------+------*------------+- -+------*------------+
* | | |
* +------|------------+------|------------+- -+------|------------+
* [Y] | | 511 : 480 | | 479 : 448 | ... | | 31 : 0 |
* +------|-----*------+------|------------+- -+------|------------+
* | | | | | |
* | | | | | |
* v v v v v v
* +---+-+---+ +---+-+---+ +---+-+---+
* | A | | B | | A | | B | | A | | B |
* +---------+ +---+-+---+ +---+-+---+
* | DSP #15 | | DSP #14 | | DSP #0 |
* |---------| |---------| |---------|
* | Carry | | Carry | | Carry |
* X --<-Out In-<--C[14]--<-Out In-<--C[13]- ... -C[ 0]--<-Out In-<-- 0
* +---------+ +---------+ +---------+
* | S | | S | | S |
* +---------+ +---------+ +---------+
* | | |
* v v v
* +---------*---------+---------*---------+- -+---------*---------+
* [Z] | 511 : 480 | 479 : 448 | ... | 31 : 0 |
* +-------------------+-------------------+- -+-------------------+
// Internal nets and registers

// Output register and the port it drives.
reg  [511:0] sum_reg;
assign sum = sum_reg;

// Adder interconnect.
wire [511:0] z;        // 32-bit sum slices of all adders, concatenated
wire [14:0]  z_carry;  // z_carry[k] links adder k to adder k+1
// Shift Register
* This shift register is re-loaded with a "walking one" bit pattern whenever the enable
* input is active and the adder core is ready. The most significant bit [17] acts as a
* ready flag. The lower 16 bits [15:0] control the DSP blocks (Clock Enable). The intermediate
* bit [16] is required to compensate for the 1-cycle latency of the DSP blocks.
// 18-bit clock-enable shift register. The power-up value has only bit [17]
// set, so the core starts out reporting ready.
reg [17:0] ce_shreg = 18'h2_0000;

// Ready flag is simply the top bit of the shift register.
assign rdy = ce_shreg[17];
// Shift Register Logic
//   busy  (rdy low)        : move the single set bit one position up
//   ready (rdy high) + ena : reload with the set bit at position [0]
//   ready, no ena          : hold
always @(posedge clk) begin
    if (!rdy) begin
        ce_shreg <= ce_shreg << 1;
    end else if (ena) begin
        ce_shreg <= 18'd1;
    end
end
// Output Register Logic
// Capture the concatenated adder outputs while shift register bit [16] is set;
// sum_reg keeps its previous value at all other times.
always @(posedge clk) begin
    if (ce_shreg[16]) begin
        sum_reg <= z;
    end
end
// LSB Adder
// Adds bits [31:0] of x and y. This is the start of the carry chain, so its
// carry input is tied low. The adder_s6 module is declared outside this chunk.
adder_s6 adder_s6_lsb
.clk (clk), // shared core clock
.ce (ce_shreg[0]), // clock enable [0], first position of the walking one
.a (x[ 31: 0]), // lowest 32 bits of x
.b (y[ 31: 0]), // lowest 32 bits of y
.s (z[ 31: 0]), // lowest 32 bits of the concatenated result
.c_in (1'b0), // carry input tied to 0
.c_out (z_carry[0]) // carry[0] to next adder
// MSB Adder
// Adds bits [511:480] of x and y. This is the end of the carry chain: it takes
// carry[14] from the adder below and leaves its own carry output open, so any
// carry out of bit 511 is dropped.
adder_s6 adder_s6_msb
.clk (clk), // shared core clock
.ce (ce_shreg[15]), // clock enable [15], last adder position of the walking one
.a (x[511:480]), // highest 32 bits of x
.b (y[511:480]), // highest 32 bits of y
.s (z[511:480]), // highest 32 bits of the concatenated result
.c_in (z_carry[14]), // carry[14] from previous adder
.c_out () // carry output not connected
// Intermediate Adders
genvar i;
generate for (i=1; i<=14; i=i+1)
begin: gen_adder_s6
adder_s6 adder_s6_int
.clk (clk), //
.ce (ce_shreg[i]), // clock enable [1..14]
.a (x[32*i+31:32*i]), //
.b (y[32*i+31:32*i]), //
.s (z[32*i+31:32*i]), //
.c_in (z_carry[i-1]), // carry[0..13] from previous adder
.c_out (z_carry[i]) // carry[1..14] to next adder