summaryrefslogtreecommitdiff
path: root/rtl/modular_multiplier/ed25519_modular_reductor.v
diff options
context:
space:
mode:
Diffstat (limited to 'rtl/modular_multiplier/ed25519_modular_reductor.v')
-rw-r--r--rtl/modular_multiplier/ed25519_modular_reductor.v699
1 files changed, 699 insertions, 0 deletions
diff --git a/rtl/modular_multiplier/ed25519_modular_reductor.v b/rtl/modular_multiplier/ed25519_modular_reductor.v
new file mode 100644
index 0000000..5b50cb3
--- /dev/null
+++ b/rtl/modular_multiplier/ed25519_modular_reductor.v
@@ -0,0 +1,699 @@
+//------------------------------------------------------------------------------
+//
+// ed25519_modular_reductor.v
+// -----------------------------------------------------------------------------
+// Curve 25519 Modular Reductor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2018, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module ed25519_modular_reductor
+(
+ clk, rst_n,
+ ena, rdy,
+ x_addr, y_addr, y_wren,
+ x_din, y_dout
+);
+
+
+ //
+ // Constants
+ //
+ localparam integer OPERAND_NUM_WORDS = 8;
+ localparam integer WORD_COUNTER_WIDTH = 3;
+
+
+ /*
+//
+// Handy Numbers
+//
+localparam [WORD_COUNTER_WIDTH:0] WORD_INDEX_ZERO = 0;
+localparam [WORD_COUNTER_WIDTH:0] WORD_INDEX_LAST = 2 * OPERAND_NUM_WORDS - 1;
+ */
+/*
+
+ //
+ // Include Handy Functions
+ //
+function [WORD_COUNTER_WIDTH:0] WORD_INDEX_PREVIOUS_OR_LAST;
+input [WORD_COUNTER_WIDTH:0] WORD_INDEX_CURRENT;
+begin
+WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT > WORD_INDEX_ZERO) ?
+WORD_INDEX_CURRENT - 1'b1 : WORD_INDEX_LAST;
+end
+endfunction
+*/
+
+ //
+ // Ports
+ //
+ input clk; // system clock
+ input rst_n; // active-low async reset
+
+ input ena; // enable input
+ output rdy; // ready output
+
+ output [WORD_COUNTER_WIDTH :0] x_addr; // index of current X word
+ output [WORD_COUNTER_WIDTH-1:0] p_addr; // index of current P word
+
+ output y_wren; // store current Y word now
+
+ input [31:0] x_din; // current word of X
+ output [31:0] y_dout; // current word of Y
+
+/*
+//
+// Word Indices
+//
+reg [WORD_COUNTER_WIDTH:0] index_x;
+
+
+// map registers to output ports
+assign x_addr = index_x;
+*/
+
+ //
+ // FSM
+ //
+ localparam integer FSM_SHREG_WIDTH = 2;//(2 * OPERAND_NUM_WORDS + 1) + (5 * 2) + 1;
+
+ localparam FSM_SHREG_INIT = {{(FSM_SHREG_WIDTH-1){1'b0}}, 1'b1};
+
+ reg [FSM_SHREG_WIDTH-1:0] fsm_shreg = FSM_SHREG_INIT;
+
+ assign rdy = fsm_shreg[0];
+
+ /*
+wire [2 * OPERAND_NUM_WORDS - 1:0] fsm_shreg_inc_index_x = fsm_shreg[FSM_SHREG_WIDTH - 0*OPERAND_NUM_WORDS - 1 -: 2 * OPERAND_NUM_WORDS];
+wire [2 * OPERAND_NUM_WORDS - 1:0] fsm_shreg_store_word_z = fsm_shreg[FSM_SHREG_WIDTH - 0*OPERAND_NUM_WORDS - 2 -: 2 * OPERAND_NUM_WORDS];
+wire [2 * 5 - 1:0] fsm_shreg_reduce_stages = fsm_shreg[ 1 +: 2 * 5];
+
+wire [5-1:0] fsm_shreg_reduce_stage_start;
+wire [5-1:0] fsm_shreg_reduce_stage_stop;
+
+genvar s;
+generate for (s=0; s<5; s=s+1)
+begin : gen_fsm_shreg_reduce_stages
+assign fsm_shreg_reduce_stage_start[5 - (s + 1)] = fsm_shreg_reduce_stages[2 * (5 - s) - 1];
+assign fsm_shreg_reduce_stage_stop[5 - (s + 1)] = fsm_shreg_reduce_stages[2 * (5 - s) - 2];
+end
+endgenerate
+
+wire inc_index_x = |fsm_shreg_inc_index_x;
+wire store_word_z = |fsm_shreg_store_word_z;
+wire reduce_start = |fsm_shreg_reduce_stage_start;
+wire reduce_stop = |fsm_shreg_reduce_stage_stop;
+wire store_p = fsm_shreg_reduce_stage_stop[0];
+
+
+wire reduce_adder0_done;
+wire reduce_adder1_done;
+wire reduce_subtractor_done;
+
+wire reduce_done_all = reduce_adder0_done & reduce_adder1_done & reduce_subtractor_done;
+
+*/
+ always @(posedge clk or negedge rst_n)
+ //
+ if (rst_n == 1'b0)
+ //
+ fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+ //
+ else begin
+ //
+ if (rdy)
+ //
+ fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+ //
+ else /*if (!reduce_stop || reduce_done_all)*/
+ //
+ fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+ //
+ end
+
+/*
+//
+// Word Index Increment Logic
+//
+always @(posedge clk)
+//
+if (rdy)
+//
+index_x <= WORD_INDEX_LAST;
+//
+else if (inc_index_x)
+//
+index_x <= WORD_INDEX_PREVIOUS_OR_LAST(index_x);
+
+
+//
+// Look-up Table
+//
+
+//
+// Take a look at the corresponding C model for more information
+// on how exactly the math behind reduction works. The first step
+// is to assemble nine 256-bit values ("z-words") from 32-bit parts
+// of the full 512-bit product ("c-word"). The problem with z5 is
+// that it contains c13 two times. This implementation scans from
+// c15 to c0 and writes current part of c-word into corresponding
+// parts of z-words. Since those 32-bit parts are stored in block
+// memories, one source word can only be written to one location in
+// every z-word at a time. The trick is to delay c13 and then write
+// the delayed value at the corresponding location in z5 instead of
+// the next c12. "z_save" flag is used to indicate that the current
+// word should be delayed and written once again during the next cycle.
+//
+
+reg [9*WORD_COUNTER_WIDTH-1:0] z_addr; //
+reg [9 -1:0] z_wren; //
+reg [9 -1:0] z_mask; // mask input to store zero word
+reg [9 -1:0] z_save; // save previous word once again
+
+always @(posedge clk)
+//
+if (inc_index_x)
+//
+case (index_x)
+//
+// s9 s8 s7 s6 s5 s4 s3 s2 s1
+// || || || || || || || || ||
+4'd00: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd00};
+4'd01: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd01};
+4'd02: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd02};
+4'd03: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd03};
+4'd04: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd04};
+4'd05: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd05};
+4'd06: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd06};
+4'd07: z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd07};
+4'd08: z_addr <= {3'd02, 3'd03, 3'd04, 3'd06, 3'd07, 3'd00, 3'd00, 3'd00, 3'dxx};
+4'd09: z_addr <= {3'd03, 3'd04, 3'd06, 3'd03, 3'd00, 3'd01, 3'd01, 3'd01, 3'dxx};
+4'd10: z_addr <= {3'd04, 3'd05, 3'd05, 3'd07, 3'd01, 3'd02, 3'd02, 3'd02, 3'dxx};
+4'd11: z_addr <= {3'd05, 3'd06, 3'd07, 3'd00, 3'd02, 3'd03, 3'd07, 3'd03, 3'dxx};
+4'd12: z_addr <= {3'd06, 3'd07, 3'd00, 3'd01, 3'd06, 3'd04, 3'd03, 3'd04, 3'dxx};
+4'd13: z_addr <= {3'd07, 3'd00, 3'd01, 3'd02, 3'd03, 3'd05, 3'd04, 3'd05, 3'dxx};
+4'd14: z_addr <= {3'd00, 3'd01, 3'd02, 3'd04, 3'd04, 3'd06, 3'd05, 3'd06, 3'dxx};
+4'd15: z_addr <= {3'd01, 3'd02, 3'd03, 3'd05, 3'd05, 3'd07, 3'd06, 3'd07, 3'dxx};
+//
+default: z_addr <= {9*WORD_COUNTER_WIDTH{1'bX}};
+//
+endcase
+
+always @(posedge clk)
+//
+case (index_x)
+//
+// 9 8 7 6 5 4 3 2 1
+// | | | | | | | | |
+4'd00: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd01: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd02: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd03: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd04: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd05: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd06: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd07: z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+4'd08: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd09: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd10: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd11: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd12: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd13: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd14: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+4'd15: z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+//
+default: z_wren <= {9{1'b0}};
+//
+endcase
+
+always @(posedge clk)
+//
+if (inc_index_x)
+//
+case (index_x)
+//
+// 9 8 7 6 5 4 3 2 1
+// | | | | | | | | |
+4'd00: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd01: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd02: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd03: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd04: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd05: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd06: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd07: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd08: z_mask <= {1'b1, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+4'd09: z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+4'd10: z_mask <= {1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+4'd11: z_mask <= {1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0, 1'b0};
+4'd12: z_mask <= {1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
+4'd13: z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
+4'd14: z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd15: z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+//
+default: z_mask <= {9{1'bX}};
+//
+endcase
+
+always @(posedge clk)
+//
+if (inc_index_x)
+//
+case (index_x)
+//
+// 9 8 7 6 5 4 3 2 1
+// | | | | | | | | |
+4'd00: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd01: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd02: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd03: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd04: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd05: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd06: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd07: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd08: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd09: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd10: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd11: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd12: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd13: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd14: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+4'd15: z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+//
+default: z_save <= {9{1'bX}};
+//
+endcase
+
+
+//
+// Intermediate Numbers
+//
+reg [WORD_COUNTER_WIDTH-1:0] reduce_z_addr[1:9];
+wire [ 32-1:0] reduce_z_dout[1:9];
+
+reg [31: 0] x_din_dly;
+always @(posedge clk)
+//
+x_din_dly <= x_din;
+
+
+genvar z;
+generate for (z=1; z<=9; z=z+1)
+//
+begin : gen_z_bram
+//
+bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+bram_c_inst
+(
+.clk (clk),
+
+.a_addr (z_addr[(z-1) * WORD_COUNTER_WIDTH +: WORD_COUNTER_WIDTH]),
+.a_wr (z_wren[z-1] & store_word_z),
+.a_in (z_mask[z-1] ? {32{1'b0}} : (z_save[z-1] ? x_din_dly : x_din)),
+.a_out (),
+
+.b_addr (reduce_z_addr[z]),
+.b_out (reduce_z_dout[z])
+);
+//
+end
+//
+endgenerate
+
+
+
+
+wire [ 32-1:0] bram_sum0_wr_din;
+wire [WORD_COUNTER_WIDTH-1:0] bram_sum0_wr_addr;
+wire bram_sum0_wr_wren;
+
+wire [ 32-1:0] bram_sum1_wr_din;
+wire [WORD_COUNTER_WIDTH-1:0] bram_sum1_wr_addr;
+wire bram_sum1_wr_wren;
+
+wire [ 32-1:0] bram_diff_wr_din;
+wire [WORD_COUNTER_WIDTH-1:0] bram_diff_wr_addr;
+wire bram_diff_wr_wren;
+
+wire [ 32-1:0] bram_sum0_rd_dout;
+reg [WORD_COUNTER_WIDTH-1:0] bram_sum0_rd_addr;
+
+wire [ 32-1:0] bram_sum1_rd_dout;
+reg [WORD_COUNTER_WIDTH-1:0] bram_sum1_rd_addr;
+
+wire [ 32-1:0] bram_diff_rd_dout;
+reg [WORD_COUNTER_WIDTH-1:0] bram_diff_rd_addr;
+
+
+bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+bram_sum0_inst
+(
+.clk (clk),
+
+.a_addr (bram_sum0_wr_addr),
+.a_wr (bram_sum0_wr_wren),
+.a_in (bram_sum0_wr_din),
+.a_out (),
+
+.b_addr (bram_sum0_rd_addr),
+.b_out (bram_sum0_rd_dout)
+);
+
+bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+bram_sum1_inst
+(
+.clk (clk),
+
+.a_addr (bram_sum1_wr_addr),
+.a_wr (bram_sum1_wr_wren),
+.a_in (bram_sum1_wr_din),
+.a_out (),
+
+.b_addr (bram_sum1_rd_addr),
+.b_out (bram_sum1_rd_dout)
+);
+
+bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+bram_diff_inst
+(
+.clk (clk),
+
+.a_addr (bram_diff_wr_addr),
+.a_wr (bram_diff_wr_wren),
+.a_in (bram_diff_wr_din),
+.a_out (),
+
+.b_addr (bram_diff_rd_addr),
+.b_out (bram_diff_rd_dout)
+);
+
+
+wire [WORD_COUNTER_WIDTH-1:0] adder0_ab_addr;
+wire [WORD_COUNTER_WIDTH-1:0] adder1_ab_addr;
+wire [WORD_COUNTER_WIDTH-1:0] subtractor_ab_addr;
+
+reg [ 32-1:0] adder0_a_din;
+reg [ 32-1:0] adder0_b_din;
+
+reg [ 32-1:0] adder1_a_din;
+reg [ 32-1:0] adder1_b_din;
+
+reg [ 32-1:0] subtractor_a_din;
+reg [ 32-1:0] subtractor_b_din;
+
+// n_addr - only 1 output, because all modules are in sync
+
+modular_adder #
+(
+.OPERAND_NUM_WORDS (OPERAND_NUM_WORDS),
+.WORD_COUNTER_WIDTH (WORD_COUNTER_WIDTH)
+)
+adder_inst0
+(
+.clk (clk),
+.rst_n (rst_n),
+
+.ena (reduce_start),
+.rdy (reduce_adder0_done),
+
+.ab_addr (adder0_ab_addr),
+.n_addr (),
+.s_addr (bram_sum0_wr_addr),
+.s_wren (bram_sum0_wr_wren),
+
+.a_din (adder0_a_din),
+.b_din (adder0_b_din),
+.n_din (n_din),
+.s_dout (bram_sum0_wr_din)
+);
+
+modular_adder #
+(
+.OPERAND_NUM_WORDS (OPERAND_NUM_WORDS),
+.WORD_COUNTER_WIDTH (WORD_COUNTER_WIDTH)
+)
+adder_inst1
+(
+.clk (clk),
+.rst_n (rst_n),
+
+.ena (reduce_start),
+.rdy (reduce_adder1_done),
+
+.ab_addr (adder1_ab_addr),
+.n_addr (),
+.s_addr (bram_sum1_wr_addr),
+.s_wren (bram_sum1_wr_wren),
+
+.a_din (adder1_a_din),
+.b_din (adder1_b_din),
+.n_din (n_din),
+.s_dout (bram_sum1_wr_din)
+);
+
+modular_subtractor #
+(
+.OPERAND_NUM_WORDS (OPERAND_NUM_WORDS),
+.WORD_COUNTER_WIDTH (WORD_COUNTER_WIDTH)
+)
+subtractor_inst
+(
+.clk (clk),
+.rst_n (rst_n),
+
+.ena (reduce_start),
+.rdy (reduce_subtractor_done),
+
+.ab_addr (subtractor_ab_addr),
+.n_addr (n_addr),
+.d_addr (bram_diff_wr_addr),
+.d_wren (bram_diff_wr_wren),
+
+.a_din (subtractor_a_din),
+.b_din (subtractor_b_din),
+.n_din (n_din),
+.d_dout (bram_diff_wr_din)
+);
+
+
+//
+// Address (Operand) Selector
+//
+always @(*)
+//
+case (fsm_shreg_reduce_stage_stop)
+//
+5'b10000: begin
+reduce_z_addr[1] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[2] = adder0_ab_addr;
+reduce_z_addr[3] = adder1_ab_addr;
+reduce_z_addr[4] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[5] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[6] = subtractor_ab_addr;
+reduce_z_addr[7] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[8] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[9] = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum0_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum1_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+end
+//
+5'b01000: begin
+reduce_z_addr[1] = adder0_ab_addr;
+reduce_z_addr[2] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[3] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[4] = adder1_ab_addr;
+reduce_z_addr[5] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[6] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[7] = subtractor_ab_addr;
+reduce_z_addr[8] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[9] = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum0_rd_addr = adder0_ab_addr;
+bram_sum1_rd_addr = adder1_ab_addr;
+bram_diff_rd_addr = subtractor_ab_addr;
+end
+//
+5'b00100: begin
+reduce_z_addr[1] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[2] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[3] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[4] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[5] = adder0_ab_addr;
+reduce_z_addr[6] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[7] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[8] = subtractor_ab_addr;
+reduce_z_addr[9] = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum0_rd_addr = adder0_ab_addr;
+bram_sum1_rd_addr = adder1_ab_addr;
+bram_diff_rd_addr = subtractor_ab_addr;
+end
+//
+5'b00010: begin
+reduce_z_addr[1] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[2] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[3] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[4] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[5] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[6] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[7] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[8] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[9] = subtractor_ab_addr;
+bram_sum0_rd_addr = adder0_ab_addr;
+bram_sum1_rd_addr = adder0_ab_addr;
+bram_diff_rd_addr = subtractor_ab_addr;
+end
+//
+5'b00001: begin
+reduce_z_addr[1] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[2] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[3] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[4] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[5] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[6] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[7] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[8] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[9] = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum0_rd_addr = adder0_ab_addr;
+bram_sum1_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+bram_diff_rd_addr = adder0_ab_addr;
+end
+//
+default: begin
+reduce_z_addr[1] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[2] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[3] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[4] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[5] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[6] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[7] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[8] = {WORD_COUNTER_WIDTH{1'bX}};
+reduce_z_addr[9] = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum0_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+bram_sum1_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+end
+//
+endcase
+
+
+//
+// adder 0
+//
+always @(*) begin
+//
+case (fsm_shreg_reduce_stage_stop)
+5'b10000: adder0_a_din = reduce_z_dout[2];
+5'b01000: adder0_a_din = bram_sum0_rd_dout;
+5'b00100: adder0_a_din = bram_sum0_rd_dout;
+5'b00010: adder0_a_din = bram_sum0_rd_dout;
+5'b00001: adder0_a_din = bram_sum0_rd_dout;
+default: adder0_a_din = {32{1'bX}};
+endcase
+//
+case (fsm_shreg_reduce_stage_stop)
+5'b10000: adder0_b_din = reduce_z_dout[2];
+5'b01000: adder0_b_din = reduce_z_dout[1];
+5'b00100: adder0_b_din = reduce_z_dout[5];
+5'b00010: adder0_b_din = bram_sum1_rd_dout;
+5'b00001: adder0_b_din = bram_diff_rd_dout;
+default: adder0_b_din = {32{1'bX}};
+endcase
+//
+end
+
+//
+// adder 1
+//
+always @(*) begin
+//
+case (fsm_shreg_reduce_stage_stop)
+5'b10000: adder1_a_din = reduce_z_dout[3];
+5'b01000: adder1_a_din = bram_sum1_rd_dout;
+5'b00100: adder1_a_din = bram_sum1_rd_dout;
+5'b00010: adder1_a_din = {32{1'bX}};
+5'b00001: adder1_a_din = {32{1'bX}};
+default: adder1_a_din = {32{1'bX}};
+endcase
+//
+case (fsm_shreg_reduce_stage_stop)
+5'b10000: adder1_b_din = reduce_z_dout[3];
+5'b01000: adder1_b_din = reduce_z_dout[4];
+5'b00100: adder1_b_din = {32{1'b0}};
+5'b00010: adder1_b_din = {32{1'bX}};
+5'b00001: adder1_b_din = {32{1'bX}};
+default: adder1_b_din = {32{1'bX}};
+endcase
+//
+end
+
+
+//
+// subtractor
+//
+always @(*) begin
+//
+case (fsm_shreg_reduce_stage_stop)
+5'b10000: subtractor_a_din = {32{1'b0}};
+5'b01000: subtractor_a_din = bram_diff_rd_dout;
+5'b00100: subtractor_a_din = bram_diff_rd_dout;
+5'b00010: subtractor_a_din = bram_diff_rd_dout;
+5'b00001: subtractor_a_din = {32{1'bX}};
+default: subtractor_a_din = {32{1'bX}};
+endcase
+//
+case (fsm_shreg_reduce_stage_stop)
+5'b10000: subtractor_b_din = reduce_z_dout[6];
+5'b01000: subtractor_b_din = reduce_z_dout[7];
+5'b00100: subtractor_b_din = reduce_z_dout[8];
+5'b00010: subtractor_b_din = reduce_z_dout[9];
+5'b00001: subtractor_b_din = {32{1'bX}};
+default: subtractor_b_din = {32{1'bX}};
+endcase
+//
+end
+
+
+//
+// Address Mapping
+//
+assign p_addr = bram_sum0_wr_addr;
+assign p_wren = bram_sum0_wr_wren & store_p;
+assign p_dout = bram_sum0_wr_din;
+*/
+
+
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------