From 89f913c3aa2a6dad35630f3882a06b99e0978105 Mon Sep 17 00:00:00 2001
From: Rob Austein <sra@hactrn.net>
Date: Tue, 7 Mar 2017 19:52:36 -0500
Subject: Promote to a repository in the core tree.

Change name of reset signal from rst_n to reset_n for consistency with
other Cryptech cores.

Code common between this core and the ecdsa384 core split out into a
separate library repository.

Minor cleanup (Windows-isms, indentation).
---
 rtl/modular/modular_adder.v                        |  298 -----
 .../modular_invertor/helper/modinv_helper_copy.v   |  148 ---
 .../modular_invertor/helper/modinv_helper_init.v   |  172 ---
 .../helper/modinv_helper_invert_compare.v          |  286 ----
 .../helper/modinv_helper_invert_precalc.v          |  408 ------
 .../helper/modinv_helper_invert_update.v           |  257 ----
 .../helper/modinv_helper_reduce_precalc.v          |  328 -----
 .../helper/modinv_helper_reduce_update.v           |  153 ---
 rtl/modular/modular_invertor/modinv_clog2.v        |   10 -
 rtl/modular/modular_invertor/modular_invertor.v    |  981 --------------
 rtl/modular/modular_multiplier_256.v               |  804 ++++++------
 rtl/modular/modular_reductor_256.v                 | 1384 ++++++++++----------
 rtl/modular/modular_subtractor.v                   |  292 -----
 13 files changed, 1094 insertions(+), 4427 deletions(-)
 delete mode 100644 rtl/modular/modular_adder.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_copy.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_init.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v
 delete mode 100644 rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v
 delete mode 100644 rtl/modular/modular_invertor/modinv_clog2.v
 delete mode 100644 rtl/modular/modular_invertor/modular_invertor.v
 delete mode 100644 rtl/modular/modular_subtractor.v

(limited to 'rtl/modular')

diff --git a/rtl/modular/modular_adder.v b/rtl/modular/modular_adder.v
deleted file mode 100644
index 5641feb..0000000
--- a/rtl/modular/modular_adder.v
+++ /dev/null
@@ -1,298 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// modular_adder.v
-// -----------------------------------------------------------------------------
-// Modular adder.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-module modular_adder
-	(
-		clk, rst_n,
-		ena, rdy,
-		ab_addr, n_addr, s_addr, s_wren,
-		a_din, b_din, n_din, s_dout
-	);
-
-
-		//
-		// Parameters
-		//
-	parameter	OPERAND_NUM_WORDS		= 8;
-	parameter	WORD_COUNTER_WIDTH	= 3;
-	
-	
-		//
-		// Handy Numbers
-		//
-	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
-	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
-	
-	
-		//
-		// Handy Functions
-		//
-	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
-		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
-		begin
-			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
-				WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
-		end
-	endfunction
-	
-	
-		//
-		// Ports
-		//
-	input		wire										clk;			// system clock
-	input		wire										rst_n;		// active-low async reset
-	
-	input		wire										ena;			// enable input
-	output	wire										rdy;			// ready output
-	
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	ab_addr;		// index of current A and B words
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;		// index of current N word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	s_addr;		// index of current S word
-	output	wire										s_wren;		// store current S word now
-	
-	input		wire	[                  31:0]	a_din;		// A
-	input		wire	[                  31:0]	b_din;		// B
-	input		wire	[                  31:0]	n_din;		// N
-	output	wire	[                  31:0]	s_dout;		// S = (A + B) mod N
-	
-	
-		//
-		// Word Indices
-		//
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_ab;
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_n;
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_s;
-
-		/* map registers to output ports */
-	assign ab_addr	= index_ab;
-	assign n_addr	= index_n;
-	assign s_addr	= index_s;
-
-
-		//
-		// Adder
-		//
-	wire	[31: 0]	add32_s;
-	wire				add32_c_in;
-	wire				add32_c_out;
-	
-	adder32_wrapper adder32
-	(
-		.clk		(clk),
-		.a			(a_din),
-		.b			(b_din),
-		.s			(add32_s),
-		.c_in		(add32_c_in),
-		.c_out	(add32_c_out)
-	);
-	
-	
-		//
-		// Subtractor
-		//
-	wire	[31: 0]	sub32_d;
-	wire				sub32_b_in;
-	wire				sub32_b_out;
-	
-	subtractor32_wrapper subtractor32
-	(
-		.clk		(clk),
-		.a			(add32_s),
-		.b			(n_din),
-		.d			(sub32_d),
-		.b_in		(sub32_b_in),
-		.b_out	(sub32_b_out)
-	);
-	
-	
-		//
-		// FSM
-		//
-		
-	localparam FSM_SHREG_WIDTH = 2*OPERAND_NUM_WORDS + 5;
-	
-	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
-	
-	assign rdy = fsm_shreg[0];
-	
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_n		= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_sum_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_sum_ab_n	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_data_s	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 3)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_s		= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 4)];
-	
-	wire fsm_latch_msb_carry	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];
-	wire fsm_latch_msb_borrow	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3)];
-	
-	wire inc_index_ab		= |fsm_shreg_inc_index_ab;
-	wire inc_index_n		= |fsm_shreg_inc_index_n;
-	wire store_sum_ab		= |fsm_shreg_store_sum_ab;
-	wire store_sum_ab_n	= |fsm_shreg_store_sum_ab_n;
-	wire store_data_s		= |fsm_shreg_store_data_s;
-	wire inc_index_s		= |fsm_shreg_inc_index_s;
-	
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)
-			//
-			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
-			//
-		else begin
-			//
-			if (rdy)	fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
-			//
-			else		fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
-			//
-		end
-		
-		
-		
-		
-	
-	
-	
-		//
-		// Carry & Borrow Masking Logic
-		//
-	reg	add32_c_mask;
-	reg	sub32_b_mask;
-	
-	always @(posedge clk) begin
-		//
-		add32_c_mask <= (index_ab == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
-		sub32_b_mask <= (index_n  == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
-		//
-	end
-		
-	assign add32_c_in = add32_c_out & ~add32_c_mask;
-	assign sub32_b_in = sub32_b_out & ~sub32_b_mask;
-	
-	
-		//
-		// Carry & Borrow Latch Logic
-		//
-	reg add32_carry_latch;
-	reg sub32_borrow_latch;
-	
-	always @(posedge clk) begin
-		//
-		if (fsm_latch_msb_carry) add32_carry_latch <= add32_c_out;
-		if (fsm_latch_msb_borrow) sub32_borrow_latch <= sub32_b_out;
-		//
-	end
-
-		
-		//
-		// Intermediate Results
-		//
-	reg	[32*OPERAND_NUM_WORDS-1:0]		s_ab;
-	reg	[32*OPERAND_NUM_WORDS-1:0]		s_ab_n;
-	
-	always @(posedge clk)
-		//
-		if (store_data_s) begin
-			//
-			s_ab		<= {{32{1'bX}}, s_ab[32*OPERAND_NUM_WORDS-1:32]};
-			s_ab_n	<= {{32{1'bX}}, s_ab_n[32*OPERAND_NUM_WORDS-1:32]};		
-			//
-		end else begin
-			//
-			if (store_sum_ab) s_ab <= {add32_s, s_ab[32*OPERAND_NUM_WORDS-1:32]};
-			if (store_sum_ab_n) s_ab_n <= {sub32_d, s_ab_n[32*OPERAND_NUM_WORDS-1:32]};
-			//
-		end
-	
-	
-		//
-		// Word Index Increment Logic
-		//
-	always @(posedge clk)
-		//
-		if (rdy) begin
-			//
-			index_ab		<= WORD_INDEX_ZERO;
-			index_n		<= WORD_INDEX_ZERO;
-			index_s		<= WORD_INDEX_ZERO;
-			//
-		end else begin
-			//
-			if (inc_index_ab) index_ab <= WORD_INDEX_NEXT_OR_ZERO(index_ab);
-			if (inc_index_n)	index_n	<= WORD_INDEX_NEXT_OR_ZERO(index_n);
-			if (inc_index_s)	index_s	<= WORD_INDEX_NEXT_OR_ZERO(index_s);
-			//
-		end
-	
-	
-			//
-			// Output Sum Selector
-			//
-	wire	mux_select_ab = sub32_borrow_latch && !add32_carry_latch;
-			
-	
-			//
-			// Output Data and Write Enable Logic
-			//
-	reg				s_wren_reg;
-	reg	[31: 0]	s_dout_reg;
-	wire	[31: 0]	s_dout_mux = mux_select_ab ? s_ab[31:0] : s_ab_n[31:0];
-	
-	assign s_wren = s_wren_reg;
-	assign s_dout = s_dout_reg;
-	
-	always @(posedge clk)
-		//
-		if (rdy) begin
-			//
-			s_wren_reg	<= 1'b0;
-			s_dout_reg	<= {32{1'bX}};
-			//
-		end else begin
-			//
-			s_wren_reg <= store_data_s;
-			s_dout_reg <= store_data_s ? s_dout_mux : {32{1'bX}};
-			//
-		end			
-
-	
-endmodule
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_copy.v b/rtl/modular/modular_invertor/helper/modinv_helper_copy.v
deleted file mode 100644
index 07c1b4f..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_copy.v
+++ /dev/null
@@ -1,148 +0,0 @@
-`timescale 1ns / 1ps
-
-module modinv_helper_copy
-	(
-		clk, rst_n,
-		ena, rdy,
-		s_addr,  s_din,
-		a1_addr,        a1_wren, a1_dout
-	);
-	
-	
-		//
-		// Parameters
-		//
-	parameter OPERAND_NUM_WORDS	= 8;
-	parameter OPERAND_ADDR_BITS	= 3;
-	
-	parameter BUFFER_NUM_WORDS		= 9;
-	parameter BUFFER_ADDR_BITS		= 4;
-	
-	
-		//
-		// clog2
-		//
-`include "..\modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		//
-	localparam PROC_NUM_CYCLES	= OPERAND_NUM_WORDS + 2;
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
-	
-	
-		//
-		// Ports
-		//
-	input		wire									clk;
-	input		wire									rst_n;
-	
-	input		wire									ena;
-	output	wire									rdy;
-
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	s_addr;
-	output	wire	[OPERAND_ADDR_BITS-1:0]	a1_addr;
-	
-	output	wire									a1_wren;
-	
-	input		wire	[                 31:0]	s_din;
-
-	output	wire	[                 31:0]	a1_dout;
-
-
-		//
-		// Counter
-		//
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses
-		//
-	reg	[OPERAND_ADDR_BITS-1:0]	addr_s;
-
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_s_max		= OPERAND_NUM_WORDS - 1;
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_s_zero		= {OPERAND_ADDR_BITS{1'b0}};
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_s_next		= (addr_s < addr_s_max) ?
-																		addr_s + 1'b1 : addr_s_zero;
-																		
-	reg	[OPERAND_ADDR_BITS-1:0]	addr_a1;
-	
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_a1_max		= OPERAND_NUM_WORDS - 1;
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_a1_zero	= {OPERAND_ADDR_BITS{1'b0}};
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_a1_next	= (addr_a1 < addr_a1_max) ?
-																		addr_a1 + 1'b1 : addr_a1_zero;
-																		
-	assign s_addr  = {{(BUFFER_ADDR_BITS - OPERAND_ADDR_BITS){1'b0}}, addr_s};
-	assign a1_addr = addr_a1;
-	
-		
-		//
-		// Ready Flag
-		//
-	assign rdy = (proc_cnt == proc_cnt_zero);
-	
-	
-		//
-		// Address Increment Logic
-		//
-	wire	inc_addr_s;
-	wire	inc_addr_a1;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_s_start		= 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_s_stop		= OPERAND_NUM_WORDS + 0;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_a1_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_a1_stop		= OPERAND_NUM_WORDS + 1;
-
-	assign inc_addr_s		= (proc_cnt >= cnt_inc_addr_s_start)  && (proc_cnt <= cnt_inc_addr_s_stop);
-	assign inc_addr_a1	= (proc_cnt >= cnt_inc_addr_a1_start) && (proc_cnt <= cnt_inc_addr_a1_stop);
-	
-	always @(posedge clk) begin
-		//
-		if (inc_addr_s)	addr_s <= addr_s_next;
-		else					addr_s <= addr_s_zero;
-		//
-		if (inc_addr_a1)	addr_a1 <= addr_a1_next;
-		else					addr_a1 <= addr_a1_zero;
-		//
-	end
-	
-	
-		//
-		// Write Enable Logic
-		//
-	wire	wren_a1;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_a1_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_a1_stop	= OPERAND_NUM_WORDS + 1;
-
-	assign wren_a1 = (proc_cnt >= cnt_wren_a1_start) && (proc_cnt <= cnt_wren_a1_stop);
-
-	assign a1_wren = wren_a1;
-	
-	
-		//
-		// Data Logic
-		//
-	assign a1_dout = s_din;
-	
-	
-		//
-		// Primary Counter Logic
-		//
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-
-
-endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_init.v b/rtl/modular/modular_invertor/helper/modinv_helper_init.v
deleted file mode 100644
index 0468134..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_init.v
+++ /dev/null
@@ -1,172 +0,0 @@
-`timescale 1ns / 1ps
-
-module modinv_helper_init
-	(
-		clk, rst_n,
-		ena, rdy,
-		a_addr, a_din,
-		q_addr, q_din,
-		r_addr, r_wren, r_dout,
-		s_addr, s_wren, s_dout,
-		u_addr, u_wren, u_dout,
-		v_addr, v_wren, v_dout
-	);
-	
-	
-		//
-		// Parameters
-		//
-	parameter OPERAND_NUM_WORDS	= 8;
-	parameter OPERAND_ADDR_BITS	= 3;
-	
-	parameter BUFFER_NUM_WORDS		= 9;
-	parameter BUFFER_ADDR_BITS		= 4;
-	
-	
-		//
-		// clog2
-		//
-`include "..\modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		//
-	localparam PROC_NUM_CYCLES	= OPERAND_NUM_WORDS + 3;
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
-	
-	
-		//
-		// Ports
-		//
-	input		wire									clk;
-	input		wire									rst_n;
-	input		wire									ena;
-	output	wire									rdy;
-
-	output	wire	[OPERAND_ADDR_BITS-1:0]	a_addr;
-	output	wire	[OPERAND_ADDR_BITS-1:0]	q_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	r_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	s_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	u_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	v_addr;
-	
-	output	wire									r_wren;
-	output	wire									s_wren;
-	output	wire									u_wren;
-	output	wire									v_wren;
-	
-	input		wire	[                 31:0]	a_din;
-	input		wire	[                 31:0]	q_din;
-	output	wire	[                 31:0]	r_dout;
-	output	wire	[                 31:0]	s_dout;
-	output	wire	[                 31:0]	u_dout;
-	output	wire	[                 31:0]	v_dout;
-
-
-		//
-		// Counter
-		//
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses
-		//
-	reg	[OPERAND_ADDR_BITS-1:0]	addr_aq;
-
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_aq_max		= OPERAND_NUM_WORDS - 1;
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_aq_zero	= {OPERAND_ADDR_BITS{1'b0}};
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_aq_next	= (addr_aq < addr_aq_max) ?
-																		addr_aq + 1'b1 : addr_aq_zero;
-																		
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_rsuv;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_rsuv_max	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_rsuv_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_rsuv_next	= (addr_rsuv < addr_rsuv_max) ?
-																		addr_rsuv + 1'b1 : addr_rsuv_zero;
-																		
-	assign a_addr = addr_aq;
-	assign q_addr = addr_aq;
-	
-	assign r_addr = addr_rsuv;
-	assign s_addr = addr_rsuv;
-	assign u_addr = addr_rsuv;
-	assign v_addr = addr_rsuv;
-	
-		
-		//
-		// Ready Flag
-		//
-	assign rdy = (proc_cnt == proc_cnt_zero);
-	
-	
-		//
-		// Address Increment Logic
-		//
-	wire	inc_addr_aq;
-	wire	inc_addr_rsuv;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_aq_start	= 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_aq_stop		= OPERAND_NUM_WORDS;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_rsuv_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_rsuv_stop	= BUFFER_NUM_WORDS + 1;
-
-	assign inc_addr_aq   = (proc_cnt >= cnt_inc_addr_aq_start)   && (proc_cnt <= cnt_inc_addr_aq_stop);
-	assign inc_addr_rsuv = (proc_cnt >= cnt_inc_addr_rsuv_start) && (proc_cnt <= cnt_inc_addr_rsuv_stop);
-	
-	always @(posedge clk) begin
-		//
-		if (inc_addr_aq)	addr_aq <= addr_aq_next;
-		else					addr_aq <= addr_aq_zero;
-		//
-		if (inc_addr_rsuv)	addr_rsuv <= addr_rsuv_next;
-		else						addr_rsuv <= addr_rsuv_zero;
-		//
-	end
-	
-	
-		//
-		// Write Enable Logic
-		//
-	wire	wren_rsuv;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_rsuv_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_rsuv_stop	= BUFFER_NUM_WORDS + 1;
-
-	assign wren_rsuv = (proc_cnt >= cnt_wren_rsuv_start) && (proc_cnt <= cnt_wren_rsuv_stop);
-
-	assign r_wren = wren_rsuv;
-	assign s_wren = wren_rsuv;
-	assign u_wren = wren_rsuv;
-	assign v_wren = wren_rsuv;
-	
-	
-		//
-		// Data Logic
-		//
-	assign r_dout = 32'd0;
-	assign s_dout = (proc_cnt == cnt_wren_rsuv_start) ? 32'd1 : 32'd0;
-	assign u_dout = (proc_cnt != cnt_wren_rsuv_stop)  ? q_din : 32'd0;
-	assign v_dout = (proc_cnt != cnt_wren_rsuv_stop)  ? a_din : 32'd0;
-	
-	
-		//
-		// Primary Counter Logic
-		//
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-
-
-endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v b/rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v
deleted file mode 100644
index 6b65eb1..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_invert_compare.v
+++ /dev/null
@@ -1,286 +0,0 @@
-`timescale 1ns / 1ps
-
-module modinv_helper_invert_compare
-	(
-		clk, rst_n,
-		ena, rdy,
-		
-		u_addr, u_din,
-		v_addr, v_din,
-		
-		u_gt_v, v_eq_1,
-		u_is_even, v_is_even
-	);
-	
-
-		//
-		// Parameters
-		//
-	parameter BUFFER_NUM_WORDS		= 9;
-	parameter BUFFER_ADDR_BITS		= 4;
-	
-	
-		//
-		// clog2
-		//
-`include "..\modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		//
-	localparam PROC_NUM_CYCLES	= 1 * BUFFER_NUM_WORDS + 10;
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
-	
-	
-		//
-		// Ports
-		//
-	input		wire									clk;
-	input		wire									rst_n;
-	input		wire									ena;
-	output	wire									rdy;
-
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
-		
-	input		wire	[              32-1:0]	u_din;
-	input		wire	[              32-1:0]	v_din;
-		
-	output	wire									u_gt_v;
-	output	wire									v_eq_1;
-	output	wire									u_is_even;
-	output	wire									v_is_even;
-
-
-		//
-		// Counter
-		//
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses
-		//
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
-
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_prev	= (addr_in > addr_in_zero) ?
-																		addr_in - 1'b1 : addr_in_last;
-																			
-	assign u_addr					= addr_in;
-	assign v_addr					= addr_in;	
-	
-	
-		//
-		// Ready Flag
-		//
-	assign rdy = (proc_cnt == proc_cnt_zero);
-	
-	
-		//
-		// Address Decrement Logic
-		//
-	wire	dec_addr_in;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_start	= 0 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_stop		= 1 * BUFFER_NUM_WORDS + 0;
-	
-	assign dec_addr_in   = (proc_cnt >= cnt_dec_addr_in_start)   && (proc_cnt <= cnt_dec_addr_in_stop);
-	
-	always @(posedge clk)
-		//
-		if (rdy)						addr_in <= addr_in_last;
-		else if (dec_addr_in)	addr_in <= addr_in_prev;
-	
-	
-		//
-		// Comparison Stage Flags
-		//
-	wire	calc_leg;
-	wire	calc_leg_final;
-	wire	calc_parity;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_calc_leg_start	= 0 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_calc_leg_stop		= 1 * BUFFER_NUM_WORDS + 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_calc_parity		= 1 * BUFFER_NUM_WORDS + 1;
-	
-	assign calc_leg = (proc_cnt >= cnt_calc_leg_start) && (proc_cnt <= cnt_calc_leg_stop);
-	assign calc_leg_final = (proc_cnt == cnt_calc_leg_stop);
-	assign calc_parity = (proc_cnt == cnt_calc_parity);
-
-	
-		//
-		// Dummy Input
-		//
-	reg	sub32_din_1_lsb;
-	wire	[31: 0]	sub32_din_1 = {{31{1'b0}}, sub32_din_1_lsb};
-	
-	always @(posedge clk)
-		//
-		sub32_din_1_lsb <= (addr_in == addr_in_zero) ? 1'b1 : 1'b0;
-	
-	
-		//
-		// Subtractor (u - v)
-		//
-	wire	[31: 0]	sub32_u_minus_v_difference_out;
-	wire				sub32_u_minus_v_borrow_in;
-	wire				sub32_u_minus_v_borrow_out;
-	
-	subtractor32_wrapper sub32_u_minus_v
-	(
-		.clk		(clk),
-		.a			(u_din),
-		.b			(v_din),
-		.d			(sub32_u_minus_v_difference_out),
-		.b_in		(sub32_u_minus_v_borrow_in),
-		.b_out	(sub32_u_minus_v_borrow_out)
-	);
-	
-	
-		//
-		// Subtractor (v - 1)
-		//
-	wire	[31: 0]	sub32_v_minus_1_difference_out;
-	wire				sub32_v_minus_1_borrow_in;
-	wire				sub32_v_minus_1_borrow_out;
-	
-	subtractor32_wrapper sub32_v_minus_1
-	(
-		.clk		(clk),
-		.a			(v_din),
-		.b			(sub32_din_1),
-		.d			(sub32_v_minus_1_difference_out),
-		.b_in		(sub32_v_minus_1_borrow_in),
-		.b_out	(sub32_v_minus_1_borrow_out)
-	);
-	
-	
-	
-		//
-		// Borrow Masking Logic
-		//
-	reg	mask_borrow;
-	
-	always @(posedge clk)
-		//
-		mask_borrow <= ((proc_cnt > cnt_dec_addr_in_start) && (proc_cnt <= cnt_dec_addr_in_stop)) ?
-			1'b0 : 1'b1;
-		
-	assign sub32_u_minus_v_borrow_in = sub32_u_minus_v_borrow_out & ~mask_borrow;
-	assign sub32_v_minus_1_borrow_in = sub32_v_minus_1_borrow_out & ~mask_borrow;
-	
-		
-		//
-		// Comparison Logic
-		//
-	reg	cmp_u_v_l;
-	reg	cmp_u_v_e;
-	reg	cmp_u_v_g;
-
-	reg	cmp_v_1_l;
-	reg	cmp_v_1_e;
-	reg	cmp_v_1_g;
-
-	wire	cmp_unresolved_u_v = !(cmp_u_v_l || cmp_u_v_g);
-	wire	cmp_unresolved_v_1 = !(cmp_v_1_l || cmp_v_1_g);
-
-	wire	cmp_u_v_borrow_is_set			= (sub32_u_minus_v_borrow_out     ==  1'b1) ? 1'b1 : 1'b0;
-	wire	cmp_u_v_difference_is_nonzero	= (sub32_u_minus_v_difference_out != 32'd0) ? 1'b1 : 1'b0;
-
-	wire	cmp_v_1_borrow_is_set			= (sub32_v_minus_1_borrow_out     ==  1'b1) ? 1'b1 : 1'b0;
-	wire	cmp_v_1_difference_is_nonzero	= (sub32_v_minus_1_difference_out != 32'd0) ? 1'b1 : 1'b0;
-
-	reg	u_is_even_reg;
-	reg	v_is_even_reg;
-
-	always @(posedge clk)
-		//
-		if (rdy) begin
-			//
-			if (ena) begin
-				//
-				cmp_u_v_l		<= 1'b0;
-				cmp_u_v_e		<= 1'b0;
-				cmp_u_v_g		<= 1'b0;
-				//
-				cmp_v_1_l		<= 1'b0;
-				cmp_v_1_e		<= 1'b0;
-				cmp_v_1_g		<= 1'b0;
-				//
-				u_is_even_reg	<= 1'bX;
-				v_is_even_reg	<= 1'bX;
-				//
-			end
-			//
-		end else begin
-			//
-			// parity
-			//
-			if (calc_parity) begin
-				u_is_even_reg <= ~u_din[0];
-				v_is_even_reg <= ~v_din[0];
-			end
-			//
-			// u <> v
-			//
-			if (cmp_unresolved_u_v && calc_leg) begin
-				//
-				if (cmp_u_v_borrow_is_set)
-					cmp_u_v_l <= 1'b1;
-				//
-				if (!cmp_u_v_borrow_is_set && cmp_u_v_difference_is_nonzero)
-					cmp_u_v_g <= 1'b1;
-				//
-				if (!cmp_u_v_borrow_is_set && !cmp_u_v_difference_is_nonzero && calc_leg_final)
-					cmp_u_v_e <= 1'b1;
-				//
-			end
-			//
-			// v <> 1
-			//
-			if (cmp_unresolved_v_1 && calc_leg) begin
-				//
-				if (cmp_v_1_borrow_is_set)
-					cmp_v_1_l <= 1'b1;
-				//
-				if (!cmp_v_1_borrow_is_set && cmp_v_1_difference_is_nonzero)
-					cmp_v_1_g <= 1'b1;
-				//
-				if (!cmp_v_1_borrow_is_set && !cmp_v_1_difference_is_nonzero && calc_leg_final)
-					cmp_v_1_e <= 1'b1;
-				//
-			end			
-			//
-		end
-
-
-		//
-		// Output Flags
-		//
-	assign u_gt_v = !cmp_u_v_l && !cmp_u_v_e &&  cmp_u_v_g;
-	assign v_eq_1 = !cmp_v_1_l &&  cmp_v_1_e && !cmp_v_1_g;
-	
-	assign u_is_even = u_is_even_reg;
-	assign v_is_even = v_is_even_reg;
-
-
-		//
-		// Primary Counter Logic
-		//
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-
-
-endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v b/rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v
deleted file mode 100644
index ab15563..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_invert_precalc.v
+++ /dev/null
@@ -1,408 +0,0 @@
-`timescale 1ns / 1ps
-// Invertor helper: reads r, s, u, v word by word and writes the doubled, summed, subtracted and halved variants named in the port list.
-module modinv_helper_invert_precalc
-	(
-		clk, rst_n,
-		ena, rdy,
-		
-		r_addr, r_din,
-		s_addr, s_din,
-		u_addr, u_din,
-		v_addr, v_din,
-		
-		r_dbl_addr,          r_dbl_wren,          r_dbl_dout,
-		s_dbl_addr,          s_dbl_wren,          s_dbl_dout,
-		r_plus_s_addr,       r_plus_s_wren,       r_plus_s_dout,
-		u_half_addr,         u_half_wren,         u_half_dout,
-		v_half_addr,         v_half_wren,         v_half_dout,
-		u_minus_v_addr,      u_minus_v_wren,      u_minus_v_dout,      u_minus_v_din,
-		v_minus_u_addr,      v_minus_u_wren,      v_minus_u_dout,      v_minus_u_din,
-		u_minus_v_half_addr, u_minus_v_half_wren, u_minus_v_half_dout,
-		v_minus_u_half_addr, v_minus_u_half_wren, v_minus_u_half_dout
-	);
-	
-
-		//
-		// Parameters
-		// (buffer depth in 32-bit words, and the width of the addresses that index it)
-	parameter BUFFER_NUM_WORDS		= 9;
-	parameter BUFFER_ADDR_BITS		= 4;
-	
-	
-		//
-		// clog2
-		// (the include is expected to supply the clog2() function called below -- TODO confirm)
-`include "..\modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		// (one run takes PROC_NUM_CYCLES counter states; PROC_CNT_BITS is the counter width)
-	localparam PROC_NUM_CYCLES	= 2 * BUFFER_NUM_WORDS + 4;
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
-	
-	
-		//
-		// Ports
-		// (*_din are read data, *_dout / *_wren are write data and strobes, all words are 32 bits wide)
-	input		wire									clk;
-	input		wire									rst_n;
-	input		wire									ena;
-	output	wire									rdy;
-
-	output	wire	[BUFFER_ADDR_BITS-1:0]	r_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	s_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
-		
-	input		wire	[              32-1:0]	r_din;
-	input		wire	[              32-1:0]	s_din;
-	input		wire	[              32-1:0]	u_din;
-	input		wire	[              32-1:0]	v_din;
-		
-	output	wire	[BUFFER_ADDR_BITS-1:0]	r_dbl_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	s_dbl_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	r_plus_s_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_half_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_half_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_minus_v_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_minus_u_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_minus_v_half_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_minus_u_half_addr;
-		
-	output	wire	[              32-1:0]	r_dbl_dout;
-	output	wire	[              32-1:0]	s_dbl_dout;
-	output	wire	[              32-1:0]	r_plus_s_dout;
-	output	wire	[              32-1:0]	u_half_dout;
-	output	wire	[              32-1:0]	v_half_dout;
-	output	wire	[              32-1:0]	u_minus_v_dout;
-	output	wire	[              32-1:0]	v_minus_u_dout;
-	output	wire	[              32-1:0]	u_minus_v_half_dout;
-	output	wire	[              32-1:0]	v_minus_u_half_dout;
-		
-	output	wire									r_dbl_wren;
-	output	wire									s_dbl_wren;
-	output	wire									r_plus_s_wren;
-	output	wire									u_half_wren;
-	output	wire									v_half_wren;
-	output	wire									u_minus_v_wren;
-	output	wire									v_minus_u_wren;
-	output	wire									u_minus_v_half_wren;
-	output	wire									v_minus_u_half_wren;
-	
-	input		wire	[              32-1:0]	u_minus_v_din;
-	input		wire	[              32-1:0]	v_minus_u_din;
-	
-
-
-		//
-		// Counter
-		// (proc_cnt == 0 is the idle state; otherwise it counts up to proc_cnt_max and wraps to zero)
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses
-		// (addr_in addresses the r/s/u/v reads; addr_out1..addr_out4 are shared by the result buffers, see the assigns below)
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
-
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_next	= (addr_in < addr_in_last) ?
-																		addr_in + 1'b1 : addr_in_zero;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_prev	= (addr_in > addr_in_zero) ?
-																		addr_in - 1'b1 : addr_in_zero;
-																		
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_out1;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_next	= (addr_out1 < addr_out1_last) ?
-																		addr_out1 + 1'b1 : addr_out1_zero;
-																		
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_out2;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_next	= (addr_out2 < addr_out2_last) ?
-																		addr_out2 + 1'b1 : addr_out2_zero;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_prev	= (addr_out2 > addr_out2_zero) ?
-																		addr_out2 - 1'b1 : addr_out2_zero;
-																		
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_out3;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_prev	= (addr_out3 > addr_out3_zero) ?
-																		addr_out3 - 1'b1 : addr_out3_last;
-
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_out4;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out4_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out4_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out4_prev	= (addr_out4 > addr_out4_zero) ?
-																		addr_out4 - 1'b1 : addr_out4_last;
-
-	
-	assign r_addr					= addr_in;
-	assign s_addr					= addr_in;
-	assign u_addr					= addr_in;
-	assign v_addr					= addr_in;
-		
-	assign r_dbl_addr				= addr_out1;
-	assign s_dbl_addr				= addr_out1;
-	assign r_plus_s_addr			= addr_out2;
-	assign u_half_addr			= addr_out3;
-	assign v_half_addr			= addr_out3;
-	assign u_minus_v_addr		= addr_out2;
-	assign v_minus_u_addr		= addr_out2;
-	assign u_minus_v_half_addr	= addr_out4;
-	assign v_minus_u_half_addr	= addr_out4;
-	
-		
-		//
-		// Ready Flag
-		// (rdy is high exactly while the counter sits at zero)
-	assign rdy = (proc_cnt == proc_cnt_zero);
-	
-	
-		//
-		// Address Increment/Decrement Logic
-		// (each flag is high while proc_cnt is inside the matching inclusive [start, stop] window)
-	wire	inc_addr_in;
-	wire	dec_addr_in;
-	wire	inc_addr_out1;
-	wire	inc_addr_out2;
-	wire	dec_addr_out2;
-	wire	dec_addr_out3;
-	wire	dec_addr_out4;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_start	= 0 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_stop		= 1 * BUFFER_NUM_WORDS - 1;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_start	= 0 * BUFFER_NUM_WORDS + 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_stop	= 1 * BUFFER_NUM_WORDS + 1;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out2_start	= 0 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out2_stop	= 1 * BUFFER_NUM_WORDS + 1;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_start	= 1 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_stop	= 2 * BUFFER_NUM_WORDS + 1;	
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_start	= 1 * BUFFER_NUM_WORDS + 0;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_in_stop		= 2 * BUFFER_NUM_WORDS - 2;	
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_start	= 1 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_stop	= 2 * BUFFER_NUM_WORDS + 0;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out4_start	= 1 * BUFFER_NUM_WORDS + 4;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out4_stop	= 2 * BUFFER_NUM_WORDS + 3;	
-
-	assign inc_addr_in   = (proc_cnt >= cnt_inc_addr_in_start)   && (proc_cnt <= cnt_inc_addr_in_stop);
-	assign dec_addr_in   = (proc_cnt >= cnt_dec_addr_in_start)   && (proc_cnt <= cnt_dec_addr_in_stop);
-	assign inc_addr_out1 = (proc_cnt >= cnt_inc_addr_out1_start) && (proc_cnt <= cnt_inc_addr_out1_stop);
-	assign inc_addr_out2 = (proc_cnt >= cnt_inc_addr_out2_start) && (proc_cnt <= cnt_inc_addr_out2_stop);
-	assign dec_addr_out2 = (proc_cnt >= cnt_dec_addr_out2_start) && (proc_cnt <= cnt_dec_addr_out2_stop);
-	assign dec_addr_out3 = (proc_cnt >= cnt_dec_addr_out3_start) && (proc_cnt <= cnt_dec_addr_out3_stop);
-	assign dec_addr_out4 = (proc_cnt >= cnt_dec_addr_out4_start) && (proc_cnt <= cnt_dec_addr_out4_stop);
-	
-	
-	always @(posedge clk) begin
-		// idle: hold every address at its start value (zero for in/out1/out2, last word for out3/out4)
-		if (rdy) begin
-			//
-			addr_in 		<= addr_in_zero;
-			addr_out1	<= addr_out1_zero;
-			addr_out2	<= addr_out2_zero;
-			addr_out3	<= addr_out3_last;
-			addr_out4	<= addr_out4_last;
-			//
-		end else begin
-			// addr_in and addr_out2 count up, then down, and keep their value outside both windows (no final else)
-			if (inc_addr_in)				addr_in <= addr_in_next;
-			else if (dec_addr_in)		addr_in <= addr_in_prev;
-			// addr_out1 returns to zero as soon as its increment window closes
-			if (inc_addr_out1)			addr_out1 <= addr_out1_next;
-			else								addr_out1 <= addr_out1_zero;
-			//
-			if (inc_addr_out2)			addr_out2 <= addr_out2_next;
-			else if (dec_addr_out2)		addr_out2 <= addr_out2_prev;
-			// addr_out3 and addr_out4 only count down and rest at the last word otherwise
-			if (dec_addr_out3)			addr_out3 <= addr_out3_prev;
-			else								addr_out3 <= addr_out3_last;
-			//
-			if (dec_addr_out4)			addr_out4 <= addr_out4_prev;
-			else								addr_out4 <= addr_out4_last;
-			//
-		end
-		//
-	end
-	
-	
-		//
-		// Write Enable Logic
-		// (one strobe per output address group; windows are inclusive in proc_cnt)
-	wire	wren_out1;
-	wire	wren_out2;
-	wire	wren_out3;
-	wire	wren_out4;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_start	= 0 * BUFFER_NUM_WORDS + 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_stop	= 1 * BUFFER_NUM_WORDS + 1;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_start	= 0 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_stop	= 1 * BUFFER_NUM_WORDS + 2;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_start	= 1 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_stop	= 2 * BUFFER_NUM_WORDS + 0;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out4_start	= 1 * BUFFER_NUM_WORDS + 4;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out4_stop	= 2 * BUFFER_NUM_WORDS + 3;
-
-	assign wren_out1 = (proc_cnt >= cnt_wren_out1_start) && (proc_cnt <= cnt_wren_out1_stop);
-	assign wren_out2 = (proc_cnt >= cnt_wren_out2_start) && (proc_cnt <= cnt_wren_out2_stop);
-	assign wren_out3 = (proc_cnt >= cnt_wren_out3_start) && (proc_cnt <= cnt_wren_out3_stop);
-	assign wren_out4 = (proc_cnt >= cnt_wren_out4_start) && (proc_cnt <= cnt_wren_out4_stop);
-
-	assign r_dbl_wren				= wren_out1;
-	assign s_dbl_wren				= wren_out1;
-	assign r_plus_s_wren			= wren_out2;
-	assign u_half_wren			= wren_out3;
-	assign v_half_wren			= wren_out3;
-	assign u_minus_v_wren		= wren_out2;
-	assign v_minus_u_wren		= wren_out2;
-	assign u_minus_v_half_wren	= wren_out4;
-	assign v_minus_u_half_wren	= wren_out4;
-
-
-		//
-		// Adder (r + s)
-		// (adder32_wrapper is defined elsewhere; its c_out is fed back to c_in through the mask further down)
-	wire	[31: 0]	add32_r_plus_s_sum_out;
-	wire				add32_r_plus_s_carry_in;
-	wire				add32_r_plus_s_carry_out;
-	
-	adder32_wrapper add32_r_plus_s
-	(
-		.clk		(clk),
-		.a			(r_din),
-		.b			(s_din),
-		.s			(add32_r_plus_s_sum_out),
-		.c_in		(add32_r_plus_s_carry_in),
-		.c_out	(add32_r_plus_s_carry_out)
-	);
-	
-		//
-		// Subtractor (u - v)
-		// (subtractor32_wrapper is defined elsewhere; b_out is fed back to b_in through the same mask)
-	wire	[31: 0]	sub32_u_minus_v_difference_out;
-	wire				sub32_u_minus_v_borrow_in;
-	wire				sub32_u_minus_v_borrow_out;
-	
-	subtractor32_wrapper sub32_u_minus_v
-	(
-		.clk		(clk),
-		.a			(u_din),
-		.b			(v_din),
-		.d			(sub32_u_minus_v_difference_out),
-		.b_in		(sub32_u_minus_v_borrow_in),
-		.b_out	(sub32_u_minus_v_borrow_out)
-	);
-	
-		//
-		// Subtractor (v - u)
-		// (same wiring as above with the operands swapped)
-	wire	[31: 0]	sub32_v_minus_u_difference_out;
-	wire				sub32_v_minus_u_borrow_in;
-	wire				sub32_v_minus_u_borrow_out;
-	
-	subtractor32_wrapper sub32_v_minus_u
-	(
-		.clk		(clk),
-		.a			(v_din),
-		.b			(u_din),
-		.d			(sub32_v_minus_u_difference_out),
-		.b_in		(sub32_v_minus_u_borrow_in),
-		.b_out	(sub32_v_minus_u_borrow_out)
-	);
-	
-	
-		//
-		// Carry & Borrow Masking Logic
-		// (carry/borrow feedback is let through only while the registered mask is low)
-	reg	mask_carry_borrow;
-	
-	always @(posedge clk)
-		// mask goes low one clock after proc_cnt enters [cnt_wren_out1_start, cnt_wren_out1_stop) and is high otherwise
-		mask_carry_borrow <= ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ?
-			1'b0 : 1'b1;
-		
-	assign add32_r_plus_s_carry_in   = add32_r_plus_s_carry_out   & ~mask_carry_borrow;
-	assign sub32_u_minus_v_borrow_in = sub32_u_minus_v_borrow_out & ~mask_carry_borrow;
-	assign sub32_v_minus_u_borrow_in = sub32_v_minus_u_borrow_out & ~mask_carry_borrow;
-	
-	
-		//
-		// Carry Bits
-		// (each register captures the bit shifted out of the word currently on the input, or 0 outside its window)
-	reg	r_dbl_carry;
-	reg	s_dbl_carry;
-	reg	u_half_carry;
-	reg	v_half_carry;
-	reg	u_minus_v_half_carry;
-	reg	v_minus_u_half_carry;
-	
-	always @(posedge clk) begin
-		
-		r_dbl_carry					<= ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ?
-											r_din[31] : 1'b0;
-								
-		s_dbl_carry					<= ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ?
-											s_din[31] : 1'b0;
-								
-		u_half_carry				<= ((proc_cnt >= cnt_wren_out3_start) && (proc_cnt < cnt_wren_out3_stop)) ?
-											u_din[0] : 1'b0;
-		
-		v_half_carry				<= ((proc_cnt >= cnt_wren_out3_start) && (proc_cnt < cnt_wren_out3_stop)) ?
-											v_din[0] : 1'b0;
-									
-		u_minus_v_half_carry		<= ((proc_cnt >= cnt_wren_out4_start) && (proc_cnt < cnt_wren_out4_stop)) ?
-											u_minus_v_din[0] : 1'b0;
-		
-		v_minus_u_half_carry		<= ((proc_cnt >= cnt_wren_out4_start) && (proc_cnt < cnt_wren_out4_stop)) ?
-											v_minus_u_din[0] : 1'b0;
-
-	end
-	
-	
-		//
-		// Data Mapper
-		// (dbl = word shifted left by one with the saved carry in bit 0; half = word shifted right by one with the saved carry in bit 31)
-	assign r_dbl_dout				= {r_din[30:0], r_dbl_carry};
-	assign s_dbl_dout				= {s_din[30:0], s_dbl_carry};
-	assign r_plus_s_dout			= add32_r_plus_s_sum_out;
-	assign u_half_dout			= {u_half_carry, u_din[31:1]};
-	assign v_half_dout			= {v_half_carry, v_din[31:1]};
-	assign u_minus_v_dout		= sub32_u_minus_v_difference_out;
-	assign v_minus_u_dout		= sub32_v_minus_u_difference_out;
-	assign u_minus_v_half_dout	= {u_minus_v_half_carry, u_minus_v_din[31:1]};
-	assign v_minus_u_half_dout	= {v_minus_u_half_carry, v_minus_u_din[31:1]};
-	
-	
-		//
-		// Primary Counter Logic
-		// (asynchronous active-low reset; ena starts a run from idle, after which the counter advances every clock until it wraps to zero)
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-
-
-endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v b/rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v
deleted file mode 100644
index 0cd6ac5..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_invert_update.v
+++ /dev/null
@@ -1,257 +0,0 @@
-`timescale 1ns / 1ps
-// Invertor helper: copies pre-computed candidate words back into r, s, u, v; which candidate is used depends on the parity and compare flags.
-module modinv_helper_invert_update
-	(
-		clk, rst_n,
-		ena, rdy,
-		
-		u_gt_v, v_eq_1,
-		u_is_even, v_is_even,
-		
-		r_addr, r_wren, r_dout,
-		s_addr, s_wren, s_dout,
-		u_addr, u_wren, u_dout,
-		v_addr, v_wren, v_dout,
-		
-		r_dbl_addr,          r_dbl_din,
-		s_dbl_addr,          s_dbl_din,
-		r_plus_s_addr,       r_plus_s_din,
-		u_half_addr,         u_half_din,
-		v_half_addr,         v_half_din,
-		u_minus_v_half_addr, u_minus_v_half_din,
-		v_minus_u_half_addr, v_minus_u_half_din
-	);
-	
-	
-		//
-		// Parameters
-		// (buffer depth in 32-bit words, and the width of the addresses that index it)
-	parameter BUFFER_NUM_WORDS		= 9;
-	parameter BUFFER_ADDR_BITS		= 4;
-	
-	
-		//
-		// clog2
-		// (the include is expected to supply the clog2() function called below -- TODO confirm)
-`include "..\modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		// (one run takes PROC_NUM_CYCLES counter states; PROC_CNT_BITS is the counter width)
-	localparam PROC_NUM_CYCLES	= BUFFER_NUM_WORDS + 3;
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
-	
-	
-		//
-		// Ports
-		// (flags select the data source; *_din are candidate words read at addr_in; r/s/u/v are written at addr_out)
-	input		wire									clk;
-	input		wire									rst_n;
-	input		wire									ena;
-	output	wire									rdy;
-
-	input		wire									u_gt_v;
-	input		wire									v_eq_1;
-	input		wire									u_is_even;
-	input		wire									v_is_even;
-		
-	output	wire	[BUFFER_ADDR_BITS-1:0]	r_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	s_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
-		
-	output	wire									r_wren;
-	output	wire									s_wren;
-	output	wire									u_wren;
-	output	wire									v_wren;
-		
-	output	wire	[              32-1:0]	r_dout;
-	output	wire	[              32-1:0]	s_dout;
-	output	wire	[              32-1:0]	u_dout;
-	output	wire	[              32-1:0]	v_dout;
-		
-	output	wire	[BUFFER_ADDR_BITS-1:0]	r_dbl_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	s_dbl_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	r_plus_s_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_half_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_half_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_minus_v_half_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_minus_u_half_addr;
-		
-	input		wire	[              32-1:0]	r_dbl_din;
-	input		wire	[              32-1:0]	s_dbl_din;
-	input		wire	[              32-1:0]	r_plus_s_din;
-	input		wire	[              32-1:0]	u_half_din;
-	input		wire	[              32-1:0]	v_half_din;
-	input		wire	[              32-1:0]	u_minus_v_half_din;
-	input		wire	[              32-1:0]	v_minus_u_half_din;
-		
-	
-		//
-		// Counter
-		// (proc_cnt == 0 is the idle state; otherwise it counts up to proc_cnt_max and wraps to zero)
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses
-		// (addr_in is shared by all candidate buffers, addr_out by the four destination buffers)
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
-
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_max		= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_next	= (addr_in < addr_in_max) ?
-																		addr_in + 1'b1 : addr_in_zero;
-																		
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_out;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_max	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_next	= (addr_out < addr_out_max) ?
-																		addr_out + 1'b1 : addr_out_zero;
-																		
-	assign r_addr					= addr_out;
-	assign s_addr					= addr_out;
-	assign u_addr					= addr_out;
-	assign v_addr					= addr_out;
-	
-	assign r_dbl_addr				= addr_in;
-	assign s_dbl_addr				= addr_in;
-	assign r_plus_s_addr			= addr_in;
-	assign u_half_addr			= addr_in;
-	assign v_half_addr			= addr_in;
-	assign u_minus_v_half_addr	= addr_in;
-	assign v_minus_u_half_addr	= addr_in;
-	
-	
-		//
-		// Ready Flag
-		// (rdy is high exactly while the counter sits at zero)
-	assign rdy = (proc_cnt == proc_cnt_zero);
-	
-	
-		//
-		// Address Increment Logic
-		// (the output window is the input window delayed by one counter state)
-	wire	inc_addr_in;
-	wire	inc_addr_out;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_start	= 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_stop		= BUFFER_NUM_WORDS;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_stop	= BUFFER_NUM_WORDS + 1;
-
-	assign inc_addr_in  = (proc_cnt >= cnt_inc_addr_in_start)  && (proc_cnt <= cnt_inc_addr_in_stop);
-	assign inc_addr_out = (proc_cnt >= cnt_inc_addr_out_start) && (proc_cnt <= cnt_inc_addr_out_stop);
-	
-	always @(posedge clk) begin
-		// both addresses are forced back to zero whenever their increment window is closed
-		if (inc_addr_in)	addr_in <= addr_in_next;
-		else					addr_in <= addr_in_zero;
-		//
-		if (inc_addr_out)	addr_out <= addr_out_next;
-		else					addr_out <= addr_out_zero;
-		//
-	end
-	
-		//
-		// Write Enable Logic
-		// (a write needs the common window, the per-buffer allow bit, v_eq_1 low and the module not idle)
-	wire	wren_out;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_stop		= BUFFER_NUM_WORDS + 1;
-
-	assign wren_out = (proc_cnt >= cnt_wren_out_start) && (proc_cnt <= cnt_wren_out_stop);
-
-	reg	r_wren_allow;
-	reg	s_wren_allow;
-	reg	u_wren_allow;
-	reg	v_wren_allow;
-
-	assign r_wren = wren_out && r_wren_allow && !v_eq_1 && !rdy;
-	assign s_wren = wren_out && s_wren_allow && !v_eq_1 && !rdy;
-	assign u_wren = wren_out && u_wren_allow && !v_eq_1 && !rdy;
-	assign v_wren = wren_out && v_wren_allow && !v_eq_1 && !rdy;
-	
-	
-		//
-		// Data Logic
-		// (combinational choice of the write data and of which buffers may be written; the *_mux and *_allow regs are not flip-flops)
-	reg	[31: 0]	r_dout_mux;
-	reg	[31: 0]	s_dout_mux;
-	reg	[31: 0]	u_dout_mux;
-	reg	[31: 0]	v_dout_mux;
-	
-	assign r_dout = r_dout_mux;
-	assign s_dout = s_dout_mux;
-	assign u_dout = u_dout_mux;
-	assign v_dout = v_dout_mux;
-	
-	always @(*) begin
-		//
-		// r, s, u, v
-		// NOTE(review): the four cases resemble one iteration of Kaliski's almost Montgomery inverse -- confirm against the top-level module
-		if (u_is_even) begin
-			// u even: u <- u_half, s <- s_dbl; v and r are not written, so their data is driven as X
-			u_dout_mux		= u_half_din;
-			v_dout_mux		= {32{1'bX}};
-			r_dout_mux		= {32{1'bX}};
-			s_dout_mux		= s_dbl_din;
-			//
-			u_wren_allow	= 1'b1;
-			v_wren_allow	= 1'b0;
-			r_wren_allow	= 1'b0;
-			s_wren_allow	= 1'b1;
-			//
-		end else begin
-			//
-			if (v_is_even) begin
-				// u odd, v even: v <- v_half, r <- r_dbl; u and s are not written
-				u_dout_mux		= {32{1'bX}};
-				v_dout_mux		= v_half_din;
-				r_dout_mux		= r_dbl_din;
-				s_dout_mux		= {32{1'bX}};
-				//
-				u_wren_allow	= 1'b0;
-				v_wren_allow	= 1'b1;
-				r_wren_allow	= 1'b1;
-				s_wren_allow	= 1'b0;
-				//
-			end else begin
-				// both odd: u_gt_v picks u <- u_minus_v_half, otherwise v <- v_minus_u_half; r and s are both rewritten
-				u_dout_mux		=  u_gt_v ? u_minus_v_half_din : {32{1'bX}};
-				v_dout_mux		=  u_gt_v ? {32{1'bX}}         : v_minus_u_half_din;
-				r_dout_mux		=  u_gt_v ? r_plus_s_din       : r_dbl_din;
-				s_dout_mux		=  u_gt_v ? s_dbl_din          : r_plus_s_din;
-				//
-				u_wren_allow	=  u_gt_v;
-				v_wren_allow	= !u_gt_v;
-				r_wren_allow	=  1'b1;
-				s_wren_allow	=  1'b1;
-				//
-			end
-			//
-		end
-		//
-	end
-		
-		
-		//
-		// Primary Counter Logic
-		// (asynchronous active-low reset; ena starts a run from idle, after which the counter advances every clock until it wraps to zero)
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-
-endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v b/rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v
deleted file mode 100644
index fb858a6..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_reduce_precalc.v
+++ /dev/null
@@ -1,328 +0,0 @@
-`timescale 1ns / 1ps
-// Reduction helper: writes r = s + q (q masked to zero for the top buffer word), u = s shifted right by one, v = r shifted right by one, and latches s_is_odd / k_is_nul.
-module modinv_helper_reduce_precalc
-	(
-		clk, rst_n,
-		ena, rdy,
-		
-		k,
-		
-		s_is_odd, k_is_nul,
-		
-		r_addr, r_din, r_wren, r_dout,
-		s_addr, s_din,
-		u_addr,        u_wren, u_dout,
-		v_addr,        v_wren, v_dout,
-		q_addr, q_din
-	);
-	
-
-		//
-		// Parameters
-		// (q is addressed with OPERAND_ADDR_BITS, the r/s/u/v buffers with BUFFER_ADDR_BITS; k is K_NUM_BITS wide)
-	parameter OPERAND_NUM_WORDS	= 8;
-	parameter OPERAND_ADDR_BITS	= 3;
-	parameter BUFFER_NUM_WORDS		= 9;
-	parameter BUFFER_ADDR_BITS		= 4;
-	parameter K_NUM_BITS				= 10;
-	
-	
-		//
-		// clog2
-		// (the include is expected to supply the clog2() function called below -- TODO confirm)
-`include "..\modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		// (one run takes PROC_NUM_CYCLES counter states; PROC_CNT_BITS is the counter width)
-	localparam PROC_NUM_CYCLES	= 2 * BUFFER_NUM_WORDS + 4;
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);
-	
-	
-		//
-		// Ports
-		// (r is both written via r_dout/r_wren and read back via r_din, using the same r_addr)
-	input		wire									clk;
-	input		wire									rst_n;
-	input		wire									ena;
-	output	wire									rdy;
-
-	input		wire	[       K_NUM_BITS-1:0]	k;
-		
-	output	wire									s_is_odd;
-	output	wire									k_is_nul;
-
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	r_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	s_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	u_addr;
-	output	wire	[ BUFFER_ADDR_BITS-1:0]	v_addr;
-	output	wire	[OPERAND_ADDR_BITS-1:0]	q_addr;
-
-	input		wire	[              32-1:0]	r_din;
-	input		wire	[              32-1:0]	s_din;
-	input		wire	[              32-1:0]	q_din;
-	
-	output	wire									r_wren;
-	output	wire									u_wren;
-	output	wire									v_wren;
-	
-	output	wire	[              32-1:0]	r_dout;
-	output	wire	[              32-1:0]	u_dout;
-	output	wire	[              32-1:0]	v_dout;
-				
-
-		//
-		// Counter
-		// (proc_cnt == 0 is the idle state; otherwise it counts up to proc_cnt_max and wraps to zero)
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses
-		// (addr_in_buf -> s, addr_in_op -> q, addr_out1 -> r, addr_out2 -> u, addr_out3 -> v; see the assigns below)
-	reg	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf;
-	reg	[OPERAND_ADDR_BITS-1:0]	addr_in_op;
-	reg	[ BUFFER_ADDR_BITS-1:0]	addr_out1;
-	reg	[ BUFFER_ADDR_BITS-1:0]	addr_out2;
-	reg	[ BUFFER_ADDR_BITS-1:0]	addr_out3;
-
-	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_last	= BUFFER_NUM_WORDS - 1;
-	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_next	= (addr_in_buf < addr_in_buf_last) ?
-																		addr_in_buf + 1'b1 : addr_in_buf_zero;
-	wire	[ BUFFER_ADDR_BITS-1:0]	addr_in_buf_prev	= (addr_in_buf > addr_in_buf_zero) ?
-																		addr_in_buf - 1'b1 : addr_in_buf_zero;
-
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_in_op_last	= OPERAND_NUM_WORDS - 1;
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_in_op_zero	= {OPERAND_ADDR_BITS{1'b0}};
-	wire	[OPERAND_ADDR_BITS-1:0]	addr_in_op_next	= (addr_in_op < addr_in_op_last) ?
-																		addr_in_op + 1'b1 : addr_in_op_zero;
-																		
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_next	= (addr_out1 < addr_out1_last) ?
-																		addr_out1 + 1'b1 : addr_out1_zero;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out1_prev	= (addr_out1 > addr_out1_zero) ?
-																		addr_out1 - 1'b1 : addr_out1_zero;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out2_prev	= (addr_out2 > addr_out2_zero) ?
-																		addr_out2 - 1'b1 : addr_out2_last;
-
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_last	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out3_prev	= (addr_out3 > addr_out3_zero) ?
-																		addr_out3 - 1'b1 : addr_out3_last;
-
-	
-	assign s_addr = addr_in_buf;
-	assign q_addr = addr_in_op;
-	assign r_addr = addr_out1;
-	assign u_addr = addr_out2;
-	assign v_addr = addr_out3;
-	
-	
-		//
-		// Ready Flag
-		// (rdy is high exactly while the counter sits at zero)
-	assign rdy = (proc_cnt == proc_cnt_zero);
-		
-		
-		//
-		// Address Increment/Decrement Logic
-		// (each flag is high while proc_cnt is inside the matching inclusive [start, stop] window)
-	wire	inc_addr_buf_in;
-	wire	dec_addr_buf_in;
-	wire	inc_addr_op_in;
-	wire	inc_addr_out1;
-	wire	dec_addr_out1;
-	wire	dec_addr_out2;
-	wire	dec_addr_out3;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_calc_flags					= 0 * BUFFER_NUM_WORDS + 2;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_buf_in_start	= 0 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_buf_in_stop	= 1 * BUFFER_NUM_WORDS - 1;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_buf_in_start	= 1 * BUFFER_NUM_WORDS + 0;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_buf_in_stop	= 2 * BUFFER_NUM_WORDS - 2;	
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_op_in_start	= 0 * OPERAND_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_op_in_stop		= 1 * OPERAND_NUM_WORDS + 0;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_start		= 0 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out1_stop		= 1 * BUFFER_NUM_WORDS + 1;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out1_start		= 1 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out1_stop		= 2 * BUFFER_NUM_WORDS + 1;	
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_start		= 1 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out2_stop		= 2 * BUFFER_NUM_WORDS + 0;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_start		= 1 * BUFFER_NUM_WORDS + 4;
-	wire	[PROC_CNT_BITS-1:0]	cnt_dec_addr_out3_stop		= 2 * BUFFER_NUM_WORDS + 3;
-
-	assign inc_addr_buf_in = (proc_cnt >= cnt_inc_addr_buf_in_start) && (proc_cnt <= cnt_inc_addr_buf_in_stop);
-	assign dec_addr_buf_in = (proc_cnt >= cnt_dec_addr_buf_in_start) && (proc_cnt <= cnt_dec_addr_buf_in_stop);
-	assign inc_addr_op_in  = (proc_cnt >= cnt_inc_addr_op_in_start)  && (proc_cnt <= cnt_inc_addr_op_in_stop);
-	assign inc_addr_out1   = (proc_cnt >= cnt_inc_addr_out1_start) && (proc_cnt <= cnt_inc_addr_out1_stop);
-	assign dec_addr_out1   = (proc_cnt >= cnt_dec_addr_out1_start) && (proc_cnt <= cnt_dec_addr_out1_stop);
-	assign dec_addr_out2   = (proc_cnt >= cnt_dec_addr_out2_start) && (proc_cnt <= cnt_dec_addr_out2_stop);
-	assign dec_addr_out3   = (proc_cnt >= cnt_dec_addr_out3_start) && (proc_cnt <= cnt_dec_addr_out3_stop);
-
-	always @(posedge clk) begin
-		// idle: hold every address at its start value (zero for in_buf/in_op/out1, last word for out2/out3)
-		if (rdy) begin
-			//
-			addr_in_buf		<= addr_in_buf_zero;
-			addr_in_op		<= addr_in_op_zero;
-			addr_out1		<= addr_out1_zero;
-			addr_out2		<= addr_out2_last;
-			addr_out3		<= addr_out3_last;
-			//
-		end else begin
-			// addr_in_buf and addr_out1 count up, then down, and keep their value outside both windows (no final else)
-			if (inc_addr_buf_in)			addr_in_buf	<= addr_in_buf_next;
-			else if (dec_addr_buf_in)	addr_in_buf	<= addr_in_buf_prev;
-			// addr_in_op returns to zero as soon as its increment window closes
-			if (inc_addr_op_in)			addr_in_op	<= addr_in_op_next;
-			else								addr_in_op	<= addr_in_op_zero;
-			//
-			if (inc_addr_out1)			addr_out1	<= addr_out1_next;
-			else if (dec_addr_out1)		addr_out1	<= addr_out1_prev;
-			// addr_out2 and addr_out3 only count down and rest at the last word otherwise
-			if (dec_addr_out2)			addr_out2	<= addr_out2_prev;
-			else								addr_out2	<= addr_out2_last;
-			//
-			if (dec_addr_out3)			addr_out3	<= addr_out3_prev;
-			else								addr_out3	<= addr_out3_last;
-			//
-		end
-		//
-	end
-	
-	
-		//
-		// Write Enable Logic
-		// (out1 strobes r, out2 strobes u, out3 strobes v; windows are inclusive in proc_cnt)
-	wire	wren_out1;
-	wire	wren_out2;
-	wire	wren_out3;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_start	= 0 * BUFFER_NUM_WORDS + 3;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out1_stop	= 1 * BUFFER_NUM_WORDS + 2;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_start	= 1 * BUFFER_NUM_WORDS + 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out2_stop	= 2 * BUFFER_NUM_WORDS + 0;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_start	= 1 * BUFFER_NUM_WORDS + 4;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out3_stop	= 2 * BUFFER_NUM_WORDS + 3;
-
-	assign wren_out1 = (proc_cnt >= cnt_wren_out1_start) && (proc_cnt <= cnt_wren_out1_stop);
-	assign wren_out2 = (proc_cnt >= cnt_wren_out2_start) && (proc_cnt <= cnt_wren_out2_stop);
-	assign wren_out3 = (proc_cnt >= cnt_wren_out3_start) && (proc_cnt <= cnt_wren_out3_stop);
-
-	assign r_wren = wren_out1;
-	assign u_wren = wren_out2;
-	assign v_wren = wren_out3;
-	
-		//
-		// Adder (s + q)
-		// NOTE(review): the instance below is still named add32_r_plus_s although it adds s and the masked q
-	wire	[31: 0]	q_din_masked;
-	wire	[31: 0]	add32_s_plus_q_sum_out;
-	wire				add32_s_plus_q_carry_in;
-	wire				add32_s_plus_q_carry_out;
-	
-	adder32_wrapper add32_r_plus_s
-	(
-		.clk		(clk),
-		.a			(s_din),
-		.b			(q_din_masked),
-		.s			(add32_s_plus_q_sum_out),
-		.c_in		(add32_s_plus_q_carry_in),
-		.c_out	(add32_s_plus_q_carry_out)
-	);
-		
-		
-		//
-		// Carry Masking Logic
-		// (combinational: carry feedback passes only while proc_cnt is in [cnt_wren_out1_start, cnt_wren_out1_stop))
-	wire	mask_carry;
-
-	assign mask_carry = ((proc_cnt >= cnt_wren_out1_start) && (proc_cnt < cnt_wren_out1_stop)) ? 1'b0 : 1'b1;
-
-
-		//
-		// Addend Masking Logic
-		// (q_din is replaced by zero one clock after addr_in_buf pointed at the last buffer word)
-	reg	q_din_mask;
-	
-	always @(posedge clk)
-		q_din_mask <= (addr_in_buf == addr_in_buf_last) ? 1'b1 : 1'b0;
-	
-	assign q_din_masked = q_din_mask ? {32{1'b0}} : q_din;
-	
-	assign add32_s_plus_q_carry_in = add32_s_plus_q_carry_out & ~mask_carry;
-
-
-		//
-		// Carry Bits
-		// (bit 0 of the word currently on s_din / r_din is saved for the next word's bit 31; 0 outside the window)
-	reg	s_half_carry;
-	reg	s_plus_q_half_carry;
-	
-	always @(posedge clk) begin
-		//					
-		s_half_carry				<= ((proc_cnt >= cnt_wren_out2_start) && (proc_cnt < cnt_wren_out2_stop)) ?
-											s_din[0] : 1'b0;
-		//
-		s_plus_q_half_carry		<= ((proc_cnt >= cnt_wren_out3_start) && (proc_cnt < cnt_wren_out3_stop)) ?
-											r_din[0] : 1'b0;
-		//
-	end
-
-		//
-		// Data Mapper
-		// (r gets the adder sum; u and v get s_din and r_din shifted right by one with the saved carry in bit 31)
-	assign r_dout = add32_s_plus_q_sum_out;
-	assign u_dout = {s_half_carry,        s_din[31:1]};
-	assign v_dout = {s_plus_q_half_carry, r_din[31:1]};
-	
-	
-		//
-		// Primary Counter Logic
-		// (asynchronous active-low reset; ena starts a run from idle, after which the counter advances every clock until it wraps to zero)
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-		
-		
-		//
-		// Output Flags
-		// (both flags are sampled when proc_cnt equals cnt_calc_flags and hold their value otherwise; they have no reset)
-	reg	s_is_odd_reg;
-	reg	k_is_nul_reg;
-	
-	assign s_is_odd = s_is_odd_reg;
-	assign k_is_nul = k_is_nul_reg;
-
-	always @(posedge clk)
-		// s_is_odd takes bit 0 of the s word on the input at that moment; k_is_nul is set when k is all zeros
-		if (proc_cnt == cnt_calc_flags) begin
-			s_is_odd_reg <= s_din[0];
-			k_is_nul_reg <= (k == {K_NUM_BITS{1'b0}}) ? 1'b1 : 1'b0;
-		end
-
-
-endmodule
diff --git a/rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v b/rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v
deleted file mode 100644
index ea5b854..0000000
--- a/rtl/modular/modular_invertor/helper/modinv_helper_reduce_update.v
+++ /dev/null
@@ -1,153 +0,0 @@
-`timescale 1ns / 1ps
-
-module modinv_helper_reduce_update
-	(
-		clk, rst_n,
-		ena, rdy,
-		
-		s_is_odd, k_is_nul,
-		
-		s_addr, s_wren, s_dout,
-		u_addr,                 u_din,
-		v_addr,                 v_din
-	);
-	// Streams BUFFER_NUM_WORDS words into the S buffer: s_dout = s_is_odd ? v_din : u_din.
-	// One pass starts when ena is seen while rdy is high; s_wren is suppressed when k_is_nul is set.
-		//
-		// Parameters
-		//
-	parameter BUFFER_NUM_WORDS		= 9;	// number of 32-bit words transferred per pass
-	parameter BUFFER_ADDR_BITS		= 4;	// width of the s/u/v address buses
-	
-	
-		//
-		// clog2 (forward-slash path so the include also resolves on non-Windows hosts)
-		//
-`include "../modinv_clog2.v"
-	
-	
-		//
-		// Constants
-		//
-	localparam PROC_NUM_CYCLES	= BUFFER_NUM_WORDS + 3;			// length of one pass, in clock cycles
-	localparam PROC_CNT_BITS	= clog2(PROC_NUM_CYCLES);		// width of the cycle counter
-	
-	
-		//
-		// Ports
-		//
-	input		wire									clk;
-	input		wire									rst_n;	// asynchronous, active-low reset of proc_cnt
-	input		wire									ena;		// start request, honoured only while rdy is high
-	output	wire									rdy;		// high while proc_cnt is zero (idle)
-
-	input		wire									s_is_odd;	// selects v_din (1) or u_din (0) as the S data
-	input		wire									k_is_nul;	// when set, blocks all writes to S
-		
-	output	wire	[BUFFER_ADDR_BITS-1:0]	s_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	u_addr;
-	output	wire	[BUFFER_ADDR_BITS-1:0]	v_addr;
-		
-	output	wire									s_wren;
-		
-	output	wire	[              32-1:0]	s_dout;
-
-	input		wire	[              32-1:0]	u_din;
-	input		wire	[              32-1:0]	v_din;
-		
-	
-		//
-		// Counter (0 = idle, then 1 .. PROC_NUM_CYCLES-1, then back to 0)
-		//
-	reg	[PROC_CNT_BITS-1:0]	proc_cnt;
-
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_max	= PROC_NUM_CYCLES - 1;
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_zero	= {PROC_CNT_BITS{1'b0}};
-	wire	[PROC_CNT_BITS-1:0]	proc_cnt_next	= (proc_cnt < proc_cnt_max) ?
-																	proc_cnt + 1'b1 : proc_cnt_zero;
-	
-		//
-		// Addresses (addr_in drives both read addresses, addr_out drives the S write address)
-		//
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_in;
-
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_max		= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_in_next	= (addr_in < addr_in_max) ?
-																		addr_in + 1'b1 : addr_in_zero;
-																		
-	reg	[BUFFER_ADDR_BITS-1:0]	addr_out;
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_max	= BUFFER_NUM_WORDS - 1;
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_zero	= {BUFFER_ADDR_BITS{1'b0}};
-	wire	[BUFFER_ADDR_BITS-1:0]	addr_out_next	= (addr_out < addr_out_max) ?
-																		addr_out + 1'b1 : addr_out_zero;
-																		
-	assign s_addr					= addr_out;
-	assign u_addr					= addr_in;
-	assign v_addr					= addr_in;
-	
-	
-		//
-		// Ready Flag
-		//
-	assign rdy = (proc_cnt == proc_cnt_zero);
-	
-	
-		//
-		// Address Increment Logic
-		// (the output window is the input window delayed by one cycle; presumably this
-		//  matches a one-cycle read latency of the source buffers -- confirm against the BRAM model)
-	wire	inc_addr_in;
-	wire	inc_addr_out;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_start	= 1;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_in_stop		= BUFFER_NUM_WORDS;
-	
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_inc_addr_out_stop	= BUFFER_NUM_WORDS + 1;
-
-	assign inc_addr_in  = (proc_cnt >= cnt_inc_addr_in_start)  && (proc_cnt <= cnt_inc_addr_in_stop);
-	assign inc_addr_out = (proc_cnt >= cnt_inc_addr_out_start) && (proc_cnt <= cnt_inc_addr_out_stop);
-	
-	always @(posedge clk) begin
-		// outside its window each address is forced back to zero, so no reset term is needed
-		if (inc_addr_in)	addr_in <= addr_in_next;
-		else					addr_in <= addr_in_zero;
-		//
-		if (inc_addr_out)	addr_out <= addr_out_next;
-		else					addr_out <= addr_out_zero;
-		//
-	end
-	
-		//
-		// Write Enable Logic (same window as the output address increment)
-		//
-	wire	wren_out;
-
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_start	= 2;
-	wire	[PROC_CNT_BITS-1:0]	cnt_wren_out_stop		= BUFFER_NUM_WORDS + 1;
-
-	assign wren_out = (proc_cnt >= cnt_wren_out_start) && (proc_cnt <= cnt_wren_out_stop);
-
-	assign s_wren = wren_out && !k_is_nul; //s_wren_allow && !v_eq_1 && !rdy;
-	
-	
-		//
-		// Data Logic
-		//
-	assign s_dout = s_is_odd ? v_din : u_din;
-
-		
-		//
-		// Primary Counter Logic (free-runs once started; ena is ignored while busy)
-		//
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) proc_cnt <= proc_cnt_zero;
-		else begin
-			if (!rdy)		proc_cnt <= proc_cnt_next;
-			else if (ena)	proc_cnt <= proc_cnt_next;
-		end
-
-
-endmodule
diff --git a/rtl/modular/modular_invertor/modinv_clog2.v b/rtl/modular/modular_invertor/modinv_clog2.v
deleted file mode 100644
index 2f7b64d..0000000
--- a/rtl/modular/modular_invertor/modinv_clog2.v
+++ /dev/null
@@ -1,10 +0,0 @@
-function	integer clog2;	// ceil(log2(value)); yields 0 for value <= 1
-	input	integer value;
-			integer bits;
-	begin
-		bits = 0;
-		for (value = value - 1; value > 0; value = value >> 1)
-			bits = bits + 1;	// one more bit for every halving of (value - 1)
-		clog2 = bits;
-	end
-endfunction
diff --git a/rtl/modular/modular_invertor/modular_invertor.v b/rtl/modular/modular_invertor/modular_invertor.v
deleted file mode 100644
index e9f2460..0000000
--- a/rtl/modular/modular_invertor/modular_invertor.v
+++ /dev/null
@@ -1,981 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// modular_invertor.v
-// -----------------------------------------------------------------------------
-// Modular invertor.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-module modular_invertor
-	(
-		clk, rst_n,
-		ena, rdy,
-		a_addr, q_addr, a1_addr, a1_wren,
-		a_din, q_din, a1_dout
-	);
-	// Top-level sequencer: runs the init, invert (precalc/compare/update), reduce (precalc/update)
-	// and copy helpers one after another and multiplexes their access to the r/s/u/v buffers.
-		//
-		// Parameters
-		//
-	parameter MAX_OPERAND_WIDTH = 256;
-	
-	
-		//
-		// clog2
-		//
-`include "modinv_clog2.v"
-
-
-		//
-		// More Parameters
-		//
-	localparam OPERAND_NUM_WORDS	= MAX_OPERAND_WIDTH / 32;
-	localparam OPERAND_ADDR_BITS	= clog2(OPERAND_NUM_WORDS);
-	
-	localparam BUFFER_NUM_WORDS	= OPERAND_NUM_WORDS + 1;	// internal buffers are one word longer than the operands
-	localparam BUFFER_ADDR_BITS	= clog2(BUFFER_NUM_WORDS);
-	
-	localparam LOOP_NUM_ROUNDS		= 2 * MAX_OPERAND_WIDTH;	// rounds of the invert loop and of the reduce loop
-	localparam ROUND_COUNTER_BITS	= clog2(LOOP_NUM_ROUNDS);
-	
-	localparam K_NUM_BITS			= clog2(LOOP_NUM_ROUNDS + 1);	// k can count from 0 to LOOP_NUM_ROUNDS inclusive
-	
-
-		//
-		// Ports
-		//
-	input		wire									clk;
-	input		wire									rst_n;	// asynchronous, active-low
-	
-	input		wire									ena;		// start request, sampled in the idle state
-	output	wire									rdy;		// low from the accepted ena until the DONE state
-	
-	output	wire	[OPERAND_ADDR_BITS-1:0]	a_addr;
-	output	reg	[OPERAND_ADDR_BITS-1:0]	q_addr;
-	output	wire	[OPERAND_ADDR_BITS-1:0]	a1_addr;
-	output	wire									a1_wren;
-	
-	input		wire	[32-1:0]						a_din;
-	input		wire	[32-1:0]						q_din;
-	output	wire	[32-1:0]						a1_dout;
-
-
-		//
-		// "Redundant" Power of 2 (K)
-		//
-	reg	[K_NUM_BITS-1:0]	k;
-
-	
-		//
-		// Buffers
-		//
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_r_wr_addr;
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_r_rd_addr;
-	reg									buf_r_wr_en;
-	reg	[              32-1:0]	buf_r_wr_din;
-	wire	[              32-1:0]	buf_r_wr_dout;
-	wire	[              32-1:0]	buf_r_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_r
-	(	.clk(clk),	
-		.a_addr(buf_r_wr_addr), .a_out(buf_r_wr_dout), .a_wr(buf_r_wr_en), .a_in(buf_r_wr_din),
-		.b_addr(buf_r_rd_addr), .b_out(buf_r_rd_dout)
-	);
-	
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_s_wr_addr;
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_s_rd_addr;
-	reg									buf_s_wr_en;
-	reg	[              32-1:0]	buf_s_wr_din;
-	wire	[              32-1:0]	buf_s_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_s
-	(	.clk(clk),	
-		.a_addr(buf_s_wr_addr), .a_out(),              .a_wr(buf_s_wr_en), .a_in(buf_s_wr_din),
-		.b_addr(buf_s_rd_addr), .b_out(buf_s_rd_dout)
-	);
-	
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_u_wr_addr;
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_u_rd_addr;
-	reg									buf_u_wr_en;
-	reg	[              32-1:0]	buf_u_wr_din;
-	wire	[              32-1:0]	buf_u_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_u
-	(	.clk(clk),	
-		.a_addr(buf_u_wr_addr), .a_out(),              .a_wr(buf_u_wr_en), .a_in(buf_u_wr_din),
-		.b_addr(buf_u_rd_addr), .b_out(buf_u_rd_dout)
-	);
-	
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_v_wr_addr;
-	reg	[BUFFER_ADDR_BITS-1:0]	buf_v_rd_addr;
-	reg									buf_v_wr_en;
-	reg	[              32-1:0]	buf_v_wr_din;
-	wire	[              32-1:0]	buf_v_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_v
-	(	.clk(clk),	
-		.a_addr(buf_v_wr_addr), .a_out(),              .a_wr(buf_v_wr_en), .a_in(buf_v_wr_din),
-		.b_addr(buf_v_rd_addr), .b_out(buf_v_rd_dout)
-	);	
-
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_dbl_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_dbl_rd_addr;
-	wire									buf_r_dbl_wr_en;
-	wire	[              32-1:0]	buf_r_dbl_wr_din;
-	wire	[              32-1:0]	buf_r_dbl_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_r_dbl
-	(	.clk(clk),	
-		.a_addr(buf_r_dbl_wr_addr), .a_out(),                  .a_wr(buf_r_dbl_wr_en), .a_in(buf_r_dbl_wr_din),
-		.b_addr(buf_r_dbl_rd_addr), .b_out(buf_r_dbl_rd_dout)
-	);
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_s_dbl_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_s_dbl_rd_addr;
-	wire									buf_s_dbl_wr_en;
-	wire	[              32-1:0]	buf_s_dbl_wr_din;
-	wire	[              32-1:0]	buf_s_dbl_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_s_dbl
-	(	.clk(clk),	
-		.a_addr(buf_s_dbl_wr_addr), .a_out(),                  .a_wr(buf_s_dbl_wr_en), .a_in(buf_s_dbl_wr_din),
-		.b_addr(buf_s_dbl_rd_addr), .b_out(buf_s_dbl_rd_dout)
-	);
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_plus_s_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_r_plus_s_rd_addr;
-	wire									buf_r_plus_s_wr_en;
-	wire	[              32-1:0]	buf_r_plus_s_wr_din;
-	wire	[              32-1:0]	buf_r_plus_s_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_r_plus_s
-	(	.clk(clk),	
-		.a_addr(buf_r_plus_s_wr_addr), .a_out(),                     .a_wr(buf_r_plus_s_wr_en), .a_in(buf_r_plus_s_wr_din),
-		.b_addr(buf_r_plus_s_rd_addr), .b_out(buf_r_plus_s_rd_dout)
-	);
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_rd_addr;
-	wire									buf_u_minus_v_wr_en;
-	wire	[              32-1:0]	buf_u_minus_v_wr_din;
-	wire	[              32-1:0]	buf_u_minus_v_wr_dout;
-
-	assign buf_u_minus_v_rd_addr = ~buf_u_minus_v_wr_addr;	// port B output is left unconnected below
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_u_minus_v
-	(	.clk(clk),	
-		.a_addr(buf_u_minus_v_wr_addr), .a_out(buf_u_minus_v_wr_dout), .a_wr(buf_u_minus_v_wr_en), .a_in(buf_u_minus_v_wr_din),
-		.b_addr(buf_u_minus_v_rd_addr), .b_out()
-	);
-
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_rd_addr;
-	wire									buf_v_minus_u_wr_en;
-	wire	[              32-1:0]	buf_v_minus_u_wr_din;
-	wire	[              32-1:0]	buf_v_minus_u_wr_dout;
-	
-	assign buf_v_minus_u_rd_addr = ~buf_v_minus_u_wr_addr;	// port B output is left unconnected below
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_v_minus_u
-	(	.clk(clk),	
-		.a_addr(buf_v_minus_u_wr_addr), .a_out(buf_v_minus_u_wr_dout), .a_wr(buf_v_minus_u_wr_en), .a_in(buf_v_minus_u_wr_din),
-		.b_addr(buf_v_minus_u_rd_addr), .b_out()
-	);
-
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_half_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_half_rd_addr;
-	wire									buf_u_half_wr_en;
-	wire	[              32-1:0]	buf_u_half_wr_din;
-	wire	[              32-1:0]	buf_u_half_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_u_half
-	(	.clk(clk),	
-		.a_addr(buf_u_half_wr_addr), .a_out(),                   .a_wr(buf_u_half_wr_en), .a_in(buf_u_half_wr_din),
-		.b_addr(buf_u_half_rd_addr), .b_out(buf_u_half_rd_dout)
-	);
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_half_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_half_rd_addr;
-	wire									buf_v_half_wr_en;
-	wire	[              32-1:0]	buf_v_half_wr_din;
-	wire	[              32-1:0]	buf_v_half_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_v_half
-	(	.clk(clk),	
-		.a_addr(buf_v_half_wr_addr), .a_out(),                   .a_wr(buf_v_half_wr_en), .a_in(buf_v_half_wr_din),
-		.b_addr(buf_v_half_rd_addr), .b_out(buf_v_half_rd_dout)
-	);
-	
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_half_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_u_minus_v_half_rd_addr;
-	wire									buf_u_minus_v_half_wr_en;
-	wire	[              32-1:0]	buf_u_minus_v_half_wr_din;
-	wire	[              32-1:0]	buf_u_minus_v_half_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_u_minus_v_half
-	(	.clk(clk),	
-		.a_addr(buf_u_minus_v_half_wr_addr), .a_out(),                           .a_wr(buf_u_minus_v_half_wr_en), .a_in(buf_u_minus_v_half_wr_din),
-		.b_addr(buf_u_minus_v_half_rd_addr), .b_out(buf_u_minus_v_half_rd_dout)
-	);
-
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_half_wr_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	buf_v_minus_u_half_rd_addr;
-	wire									buf_v_minus_u_half_wr_en;
-	wire	[              32-1:0]	buf_v_minus_u_half_wr_din;
-	wire	[              32-1:0]	buf_v_minus_u_half_rd_dout;
-
-	bram_1rw_1ro_readfirst #
-	(	.MEM_WIDTH(32), .MEM_ADDR_BITS(BUFFER_ADDR_BITS)
-	)
-	buf_v_minus_u_half
-	(	.clk(clk),	
-		.a_addr(buf_v_minus_u_half_wr_addr), .a_out(),                           .a_wr(buf_v_minus_u_half_wr_en), .a_in(buf_v_minus_u_half_wr_din),
-		.b_addr(buf_v_minus_u_half_rd_addr), .b_out(buf_v_minus_u_half_rd_dout)
-	);
-
-
-		//
-		// Helper Modules
-		//
-	wire helper_init_ena;
-	wire helper_invert_precalc_ena;
-	wire helper_invert_compare_ena;
-	wire helper_invert_update_ena;
-	wire helper_reduce_precalc_ena;
-	wire helper_reduce_update_ena;
-	wire helper_copy_ena;
-	
-	wire helper_init_rdy;
-	wire helper_invert_precalc_rdy;
-	wire helper_invert_compare_rdy;
-	wire helper_invert_update_rdy;
-	wire helper_reduce_precalc_rdy;
-	wire helper_reduce_update_rdy;
-	wire helper_copy_rdy;
-	
-		// "done" = helper reports ready and is not being started in this very cycle
-	wire helper_init_done				= helper_init_rdy           && !helper_init_ena;
-	wire helper_invert_precalc_done	= helper_invert_precalc_rdy && !helper_invert_precalc_ena;
-	wire helper_invert_compare_done	= helper_invert_compare_rdy && !helper_invert_compare_ena;
-	wire helper_invert_update_done	= helper_invert_update_rdy  && !helper_invert_update_ena;
-	wire helper_reduce_precalc_done	= helper_reduce_precalc_rdy && !helper_reduce_precalc_ena;
-	wire helper_reduce_update_done	= helper_reduce_update_rdy  && !helper_reduce_update_ena;
-	wire helper_copy_done				= helper_copy_rdy           && !helper_copy_ena;
-	
-	
-		//
-		// Helper Module - Initialization
-		//
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_r_addr;
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_s_addr;
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_u_addr;
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_init_v_addr;
-	wire	[OPERAND_ADDR_BITS-1:0]	helper_init_q_addr;
-	
-	wire									helper_init_r_wren;
-	wire									helper_init_s_wren;
-	wire									helper_init_u_wren;
-	wire									helper_init_v_wren;
-	
-	wire	[              32-1:0]	helper_init_r_data;
-	wire	[              32-1:0]	helper_init_s_data;
-	wire	[              32-1:0]	helper_init_u_data;
-	wire	[              32-1:0]	helper_init_v_data;
-	
-	modinv_helper_init #
-	(
-		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
-		.OPERAND_ADDR_BITS	(OPERAND_ADDR_BITS),
-	
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
-	)
-	helper_init
-	(
-		.clk 		(clk),
-		.rst_n	(rst_n),
-		
-		.ena 		(helper_init_ena),
-		.rdy 		(helper_init_rdy),
-		
-		.a_addr	(a_addr),
-		.q_addr	(helper_init_q_addr),
-		
-		.r_addr	(helper_init_r_addr),
-		.s_addr	(helper_init_s_addr),
-		.u_addr	(helper_init_u_addr),
-		.v_addr	(helper_init_v_addr),
-		
-		.q_din	(q_din),
-		.a_din	(a_din),
-		
-		.r_dout	(helper_init_r_data),
-		.s_dout	(helper_init_s_data),
-		.u_dout	(helper_init_u_data),
-		.v_dout	(helper_init_v_data),
-		
-		.r_wren	(helper_init_r_wren),
-		.s_wren	(helper_init_s_wren),
-		.u_wren	(helper_init_u_wren),
-		.v_wren	(helper_init_v_wren)
-	);
-	
-	
-		//
-		// Helper Module - Inversion Pre-Calculation
-		//
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_r_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_s_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_u_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_precalc_v_addr;
-	
-	modinv_helper_invert_precalc #
-	(
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
-	)
-	helper_invert_precalc
-	(
-		.clk 							(clk),
-		.rst_n						(rst_n),
-		
-		.ena 							(helper_invert_precalc_ena),
-		.rdy 							(helper_invert_precalc_rdy),
-		
-		.r_addr						(helper_invert_precalc_r_addr),
-		.s_addr						(helper_invert_precalc_s_addr),
-		.u_addr						(helper_invert_precalc_u_addr),
-		.v_addr						(helper_invert_precalc_v_addr),
-		
-		.r_din						(buf_r_rd_dout),
-		.s_din						(buf_s_rd_dout),
-		.u_din						(buf_u_rd_dout),
-		.v_din						(buf_v_rd_dout),
-		
-		.r_dbl_addr					(buf_r_dbl_wr_addr),
-		.s_dbl_addr					(buf_s_dbl_wr_addr),
-		.r_plus_s_addr				(buf_r_plus_s_wr_addr),
-		
-		.u_half_addr				(buf_u_half_wr_addr),
-		.v_half_addr				(buf_v_half_wr_addr),
-		.u_minus_v_addr			(buf_u_minus_v_wr_addr),
-		.v_minus_u_addr			(buf_v_minus_u_wr_addr),
-		.u_minus_v_half_addr		(buf_u_minus_v_half_wr_addr),
-		.v_minus_u_half_addr		(buf_v_minus_u_half_wr_addr),
-		
-		.r_dbl_dout					(buf_r_dbl_wr_din),
-		.s_dbl_dout					(buf_s_dbl_wr_din),
-		.r_plus_s_dout				(buf_r_plus_s_wr_din),
-		
-		.u_half_dout				(buf_u_half_wr_din),
-		.v_half_dout				(buf_v_half_wr_din),
-		.u_minus_v_dout			(buf_u_minus_v_wr_din),
-		.v_minus_u_dout			(buf_v_minus_u_wr_din),
-		.u_minus_v_half_dout		(buf_u_minus_v_half_wr_din),
-		.v_minus_u_half_dout		(buf_v_minus_u_half_wr_din),
-		
-		.r_dbl_wren					(buf_r_dbl_wr_en),
-		.s_dbl_wren					(buf_s_dbl_wr_en),
-		.r_plus_s_wren				(buf_r_plus_s_wr_en),
-		
-		.u_half_wren				(buf_u_half_wr_en),
-		.v_half_wren				(buf_v_half_wr_en),
-		.u_minus_v_wren			(buf_u_minus_v_wr_en),
-		.v_minus_u_wren			(buf_v_minus_u_wr_en),
-		.u_minus_v_half_wren		(buf_u_minus_v_half_wr_en),
-		.v_minus_u_half_wren		(buf_v_minus_u_half_wr_en),
-		
-		.u_minus_v_din				(buf_u_minus_v_wr_dout),
-		.v_minus_u_din				(buf_v_minus_u_wr_dout)
-	);
-	
-	
-		//
-		// Helper Module - Inversion Comparison
-		//
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_compare_u_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_compare_v_addr;
-
-	wire	flag_invert_u_gt_v;
-	wire	flag_invert_v_eq_1;
-	wire	flag_invert_u_is_even;
-	wire	flag_invert_v_is_even;
-
-	modinv_helper_invert_compare #
-	(
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
-	)
-	helper_invert_compare
-	(
-		.clk 			(clk),
-		.rst_n		(rst_n),
-		
-		.ena 			(helper_invert_compare_ena),
-		.rdy 			(helper_invert_compare_rdy),
-				
-		.u_addr		(helper_invert_compare_u_addr),
-		.v_addr		(helper_invert_compare_v_addr),
-		
-		.u_din		(buf_u_rd_dout),
-		.v_din		(buf_v_rd_dout),
-		
-		.u_gt_v		(flag_invert_u_gt_v),
-		.v_eq_1		(flag_invert_v_eq_1),
-		.u_is_even	(flag_invert_u_is_even),
-		.v_is_even	(flag_invert_v_is_even)
-	);
-	
-		
-		//
-		// Helper Module - Inversion Update
-		//
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_r_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_s_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_u_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_invert_update_v_addr;
-	
-	wire									helper_invert_update_r_wren;
-	wire									helper_invert_update_s_wren;
-	wire									helper_invert_update_u_wren;
-	wire									helper_invert_update_v_wren;
-	
-	wire	[              32-1:0]	helper_invert_update_r_data;
-	wire	[              32-1:0]	helper_invert_update_s_data;
-	wire	[              32-1:0]	helper_invert_update_u_data;
-	wire	[              32-1:0]	helper_invert_update_v_data;
-	
-	modinv_helper_invert_update #
-	(
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
-	)
-	helper_invert_update
-	(
-		.clk 							(clk),
-		.rst_n						(rst_n),
-		
-		.ena 							(helper_invert_update_ena),
-		.rdy 							(helper_invert_update_rdy),
-		
-		.u_gt_v						(flag_invert_u_gt_v),
-		.v_eq_1						(flag_invert_v_eq_1),
-		.u_is_even					(flag_invert_u_is_even),
-		.v_is_even					(flag_invert_v_is_even),
-		
-		.r_addr						(helper_invert_update_r_addr),
-		.s_addr						(helper_invert_update_s_addr),
-		.u_addr						(helper_invert_update_u_addr),
-		.v_addr						(helper_invert_update_v_addr),
-		
-		.r_wren						(helper_invert_update_r_wren),
-		.s_wren						(helper_invert_update_s_wren),
-		.u_wren						(helper_invert_update_u_wren),
-		.v_wren						(helper_invert_update_v_wren),
-		
-		.r_dout						(helper_invert_update_r_data),
-		.s_dout						(helper_invert_update_s_data),
-		.u_dout						(helper_invert_update_u_data),
-		.v_dout						(helper_invert_update_v_data),
-		
-		.r_dbl_addr					(buf_r_dbl_rd_addr),
-		.s_dbl_addr					(buf_s_dbl_rd_addr),
-		.r_plus_s_addr				(buf_r_plus_s_rd_addr),
-		.u_half_addr				(buf_u_half_rd_addr),
-		.v_half_addr				(buf_v_half_rd_addr),
-		.u_minus_v_half_addr		(buf_u_minus_v_half_rd_addr),
-		.v_minus_u_half_addr		(buf_v_minus_u_half_rd_addr),
-		
-		.r_dbl_din					(buf_r_dbl_rd_dout),
-		.s_dbl_din					(buf_s_dbl_rd_dout),
-		.r_plus_s_din				(buf_r_plus_s_rd_dout),
-		.u_half_din					(buf_u_half_rd_dout),
-		.v_half_din					(buf_v_half_rd_dout),
-		.u_minus_v_half_din		(buf_u_minus_v_half_rd_dout),
-		.v_minus_u_half_din		(buf_v_minus_u_half_rd_dout)
-	);
-	
-	
-		//
-		// Helper Module - Reduction Pre-Calculation
-		//
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_r_addr;
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_s_addr;
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_u_addr;
-	wire	[ BUFFER_ADDR_BITS-1:0]	helper_reduce_precalc_v_addr;
-	wire	[OPERAND_ADDR_BITS-1:0]	helper_reduce_precalc_q_addr;
-	
-	wire									helper_reduce_precalc_r_wren;
-	wire									helper_reduce_precalc_u_wren;
-	wire									helper_reduce_precalc_v_wren;
-	
-	wire	[              32-1:0]	helper_reduce_precalc_r_data;
-	wire	[              32-1:0]	helper_reduce_precalc_u_data;
-	wire	[              32-1:0]	helper_reduce_precalc_v_data;
-
-	wire	flag_reduce_s_is_odd;
-	wire	flag_reduce_k_is_nul;	// explicitly declared: driven by reduce_precalc, consumed by reduce_update
-	
-	modinv_helper_reduce_precalc #
-	(
-		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
-		.OPERAND_ADDR_BITS	(OPERAND_ADDR_BITS),
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS),
-		.K_NUM_BITS				(K_NUM_BITS)
-	)
-	helper_reduce_precalc
-	(
-		.clk 			(clk),
-		.rst_n		(rst_n),
-		
-		.ena 			(helper_reduce_precalc_ena),
-		.rdy 			(helper_reduce_precalc_rdy),
-		
-		.r_addr		(helper_reduce_precalc_r_addr),
-		.s_addr		(helper_reduce_precalc_s_addr),
-		.u_addr		(helper_reduce_precalc_u_addr),
-		.v_addr		(helper_reduce_precalc_v_addr),
-		.q_addr		(helper_reduce_precalc_q_addr),
-		
-		.k				(k),
-		
-		.s_is_odd	(flag_reduce_s_is_odd),
-		.k_is_nul	(flag_reduce_k_is_nul),
-		
-		.r_din		(buf_r_wr_dout),
-		.s_din		(buf_s_rd_dout),
-		.q_din		(q_din),
-		
-		.r_wren		(helper_reduce_precalc_r_wren),
-		.u_wren		(helper_reduce_precalc_u_wren),
-		.v_wren		(helper_reduce_precalc_v_wren),
-		
-		.r_dout		(helper_reduce_precalc_r_data),
-		.u_dout		(helper_reduce_precalc_u_data),
-		.v_dout		(helper_reduce_precalc_v_data)
-	);
-	
-		//
-		// Helper Module - Reduction Update
-		//
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_reduce_update_s_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_reduce_update_u_addr;
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_reduce_update_v_addr;
-	
-	wire									helper_reduce_update_s_wren;
-	
-	wire	[              32-1:0]	helper_reduce_update_s_data;
-	
-	modinv_helper_reduce_update #
-	(
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
-	)
-	helper_reduce_update
-	(
-		.clk 							(clk),
-		.rst_n						(rst_n),
-		
-		.ena 							(helper_reduce_update_ena),
-		.rdy 							(helper_reduce_update_rdy),
-		
-		.s_is_odd					(flag_reduce_s_is_odd),
-		.k_is_nul					(flag_reduce_k_is_nul),
-		
-		.s_addr						(helper_reduce_update_s_addr),
-		.u_addr						(helper_reduce_update_u_addr),
-		.v_addr						(helper_reduce_update_v_addr),
-		
-		.s_wren						(helper_reduce_update_s_wren),
-		
-		.s_dout						(helper_reduce_update_s_data),
-				
-		.u_din						(buf_u_rd_dout),
-		.v_din						(buf_v_rd_dout)
-	);
-	
-	
-		//
-		// Helper Module - Copying
-		//
-	wire	[BUFFER_ADDR_BITS-1:0]	helper_copy_s_addr;
-		
-	modinv_helper_copy #
-	(
-		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
-		.OPERAND_ADDR_BITS	(OPERAND_ADDR_BITS),
-	
-		.BUFFER_NUM_WORDS		(BUFFER_NUM_WORDS),
-		.BUFFER_ADDR_BITS		(BUFFER_ADDR_BITS)
-	)
-	helper_copy
-	(
-		.clk 		(clk),
-		.rst_n	(rst_n),
-		
-		.ena 		(helper_copy_ena),
-		.rdy 		(helper_copy_rdy),
-		
-		.s_addr	(helper_copy_s_addr),
-		.a1_addr	(a1_addr),
-		
-		.s_din	(buf_s_rd_dout),
-		
-		.a1_dout	(a1_dout),
-		
-		.a1_wren	(a1_wren)
-	);
-	
-	
-		//
-		// Round Counter (shared by the invert loop and the reduce loop; wraps to zero after the last round)
-		//
-	reg	[ROUND_COUNTER_BITS-1:0]	round_counter;
-	wire	[ROUND_COUNTER_BITS-1:0]	round_counter_max = LOOP_NUM_ROUNDS - 1;
-	wire	[ROUND_COUNTER_BITS-1:0]	round_counter_zero = {ROUND_COUNTER_BITS{1'b0}};
-	wire	[ROUND_COUNTER_BITS-1:0]	round_counter_next =
-		(round_counter < round_counter_max) ? round_counter + 1'b1 : round_counter_zero;
-
-	
-		//
-		// FSM: IDLE -> INIT -> invert loop -> reduce loop -> COPY -> DONE -> IDLE
-		//
-	localparam FSM_STATE_IDLE				= 4'd0;
-	
-	localparam FSM_STATE_INIT				= 4'd1;
-	
-	localparam FSM_STATE_INVERT_PRECALC	= 4'd11;
-	localparam FSM_STATE_INVERT_COMPARE	= 4'd12;
-	localparam FSM_STATE_INVERT_UPDATE	= 4'd13;
-	
-	localparam FSM_STATE_REDUCE_PRECALC	= 4'd14;
-	localparam FSM_STATE_REDUCE_UPDATE	= 4'd15;
-	
-	localparam FSM_STATE_COPY				= 4'd2;
-	
-	localparam FSM_STATE_DONE				= 4'd3;
-	
-	reg [3:0] fsm_state = FSM_STATE_IDLE;
-	reg [3:0] fsm_state_dly = FSM_STATE_IDLE;
-	
-	wire fsm_state_new = (fsm_state != fsm_state_dly);	// high in the first cycle after a state change
-
-	wire [3:0] fsm_state_invert_next = (round_counter < round_counter_max) ?
-		FSM_STATE_INVERT_PRECALC : FSM_STATE_REDUCE_PRECALC;	// repeat the invert round or move on to reduction
-		
-	wire [3:0] fsm_state_reduce_next = (round_counter < round_counter_max) ?
-		FSM_STATE_REDUCE_PRECALC : FSM_STATE_COPY;	// repeat the reduce round or move on to the final copy
-	
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE;
-		else case (fsm_state)
-			FSM_STATE_IDLE:				fsm_state <= ena                        ? FSM_STATE_INIT           : FSM_STATE_IDLE;
-			FSM_STATE_INIT:				fsm_state <= helper_init_done           ? FSM_STATE_INVERT_PRECALC : FSM_STATE_INIT;
-			FSM_STATE_INVERT_PRECALC:	fsm_state <= helper_invert_precalc_done ? FSM_STATE_INVERT_COMPARE : FSM_STATE_INVERT_PRECALC;
-			FSM_STATE_INVERT_COMPARE:	fsm_state <= helper_invert_compare_done ? FSM_STATE_INVERT_UPDATE  : FSM_STATE_INVERT_COMPARE;
-			FSM_STATE_INVERT_UPDATE:	fsm_state <= helper_invert_update_done  ? fsm_state_invert_next    : FSM_STATE_INVERT_UPDATE;
-			FSM_STATE_REDUCE_PRECALC:	fsm_state <= helper_reduce_precalc_done ? FSM_STATE_REDUCE_UPDATE  : FSM_STATE_REDUCE_PRECALC;
-			FSM_STATE_REDUCE_UPDATE:	fsm_state <= helper_reduce_update_done  ? fsm_state_reduce_next    : FSM_STATE_REDUCE_UPDATE;
-			FSM_STATE_COPY:				fsm_state <= helper_copy_done           ? FSM_STATE_DONE           : FSM_STATE_COPY;
-			FSM_STATE_DONE:				fsm_state <= FSM_STATE_IDLE;
-			default:							fsm_state <= FSM_STATE_IDLE;
-		endcase
-		
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)	fsm_state_dly <= FSM_STATE_IDLE;
-		else						fsm_state_dly <= fsm_state;
-
-		// each helper gets a one-cycle enable pulse on entry to its state
-	assign helper_init_ena				= (fsm_state == FSM_STATE_INIT)           && fsm_state_new;
-	assign helper_invert_precalc_ena	= (fsm_state == FSM_STATE_INVERT_PRECALC) && fsm_state_new;
-	assign helper_invert_compare_ena	= (fsm_state == FSM_STATE_INVERT_COMPARE) && fsm_state_new;
-	assign helper_invert_update_ena	= (fsm_state == FSM_STATE_INVERT_UPDATE)  && fsm_state_new;
-	assign helper_reduce_precalc_ena	= (fsm_state == FSM_STATE_REDUCE_PRECALC) && fsm_state_new;
-	assign helper_reduce_update_ena	= (fsm_state == FSM_STATE_REDUCE_UPDATE)  && fsm_state_new;
-	assign helper_copy_ena				= (fsm_state == FSM_STATE_COPY)           && fsm_state_new;
-	
-	
-		//
-		// Counter Increment (the three conditions name different states, so at most one fires per cycle)
-		//
-	always @(posedge clk) begin
-		//
-		if ((fsm_state == FSM_STATE_INIT) && helper_init_done)
-			round_counter <= round_counter_zero;
-		//	
-		if ((fsm_state == FSM_STATE_INVERT_UPDATE) && helper_invert_update_done)
-			round_counter <= round_counter_next;
-		//
-		if ((fsm_state == FSM_STATE_REDUCE_UPDATE) && helper_reduce_update_done)
-			round_counter <= round_counter_next;
-		//
-	end
-		
-		
-		//
-		// Q Address Selector (don't-care outside the two states that read Q)
-		//
-	always @(*) begin
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				q_addr = helper_init_q_addr;
-			FSM_STATE_REDUCE_PRECALC:	q_addr = helper_reduce_precalc_q_addr;
-			default:							q_addr = {OPERAND_ADDR_BITS{1'bX}};
-		endcase
-		//
-	end
-	
-	
-		//
-		// Buffer Address Selector (addresses are don't-care in states that do not use the port)
-		//
-	always @(*) begin
-		//
-		// Write Ports
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_r_wr_addr = helper_init_r_addr;
-			FSM_STATE_INVERT_UPDATE:	buf_r_wr_addr = helper_invert_update_r_addr;
-			FSM_STATE_REDUCE_PRECALC:	buf_r_wr_addr = helper_reduce_precalc_r_addr;
-			default:							buf_r_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_s_wr_addr = helper_init_s_addr;
-			FSM_STATE_INVERT_UPDATE:	buf_s_wr_addr = helper_invert_update_s_addr;
-			FSM_STATE_REDUCE_UPDATE:	buf_s_wr_addr = helper_reduce_update_s_addr;
-			default:							buf_s_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_u_wr_addr = helper_init_u_addr;
-			FSM_STATE_INVERT_UPDATE:	buf_u_wr_addr = helper_invert_update_u_addr;
-			FSM_STATE_REDUCE_PRECALC:	buf_u_wr_addr = helper_reduce_precalc_u_addr;
-			default:							buf_u_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_v_wr_addr = helper_init_v_addr;
-			FSM_STATE_INVERT_UPDATE:	buf_v_wr_addr = helper_invert_update_v_addr;
-			FSM_STATE_REDUCE_PRECALC:	buf_v_wr_addr = helper_reduce_precalc_v_addr;
-			default:							buf_v_wr_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		// Read Ports
-		//
-		case (fsm_state)
-			FSM_STATE_INVERT_PRECALC:	buf_r_rd_addr = helper_invert_precalc_r_addr;
-			default:							buf_r_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INVERT_PRECALC:	buf_s_rd_addr = helper_invert_precalc_s_addr;
-			FSM_STATE_REDUCE_PRECALC:	buf_s_rd_addr = helper_reduce_precalc_s_addr;
-			FSM_STATE_COPY:				buf_s_rd_addr = helper_copy_s_addr;
-			default:							buf_s_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INVERT_PRECALC:	buf_u_rd_addr = helper_invert_precalc_u_addr;
-			FSM_STATE_INVERT_COMPARE:	buf_u_rd_addr = helper_invert_compare_u_addr;
-			FSM_STATE_REDUCE_UPDATE:	buf_u_rd_addr = helper_reduce_update_u_addr;
-			default:							buf_u_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INVERT_PRECALC:	buf_v_rd_addr = helper_invert_precalc_v_addr;
-			FSM_STATE_INVERT_COMPARE:	buf_v_rd_addr = helper_invert_compare_v_addr;
-			FSM_STATE_REDUCE_UPDATE:	buf_v_rd_addr = helper_reduce_update_v_addr;
-			default:							buf_v_rd_addr = {BUFFER_ADDR_BITS{1'bX}};
-		endcase
-		//
-	end
-	
-	
-		//
-		// Buffer Write Enable Logic (defaults to 0 so no buffer is written outside its owner's state)
-		//
-	always @(*) begin
-		//
-		// Write Ports
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_r_wr_en = helper_init_r_wren;
-			FSM_STATE_INVERT_UPDATE:	buf_r_wr_en = helper_invert_update_r_wren;
-			FSM_STATE_REDUCE_PRECALC:	buf_r_wr_en = helper_reduce_precalc_r_wren;
-			default:							buf_r_wr_en = 1'b0;
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_s_wr_en = helper_init_s_wren;
-			FSM_STATE_INVERT_UPDATE:	buf_s_wr_en = helper_invert_update_s_wren;
-			FSM_STATE_REDUCE_UPDATE:	buf_s_wr_en = helper_reduce_update_s_wren;
-			default:							buf_s_wr_en = 1'b0;
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_u_wr_en = helper_init_u_wren;
-			FSM_STATE_INVERT_UPDATE:	buf_u_wr_en = helper_invert_update_u_wren;
-			FSM_STATE_REDUCE_PRECALC:	buf_u_wr_en = helper_reduce_precalc_u_wren;
-			default:							buf_u_wr_en = 1'b0;
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_v_wr_en = helper_init_v_wren;
-			FSM_STATE_INVERT_UPDATE:	buf_v_wr_en = helper_invert_update_v_wren;
-			FSM_STATE_REDUCE_PRECALC:	buf_v_wr_en = helper_reduce_precalc_v_wren;
-			default:							buf_v_wr_en = 1'b0;
-		endcase
-		//
-	end
-	
-	
-		//
-		// Buffer Write Data Selector
-		//
-	always @(*) begin
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_r_wr_din = helper_init_r_data;
-			FSM_STATE_INVERT_UPDATE:	buf_r_wr_din = helper_invert_update_r_data;
-			FSM_STATE_REDUCE_PRECALC:	buf_r_wr_din = helper_reduce_precalc_r_data;
-			default:							buf_r_wr_din = {32{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_s_wr_din = helper_init_s_data;
-			FSM_STATE_INVERT_UPDATE:	buf_s_wr_din = helper_invert_update_s_data;
-			FSM_STATE_REDUCE_UPDATE:	buf_s_wr_din = helper_reduce_update_s_data;
-			default:							buf_s_wr_din = {32{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_u_wr_din = helper_init_u_data;
-			FSM_STATE_INVERT_UPDATE:	buf_u_wr_din = helper_invert_update_u_data;
-			FSM_STATE_REDUCE_PRECALC:	buf_u_wr_din = helper_reduce_precalc_u_data;
-			default:							buf_u_wr_din = {32{1'bX}};
-		endcase
-		//
-		case (fsm_state)
-			FSM_STATE_INIT:				buf_v_wr_din = helper_init_v_data;
-			FSM_STATE_INVERT_UPDATE:	buf_v_wr_din = helper_invert_update_v_data;
-			FSM_STATE_REDUCE_PRECALC:	buf_v_wr_din = helper_reduce_precalc_v_data;
-			default:							buf_v_wr_din = {32{1'bX}};
-		endcase
-		//
-	end
-	
-	
-		//
-		// Ready Logic
-		//
-	reg rdy_reg = 1'b1;
-
-	assign rdy = rdy_reg;
-	
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0) rdy_reg <= 1'b1;
-		else begin
-		
-				/* clear */
-			if (rdy && ena) rdy_reg <= 1'b0;
-			
-				/* set */
-			if (!rdy && (fsm_state == FSM_STATE_DONE)) rdy_reg <= 1'b1;
-			
-		end
-	
-		
-		//
-		// Store Redundant Power of 2 (K): cleared at init, +1 per invert update unless v == 1, -1 per reduce update down to 0
-		//
-	always @(posedge clk)
-		//
-		if (helper_init_ena)
-			k <= {K_NUM_BITS{1'b0}};
-		else begin
-		
-			if (helper_invert_update_ena && !flag_invert_v_eq_1)
-				k <= k + 1'b1;
-				
-			if (helper_reduce_update_ena && (k != {K_NUM_BITS{1'b0}}))
-				k <= k - 1'b1;
-				
-		end
-	
-endmodule
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_multiplier_256.v b/rtl/modular/modular_multiplier_256.v
index c2f2661..2b35233 100644
--- a/rtl/modular/modular_multiplier_256.v
+++ b/rtl/modular/modular_multiplier_256.v
@@ -1,402 +1,402 @@
-//------------------------------------------------------------------------------
-//
-// modular_multiplier_256.v
-// -----------------------------------------------------------------------------
-// Modular multiplier.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-module modular_multiplier_256
-	(
-		clk, rst_n,
-		ena, rdy,
-		a_addr, b_addr, n_addr, p_addr, p_wren,
-		a_din, b_din, n_din, p_dout
-	);
-	
-	
-		//
-		// Constants
-		//
-	localparam	OPERAND_NUM_WORDS					= 8;
-	localparam	WORD_COUNTER_WIDTH				= 3;
-	
-	
-		//
-		// Handy Numbers
-		//
-	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
-	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
-	
-	
-		//
-		// Handy Functions
-		//
-	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
-		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
-		begin
-			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
-				WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
-		end
-	endfunction
-	
-	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_PREVIOUS_OR_LAST;
-		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
-		begin
-			WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT > WORD_INDEX_ZERO) ?
-				WORD_INDEX_CURRENT - 1'b1 : WORD_INDEX_LAST;
-		end
-	endfunction
-	
-	
-		//
-		// Ports
-		//
-	input		wire										clk;		// system clock
-	input		wire										rst_n;	// active-low async reset
-	
-	input		wire										ena;		// enable input
-	output	wire										rdy;		// ready output
-	
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	a_addr;	// index of current A word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	b_addr;	// index of current B word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;	// index of current N word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	p_addr;	// index of current P word
-	output	wire										p_wren;	// store current P word now	
-	
-	input		wire	[                  31:0]	a_din;	// A
-	input		wire	[                  31:0]	b_din;	// B
-	input		wire	[                  31:0]	n_din;	// N (must be P-256!)
-	output	wire	[                  31:0]	p_dout;	// P = A * B mod N
-	
-	
-		//
-		// Word Indices
-		//
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_a;
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_b;
-		
-		/* map registers to output ports */
-	assign a_addr	= index_a;
-	assign b_addr	= index_b;
-	
-		//
-		// FSM
-		//
-	localparam	FSM_SHREG_WIDTH	= (1 * OPERAND_NUM_WORDS + 1) + (2 * OPERAND_NUM_WORDS + 1) + (2 * OPERAND_NUM_WORDS + 2) + (0 * OPERAND_NUM_WORDS + 2) + 1;
-	
-	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
-	
-	assign rdy = fsm_shreg[0];
-	
-	wire [1 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_inc_index_a	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];
-	wire [1 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_store_word_a	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];
-	wire [2 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_inc_index_b	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 1)];
-	wire [2 * OPERAND_NUM_WORDS-2:0]	fsm_shreg_store_si_msb	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 1)];
-	wire [0 * OPERAND_NUM_WORDS-0:0] fsm_shreg_store_si_lsb	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 2)];
-	wire [2 * OPERAND_NUM_WORDS-2:0]	fsm_shreg_shift_si		= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 1)];
-	wire [0 * OPERAND_NUM_WORDS-0:0]	fsm_shreg_mask_cw1_sum	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 4)];
-	wire [2 * OPERAND_NUM_WORDS-1:0]	fsm_shreg_store_c_word	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 4)];
-	wire [0 * OPERAND_NUM_WORDS-0:0]	fsm_shreg_reduce_start	= fsm_shreg[FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 5)];
-	wire [0 * OPERAND_NUM_WORDS-0:0]	fsm_shreg_reduce_stop	= fsm_shreg[FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 6) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 6)];
-	
-	wire inc_index_a		= |fsm_shreg_inc_index_a;
-	wire store_word_a		= |fsm_shreg_store_word_a;
-	wire inc_index_b		= |fsm_shreg_inc_index_b;
-	wire clear_mac_ab		= |fsm_shreg_inc_index_b;
-	wire shift_wide_a		= |fsm_shreg_inc_index_b;
-	wire enable_mac_ab	= |fsm_shreg_inc_index_b;
-	wire store_si_msb		= |fsm_shreg_store_si_msb;
-	wire store_si_lsb		=  fsm_shreg_store_si_lsb;
-	wire shift_si			= |fsm_shreg_shift_si;
-	wire mask_cw1_sum		=  fsm_shreg_mask_cw1_sum;
-	wire store_c_word		= |fsm_shreg_store_c_word;
-	wire reduce_start		=  fsm_shreg_reduce_start;
-	wire reduce_stop		=  fsm_shreg_reduce_stop;
-	
-	
-		//
-		// FSM Logic
-		//
-	wire	reduce_done;
-		
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)
-			//
-			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
-			//
-		else begin
-			//
-			if (rdy)
-				fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
-			//
-			else if (!reduce_stop || reduce_done)
-				fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
-			//
-		end
-	
-		
-		//
-		// Word Index Increment Logic
-		//
-	reg	index_b_ff;
-	
-	always @(posedge clk)
-		//
-		if (inc_index_b) index_b_ff <= ~index_b_ff;
-		else index_b_ff <= 1'b0;
-	
-	always @(posedge clk)
-		//
-		if (rdy) begin
-			//
-			index_a		<= WORD_INDEX_ZERO;
-			index_b		<= WORD_INDEX_LAST;
-			//
-		end else begin
-			//
-			if (inc_index_a)						index_a	<= WORD_INDEX_NEXT_OR_ZERO(index_a);
-			if (inc_index_b && !index_b_ff)	index_b	<= WORD_INDEX_PREVIOUS_OR_LAST(index_b);
-			//
-		end
-		
-		
-		//
-		// Wide Operand Buffer
-		//
-	reg	[255:0]	buf_a_wide;
-	
-	always @(posedge clk)
-		//
-		if (store_word_a)
-			buf_a_wide <= {buf_a_wide[16 +: 256 - 3 * 16], {a_din[15:0], a_din[31:16]}, buf_a_wide[256 - 2 * 16 +: 16]};
-		else if (shift_wide_a)
-			buf_a_wide <= {buf_a_wide[256-(16+1):0], buf_a_wide[256-16+:16]};
-		
-		
-		//
-		// Multiplier Array
-		//
-	wire	mac_inhibit;			// control signal to pause all accumulators
-	
-	wire	[46: 0]	mac[0:15];	// outputs of all accumulators
-	reg	[15: 0]	mac_clear;	// individual per-accumulator clear flag
-	
-	assign mac_inhibit = ~enable_mac_ab;
-	
-	always @(posedge clk)
-		//
-		if (!clear_mac_ab)
-			mac_clear <= {16{1'b1}};
-		else begin
-		
-			if (mac_clear == {16{1'b1}})
-				mac_clear <= {{14{1'b0}}, 1'b1, {1{1'b0}}};
-			else
-				mac_clear <= (mac_clear[15] == 1'b0) ? {mac_clear[14:0], 1'b0} : {16{1'b1}};
-				
-		
-		end
-	
-		//
-		// Array of parallel multipliers
-		//
-	genvar i;
-	generate for (i=0; i<16; i=i+1)
-		begin : gen_mac_array
-			//
-			mac16_wrapper mac16_inst
-			(
-				.clk		(clk),
-				.ce		(~mac_inhibit),
-				
-				.clr		(mac_clear[i]),
-				
-				.a			(buf_a_wide[16*i+:16]),
-				.b			(index_b_ff ? b_din[15:0] : b_din[31:16]),
-				.s			(mac[i])
-			);
-			//
-		end
-	endgenerate
-	
-		//
-		// Intermediate Words
-		//
-	reg	[47*(2*OPERAND_NUM_WORDS-1)-1:0]	si_msb;
-	reg	[47*(2*OPERAND_NUM_WORDS-0)-1:0]	si_lsb;
-	
-	
-	wire	[47*(2*OPERAND_NUM_WORDS-1)-1:0]	si_msb_new;
-	wire	[47*(2*OPERAND_NUM_WORDS-0)-1:0]	si_lsb_new;
-
-	generate for (i=0; i<16; i=i+1)
-		begin : gen_si_lsb_new
-			assign si_lsb_new[47*i+:47] = mac[15-i];
-		end
-	endgenerate
-	
-	generate for (i=1; i<16; i=i+1)
-		begin : gen_si_msb_new
-			assign si_msb_new[47*(15-i)+:47] = mac_clear[i] ? mac[i] : si_msb[47*(15-i)+:47];
-		end
-	endgenerate
-	
-	always @(posedge clk) begin
-		//
-		if (shift_si) begin
-			si_msb <= {{2*47{1'b0}}, si_msb[15*47-1:2*47]};
-			si_lsb <= {si_msb[2*47-1:0], si_lsb[16*47-1:2*47]};
-		end else begin
-		
-			if (store_si_msb)
-				si_msb <= si_msb_new;
-			
-			if (store_si_lsb)
-				si_lsb <= si_lsb_new;
-		end
-			
-	end
-	
-				
-		//
-		// Accumulators
-		//
-	wire	[46: 0]	add47_cw0_s;
-	wire	[46: 0]	add47_cw1_s;
-	
-	
-		//
-		// cw0, b, cw1, b
-		//
-	reg	[30: 0]	si_prev_dly;
-	reg	[15: 0]	si_next_dly;
-	
-	always @(posedge clk)
-		//
-		if (shift_si)
-			si_prev_dly <= si_lsb[93:63];
-		else
-			si_prev_dly <= {31{1'b0}};
-			
-	always @(posedge clk)
-		//
-		si_next_dly <= si_lsb[62:47];
-	
-	wire	[46: 0]	add47_cw0_a = si_lsb[46:0];
-	wire	[46: 0]	add47_cw0_b = {{16{1'b0}}, si_prev_dly};
-	
-	wire	[46: 0]	add47_cw1_a = add47_cw0_s;
-	wire	[46: 0]	add47_cw1_b = {{15{1'b0}}, si_next_dly, mask_cw1_sum ? {16{1'b0}} : {1'b0, add47_cw1_s[46:32]}};	
-	
-	adder47_wrapper add47_cw0_inst
-	(
-		.clk	(clk),
-		.a		(add47_cw0_a),
-		.b		(add47_cw0_b),
-		.s		(add47_cw0_s)
-	);
-	
-	adder47_wrapper add47_cw1_inst
-	(
-		.clk	(clk),
-		.a		(add47_cw1_a),
-		.b		(add47_cw1_b),
-		.s		(add47_cw1_s)
-	);
-	
-	
-	
-		//
-		// Full-Size Product
-		//
-	reg	[WORD_COUNTER_WIDTH:0]	bram_c_addr;
-	
-	wire	[WORD_COUNTER_WIDTH:0]	reduce_c_addr;
-	wire	[                31:0]	reduce_c_word;
-	
-	always @(posedge clk)
-		//
-		if (store_c_word)
-			bram_c_addr <= bram_c_addr + 1'b1;
-		else
-			bram_c_addr <= {2*WORD_COUNTER_WIDTH{1'b0}};
-	
-	bram_1rw_1ro_readfirst #
-	(
-		.MEM_WIDTH			(32),
-		.MEM_ADDR_BITS		(WORD_COUNTER_WIDTH + 1)
-	)
-	bram_c_inst
-	(
-		.clk		(clk),
-
-		.a_addr	(bram_c_addr),
-		.a_wr		(store_c_word),
-		.a_in		(add47_cw1_s[31:0]),
-		.a_out	(),
-
-		.b_addr	(reduce_c_addr),
-		.b_out	(reduce_c_word)
-	);
-	
-	
-		//
-		// Reduction Stage
-		//
-	modular_reductor_256 reduce_256_inst
-	(
-		.clk		(clk),
-		.rst_n	(rst_n),
-		
-		.ena		(reduce_start),
-		.rdy		(reduce_done),
-		
-		.x_addr	(reduce_c_addr),
-		.n_addr	(n_addr),
-		.p_addr	(p_addr),
-		.p_wren	(p_wren),
-		
-		.x_din	(reduce_c_word),
-		.n_din	(n_din),
-		.p_dout	(p_dout)
-	);
-	
-		
-endmodule
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+//
+// modular_multiplier_256.v
+// -----------------------------------------------------------------------------
+// Modular multiplier.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_multiplier_256
+  (
+   clk, rst_n,
+   ena, rdy,
+   a_addr, b_addr, n_addr, p_addr, p_wren,
+   a_din, b_din, n_din, p_dout
+   );
+   // Computes P = A * B mod N; operands and result are exchanged as 32-bit
+   // words addressed by a_addr, b_addr, n_addr and p_addr.
+   //
+   // Constants: operands are OPERAND_NUM_WORDS words of 32 bits (256 bits)
+   //
+   localparam	OPERAND_NUM_WORDS				= 8;
+   localparam	WORD_COUNTER_WIDTH				= 3;
+
+
+   //
+   // Handy Numbers: first and last word index of a single operand
+   //
+   localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
+   localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
+
+
+   //
+   // Handy Functions: word index stepping with wrap-around at either end
+   //
+   function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
+      input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+      begin
+	 WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
+				   WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
+      end
+   endfunction
+
+   function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_PREVIOUS_OR_LAST;
+      input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
+      begin
+	 WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT > WORD_INDEX_ZERO) ?
+				       WORD_INDEX_CURRENT - 1'b1 : WORD_INDEX_LAST;
+      end
+   endfunction
+
+
+   //
+   // Ports
+   //
+   input	wire					clk;	// system clock
+   input	wire					rst_n;	// active-low async reset
+
+   input	wire 					ena;	// enable input
+   output	wire 					rdy;	// ready output
+
+   output	wire [WORD_COUNTER_WIDTH-1:0] 		a_addr;	// index of current A word
+   output	wire [WORD_COUNTER_WIDTH-1:0] 		b_addr;	// index of current B word
+   output	wire [WORD_COUNTER_WIDTH-1:0] 		n_addr;	// index of current N word
+   output	wire [WORD_COUNTER_WIDTH-1:0] 		p_addr;	// index of current P word
+   output	wire 					p_wren;	// store current P word now
+
+   input	wire [                  31:0] 		a_din;	// A
+   input	wire [                  31:0] 		b_din;	// B
+   input	wire [                  31:0] 		n_din;	// N (must be P-256!)
+   output	wire [                  31:0] 		p_dout;	// P = A * B mod N
+
+
+   //
+   // Word Indices: read addresses for the A and B operand words
+   //
+   reg [WORD_COUNTER_WIDTH-1:0] 			index_a;
+   reg [WORD_COUNTER_WIDTH-1:0] 			index_b;
+
+   /* map registers to output ports */
+   assign a_addr	= index_a;
+   assign b_addr	= index_b;
+
+   // FSM: a one-hot shift register. A single 1 is loaded at the MSB when a
+   // run starts and shifts right each clock; every control strobe below is
+   // the OR of a window of bits, and bit 0 is the idle/ready flag.
+   localparam	FSM_SHREG_WIDTH	= (1 * OPERAND_NUM_WORDS + 1) + (2 * OPERAND_NUM_WORDS + 1) + (2 * OPERAND_NUM_WORDS + 2) + (0 * OPERAND_NUM_WORDS + 2) + 1;
+
+   reg [FSM_SHREG_WIDTH-1:0] 				fsm_shreg;
+
+   assign rdy = fsm_shreg[0];
+
+   wire [1 * OPERAND_NUM_WORDS-1:0] 			fsm_shreg_inc_index_a	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];
+   wire [1 * OPERAND_NUM_WORDS-1:0] 			fsm_shreg_store_word_a	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];
+   wire [2 * OPERAND_NUM_WORDS-1:0] 			fsm_shreg_inc_index_b	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 1)];
+   wire [2 * OPERAND_NUM_WORDS-2:0] 			fsm_shreg_store_si_msb	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 1)];
+   wire [0 * OPERAND_NUM_WORDS-0:0] 			fsm_shreg_store_si_lsb	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 2)];
+   wire [2 * OPERAND_NUM_WORDS-2:0] 			fsm_shreg_shift_si	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 1)];
+   wire [0 * OPERAND_NUM_WORDS-0:0] 			fsm_shreg_mask_cw1_sum	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 4)];
+   wire [2 * OPERAND_NUM_WORDS-1:0] 			fsm_shreg_store_c_word	= fsm_shreg[FSM_SHREG_WIDTH - (3 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 4)];
+   wire [0 * OPERAND_NUM_WORDS-0:0] 			fsm_shreg_reduce_start	= fsm_shreg[FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 5)];
+   wire [0 * OPERAND_NUM_WORDS-0:0] 			fsm_shreg_reduce_stop	= fsm_shreg[FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 6) : FSM_SHREG_WIDTH - (5 * OPERAND_NUM_WORDS + 6)];
+   // NOTE(review): fsm_shreg_inc_index_b and fsm_shreg_store_si_msb are one bit narrower than the slices assigned to them, so the top slice bit is truncated -- confirm intended.
+   wire 						inc_index_a		= |fsm_shreg_inc_index_a;
+   wire 						store_word_a		= |fsm_shreg_store_word_a;
+   wire 						inc_index_b		= |fsm_shreg_inc_index_b;
+   wire 						clear_mac_ab		= |fsm_shreg_inc_index_b;
+   wire 						shift_wide_a		= |fsm_shreg_inc_index_b;
+   wire 						enable_mac_ab		= |fsm_shreg_inc_index_b;
+   wire 						store_si_msb		= |fsm_shreg_store_si_msb;
+   wire 						store_si_lsb		=  fsm_shreg_store_si_lsb;
+   wire 						shift_si		= |fsm_shreg_shift_si;
+   wire 						mask_cw1_sum		=  fsm_shreg_mask_cw1_sum;
+   wire 						store_c_word		= |fsm_shreg_store_c_word;
+   wire 						reduce_start		=  fsm_shreg_reduce_start;
+   wire 						reduce_stop		=  fsm_shreg_reduce_stop;
+
+
+   // FSM Logic: while idle, ena loads the start bit at the MSB and clears
+   // rdy; otherwise the register shifts right, holding at reduce_stop
+   // until the reductor reports reduce_done.
+   wire 						reduce_done;
+
+   always @(posedge clk or negedge rst_n)
+     //
+     if (rst_n == 1'b0)
+       //
+       fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+   //
+     else begin
+	//
+	if (rdy)
+	  fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+	//
+	else if (!reduce_stop || reduce_done)
+	  fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+	//
+     end
+
+
+   // Word Index Increment Logic: index_a counts up and index_b counts down;
+   // index_b_ff toggles while inc_index_b is active, so index_b steps only
+   // on every other cycle of that window.
+   reg	index_b_ff;
+
+   always @(posedge clk)
+     //
+     if (inc_index_b) index_b_ff <= ~index_b_ff;
+     else index_b_ff <= 1'b0;
+
+   always @(posedge clk)
+     //
+     if (rdy) begin
+	//
+	index_a		<= WORD_INDEX_ZERO;
+	index_b		<= WORD_INDEX_LAST;
+	//
+     end else begin
+	//
+	if (inc_index_a)		index_a	<= WORD_INDEX_NEXT_OR_ZERO(index_a);
+	if (inc_index_b && !index_b_ff)	index_b	<= WORD_INDEX_PREVIOUS_OR_LAST(index_b);
+	//
+     end
+
+
+   // Wide Operand Buffer: store_word_a moves the buffer up by 32 bits and
+   // puts a_din (16-bit halves swapped) at bits [47:16]; shift_wide_a
+   // rotates the whole buffer left by 16 bits.
+   reg	[255:0]	buf_a_wide;
+
+   always @(posedge clk)
+     //
+     if (store_word_a)
+       buf_a_wide <= {buf_a_wide[16 +: 256 - 3 * 16], {a_din[15:0], a_din[31:16]}, buf_a_wide[256 - 2 * 16 +: 16]};
+     else if (shift_wide_a)
+       buf_a_wide <= {buf_a_wide[256-(16+1):0], buf_a_wide[256-16+:16]};
+
+
+   // Multiplier Array: sixteen mac16_wrapper instances, each fed one
+   // 16-bit slice of buf_a_wide and the currently selected half of b_din;
+   // mac_clear carries the per-accumulator clear flags.
+   wire 	mac_inhibit;	// control signal to pause all accumulators
+
+   wire [46: 0] mac[0:15];	// outputs of all accumulators
+   reg [15: 0] 	mac_clear;	// individual per-accumulator clear flag
+
+   assign mac_inhibit = ~enable_mac_ab;
+
+   always @(posedge clk)
+     // all flags set while clear_mac_ab is low; otherwise one flag walks up from bit 1
+     if (!clear_mac_ab)
+       mac_clear <= {16{1'b1}};
+     else begin
+
+	if (mac_clear == {16{1'b1}})
+	  mac_clear <= {{14{1'b0}}, 1'b1, {1{1'b0}}};
+	else
+	  mac_clear <= (mac_clear[15] == 1'b0) ? {mac_clear[14:0], 1'b0} : {16{1'b1}};
+
+
+     end
+
+     //
+     // Array of parallel multipliers (b input: low half of b_din when index_b_ff is set, else high half)
+     //
+     genvar i;
+     generate for (i=0; i<16; i=i+1)
+       begin : gen_mac_array
+	  //
+	  mac16_wrapper mac16_inst
+		     (
+		      .clk		(clk),
+		      .ce		(~mac_inhibit),
+
+		      .clr		(mac_clear[i]),
+
+		      .a		(buf_a_wide[16*i+:16]),
+		      .b		(index_b_ff ? b_din[15:0] : b_din[31:16]),
+		      .s		(mac[i])
+		      );
+	  //
+       end
+     endgenerate
+
+     // Intermediate Words: 47-bit accumulator outputs captured into si_msb
+     // (15 words) and si_lsb (16 words); shift_si moves both down by two
+     // words per clock, with the low two words of si_msb entering si_lsb.
+     reg	[47*(2*OPERAND_NUM_WORDS-1)-1:0]	si_msb;
+     reg	[47*(2*OPERAND_NUM_WORDS-0)-1:0]	si_lsb;
+
+
+     wire	[47*(2*OPERAND_NUM_WORDS-1)-1:0]	si_msb_new;
+     wire	[47*(2*OPERAND_NUM_WORDS-0)-1:0]	si_lsb_new;
+
+     generate for (i=0; i<16; i=i+1)
+       begin : gen_si_lsb_new
+	  assign si_lsb_new[47*i+:47] = mac[15-i];
+       end
+     endgenerate
+
+     generate for (i=1; i<16; i=i+1)
+       begin : gen_si_msb_new
+	  assign si_msb_new[47*(15-i)+:47] = mac_clear[i] ? mac[i] : si_msb[47*(15-i)+:47];
+       end
+     endgenerate
+
+     always @(posedge clk) begin
+	// shifting takes priority over the two store strobes
+	if (shift_si) begin
+	   si_msb <= {{2*47{1'b0}}, si_msb[15*47-1:2*47]};
+	   si_lsb <= {si_msb[2*47-1:0], si_lsb[16*47-1:2*47]};
+	end else begin
+
+	   if (store_si_msb)
+	     si_msb <= si_msb_new;
+
+	   if (store_si_lsb)
+	     si_lsb <= si_lsb_new;
+	end
+
+     end
+
+
+     //
+     // Accumulators: sum outputs of the two adder47_wrapper instances below
+     //
+     wire	[46: 0]	add47_cw0_s;
+     wire	[46: 0]	add47_cw1_s;
+
+
+     // cw0, b, cw1, b: adder operands. cw0 adds si_lsb word 0 to the delayed
+     // upper 31 bits of word 1; cw1 adds that sum to the delayed low 16 bits
+     // of word 1 (at [31:16]) plus bits [46:32] of its own output.
+     reg	[30: 0]	si_prev_dly;
+     reg	[15: 0]	si_next_dly;
+
+     always @(posedge clk)
+       // held at zero whenever shift_si is not active
+       if (shift_si)
+	 si_prev_dly <= si_lsb[93:63];
+       else
+	 si_prev_dly <= {31{1'b0}};
+
+       always @(posedge clk)
+	 //
+	 si_next_dly <= si_lsb[62:47];
+
+       wire	[46: 0]	add47_cw0_a = si_lsb[46:0];
+       wire	[46: 0]	add47_cw0_b = {{16{1'b0}}, si_prev_dly};
+
+       wire	[46: 0]	add47_cw1_a = add47_cw0_s;
+       wire	[46: 0]	add47_cw1_b = {{15{1'b0}}, si_next_dly, mask_cw1_sum ? {16{1'b0}} : {1'b0, add47_cw1_s[46:32]}};
+
+       adder47_wrapper add47_cw0_inst
+	 (
+	  .clk	(clk),
+	  .a		(add47_cw0_a),
+	  .b		(add47_cw0_b),
+	  .s		(add47_cw0_s)
+	  );
+
+       adder47_wrapper add47_cw1_inst
+	 (
+	  .clk	(clk),
+	  .a		(add47_cw1_a),
+	  .b		(add47_cw1_b),
+	  .s		(add47_cw1_s)
+	  );
+
+
+
+       // Full-Size Product: the low 32 bits of add47_cw1_s are written to
+       // bram_c_inst at bram_c_addr, which counts up while store_c_word is
+       // set and returns to zero otherwise; the reductor reads port b.
+       reg	[WORD_COUNTER_WIDTH:0]	bram_c_addr;
+
+       wire	[WORD_COUNTER_WIDTH:0]	reduce_c_addr;
+       wire	[                31:0]	reduce_c_word;
+
+       always @(posedge clk)
+	 // idle value is sized to the WORD_COUNTER_WIDTH+1 bit address register
+	 if (store_c_word)
+	   bram_c_addr <= bram_c_addr + 1'b1;
+	 else
+	   bram_c_addr <= {(WORD_COUNTER_WIDTH+1){1'b0}};
+
+	 bram_1rw_1ro_readfirst #
+	   (
+	    .MEM_WIDTH		(32),
+	    .MEM_ADDR_BITS	(WORD_COUNTER_WIDTH + 1)
+	    )
+	 bram_c_inst
+	   (
+	    .clk		(clk),
+
+	    .a_addr		(bram_c_addr),
+	    .a_wr		(store_c_word),
+	    .a_in		(add47_cw1_s[31:0]),
+	    .a_out		(),
+
+	    .b_addr		(reduce_c_addr),
+	    .b_out		(reduce_c_word)
+	    );
+
+
+	 // Reduction Stage: enabled by reduce_start; fetches words through
+	 // reduce_c_addr/reduce_c_word and drives n_addr, p_addr, p_wren
+	 // and p_dout of this module directly.
+	 modular_reductor_256 reduce_256_inst
+	   (
+	    .clk		(clk),
+	    .rst_n		(rst_n),
+
+	    .ena		(reduce_start),
+	    .rdy		(reduce_done),
+
+	    .x_addr		(reduce_c_addr),
+	    .n_addr		(n_addr),
+	    .p_addr		(p_addr),
+	    .p_wren		(p_wren),
+
+	    .x_din		(reduce_c_word),
+	    .n_din		(n_din),
+	    .p_dout		(p_dout)
+	    );
+
+
+	 endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_reductor_256.v b/rtl/modular/modular_reductor_256.v
index e4b346a..6f31570 100644
--- a/rtl/modular/modular_reductor_256.v
+++ b/rtl/modular/modular_reductor_256.v
@@ -1,692 +1,692 @@
-//------------------------------------------------------------------------------
-//
-// modular_reductor_256.v
-// -----------------------------------------------------------------------------
-// Modular reductor.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2015-2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-module modular_reductor_256
-	(
-		clk, rst_n,
-		ena, rdy,
-		x_addr, n_addr, p_addr, p_wren,
-		x_din, n_din, p_dout
-	);
-		
-		//
-		// Constants
-		//
-	localparam	OPERAND_NUM_WORDS		= 8;
-	localparam	WORD_COUNTER_WIDTH	= 3;
-	
-	
-		//
-		// Handy Numbers
-		//
-	localparam	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_ZERO	= 0;
-	localparam	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_LAST	= 2 * OPERAND_NUM_WORDS - 1;
-	
-	
-		//
-		// Handy Functions
-		//
-	function	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_PREVIOUS_OR_LAST;
-		input	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_CURRENT;
-		begin
-			WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT > WORD_INDEX_ZERO) ?
-				WORD_INDEX_CURRENT - 1'b1 : WORD_INDEX_LAST;
-		end
-	endfunction
-	
-	
-		//
-		// Ports
-		//
-	input		wire										clk;		// system clock
-	input		wire										rst_n;	// active-low async reset
-	
-	input		wire										ena;		// enable input
-	output	wire										rdy;		// ready output
-	
-	output	wire	[WORD_COUNTER_WIDTH-0:0]	x_addr;	// index of current X word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;	// index of current N word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	p_addr;	// index of current P word
-	output	wire										p_wren;	// store current P word now	
-	
-	input		wire	[                  31:0]	x_din;	// X
-	input		wire	[                  31:0]	n_din;	// N (must be P-256!)
-	output	wire	[                  31:0]	p_dout;	// P = X mod N
-	
-	
-		//
-		// Word Indices
-		//
-	reg	[WORD_COUNTER_WIDTH:0]	index_x;
-	
-		
-		/* map registers to output ports */
-	assign x_addr	= index_x;
-	
-	
-		//
-		// FSM
-		//
-	localparam	FSM_SHREG_WIDTH	= (2 * OPERAND_NUM_WORDS + 1) + (5 * 2) + 1;
-	
-	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
-	
-	assign rdy = fsm_shreg[0];
-	
-	wire [2 * OPERAND_NUM_WORDS - 1:0]	fsm_shreg_inc_index_x	= fsm_shreg[FSM_SHREG_WIDTH - 0*OPERAND_NUM_WORDS - 1 -: 2 * OPERAND_NUM_WORDS];
-	wire [2 * OPERAND_NUM_WORDS - 1:0]	fsm_shreg_store_word_z	= fsm_shreg[FSM_SHREG_WIDTH - 0*OPERAND_NUM_WORDS - 2 -: 2 * OPERAND_NUM_WORDS];
-	wire [2 *                 5 - 1:0]	fsm_shreg_reduce_stages	= fsm_shreg[                                        1 +: 2 *                 5];
-	
-	wire [5-1:0] fsm_shreg_reduce_stage_start;
-	wire [5-1:0] fsm_shreg_reduce_stage_stop;
-	
-	genvar s;
-	generate for (s=0; s<5; s=s+1)
-		begin : gen_fsm_shreg_reduce_stages
-			assign fsm_shreg_reduce_stage_start[5 - (s + 1)]	= fsm_shreg_reduce_stages[2 * (5 - s) - 1];
-			assign fsm_shreg_reduce_stage_stop[5 - (s + 1)]	= fsm_shreg_reduce_stages[2 * (5 - s) - 2];
-		end
-	endgenerate
-	
-	wire inc_index_x	= |fsm_shreg_inc_index_x;
-	wire store_word_z	= |fsm_shreg_store_word_z;
-	wire reduce_start	= |fsm_shreg_reduce_stage_start;
-	wire reduce_stop	= |fsm_shreg_reduce_stage_stop;
-	wire store_p		=  fsm_shreg_reduce_stage_stop[0];
-	
-	
-	wire	reduce_adder0_done;
-	wire	reduce_adder1_done;
-	wire	reduce_subtractor_done;
-	
-	wire	reduce_done_all = reduce_adder0_done & reduce_adder1_done & reduce_subtractor_done;
-	
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)
-			//
-			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
-			//
-		else begin
-			//
-			if (rdy)
-				//
-				fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
-			//
-			else if (!reduce_stop || reduce_done_all)
-				//
-				fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
-			//
-		end
-	
-		
-		//
-		// Word Index Increment Logic
-		//
-	always @(posedge clk)
-		//
-		if (rdy)
-			//
-			index_x <= WORD_INDEX_LAST;
-			//
-		else if (inc_index_x)
-			//
-			index_x	<= WORD_INDEX_PREVIOUS_OR_LAST(index_x);
-			
-			
-		//
-		// Look-up Table
-		//
-		
-		//
-		// Take a look at the corresponding C model for more information
-		// on how exactly the math behind reduction works. The first step
-		// is to assemble nine 256-bit values ("z-words") from 32-bit parts
-		// of the full 512-bit product ("c-word"). The problem with z5 is
-		// that it contains c13 two times. This implementation scans from
-		// c15 to c0 and writes current part of c-word into corresponding
-		// parts of z-words. Since those 32-bit parts are stored in block
-		// memories, one source word can only be written to one location in
-		// every z-word at a time. The trick is to delay c13 and then write
-		// the delayed value at the corresponding location in z5 instead of
-		// the next c12. "z_save" flag is used to indicate that the current
-		// word should be delayed and written once again during the next cycle.
-		//
-		
-	reg	[9*WORD_COUNTER_WIDTH-1:0]	z_addr;	//
-	reg	[9                   -1:0]	z_wren;	//
-	reg	[9                   -1:0]	z_mask;	// mask input to store zero word
-	reg	[9                   -1:0]	z_save;	// save previous word once again
-	
-	always @(posedge clk)
-		//
-		if (inc_index_x)
-			//
-			case (index_x)
-				//
-				//                     s9     s8     s7     s6     s5     s4     s3     s2     s1
-				//                     ||     ||     ||     ||     ||     ||     ||     ||     ||
-				4'd00:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd00};
-				4'd01:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd01};
-				4'd02:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd02};
-				4'd03:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd03};
-				4'd04:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd04};
-				4'd05:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd05};
-				4'd06:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd06};
-				4'd07:	z_addr <= {3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'dxx, 3'd07};
-				4'd08:	z_addr <= {3'd02, 3'd03, 3'd04, 3'd06, 3'd07, 3'd00, 3'd00, 3'd00, 3'dxx};
-				4'd09:	z_addr <= {3'd03, 3'd04, 3'd06, 3'd03, 3'd00, 3'd01, 3'd01, 3'd01, 3'dxx};
-				4'd10:	z_addr <= {3'd04, 3'd05, 3'd05, 3'd07, 3'd01, 3'd02, 3'd02, 3'd02, 3'dxx};
-				4'd11:	z_addr <= {3'd05, 3'd06, 3'd07, 3'd00, 3'd02, 3'd03, 3'd07, 3'd03, 3'dxx};
-				4'd12:	z_addr <= {3'd06, 3'd07, 3'd00, 3'd01, 3'd06, 3'd04, 3'd03, 3'd04, 3'dxx};
-				4'd13:	z_addr <= {3'd07, 3'd00, 3'd01, 3'd02, 3'd03, 3'd05, 3'd04, 3'd05, 3'dxx};
-				4'd14:	z_addr <= {3'd00, 3'd01, 3'd02, 3'd04, 3'd04, 3'd06, 3'd05, 3'd06, 3'dxx};
-				4'd15:	z_addr <= {3'd01, 3'd02, 3'd03, 3'd05, 3'd05, 3'd07, 3'd06, 3'd07, 3'dxx};
-				//
-            default:	z_addr <= {9*WORD_COUNTER_WIDTH{1'bX}};
-				//
-         endcase
-	
-	always @(posedge clk)
-		//
-		case (index_x)
-			//
-			//                     9     8     7     6     5     4     3     2     1
-			//                     |     |     |     |     |     |     |     |     |
-			4'd00:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd01:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd02:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd03:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd04:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd05:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd06:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd07:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
-			4'd08:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd09:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd10:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd11:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd12:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd13:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd14:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			4'd15:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
-			//
-			default:	z_wren <= {9{1'b0}};
-			//
-		endcase
-		
-	always @(posedge clk)
-		//
-		if (inc_index_x)
-			//
-			case (index_x)
-				//
-				//                     9     8     7     6     5     4     3     2     1
-				//                     |     |     |     |     |     |     |     |     |
-				4'd00:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd01:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd02:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd03:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd04:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd05:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd06:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd07:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd08:	z_mask <= {1'b1, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
-				4'd09:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
-				4'd10:	z_mask <= {1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
-				4'd11:	z_mask <= {1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0, 1'b0};
-				4'd12:	z_mask <= {1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
-				4'd13:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
-				4'd14:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd15:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				//
-            default:	z_mask <= {9{1'bX}};
-				//
-         endcase
-			
-	always @(posedge clk)
-		//
-		if (inc_index_x)
-			//
-			case (index_x)
-				//
-				//                     9     8     7     6     5     4     3     2     1
-				//                     |     |     |     |     |     |     |     |     |
-				4'd00:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd01:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd02:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd03:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd04:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd05:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd06:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd07:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd08:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd09:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd10:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd11:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd12:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd13:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd14:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				4'd15:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
-				//
-            default:	z_save <= {9{1'bX}};
-				//
-         endcase
-		
-		
-		//
-		// Intermediate Numbers
-		//
-	reg	[WORD_COUNTER_WIDTH-1:0]	reduce_z_addr[1:9];
-	wire	[                32-1:0]	reduce_z_dout[1:9];
-	
-	reg	[31: 0]	x_din_dly;
-	always @(posedge clk)
-		//
-		x_din_dly <= x_din;
-		
-	
-	genvar z;
-	generate for (z=1; z<=9; z=z+1)
-		//
-		begin : gen_z_bram
-			//
-			bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
-			bram_c_inst
-			(
-				.clk		(clk),
-
-				.a_addr	(z_addr[(z-1) * WORD_COUNTER_WIDTH +: WORD_COUNTER_WIDTH]),
-				.a_wr		(z_wren[z-1] & store_word_z),
-				.a_in		(z_mask[z-1] ? {32{1'b0}} : (z_save[z-1] ? x_din_dly : x_din)),
-				.a_out	(),
-
-				.b_addr	(reduce_z_addr[z]),
-				.b_out	(reduce_z_dout[z])
-			);
-			//
-		end
-		//
-	endgenerate
-		
-		
-		
-	
-	wire	[                32-1:0]	bram_sum0_wr_din;
-	wire	[WORD_COUNTER_WIDTH-1:0]	bram_sum0_wr_addr;
-	wire										bram_sum0_wr_wren;
-	
-	wire	[                32-1:0]	bram_sum1_wr_din;
-	wire	[WORD_COUNTER_WIDTH-1:0]	bram_sum1_wr_addr;
-	wire										bram_sum1_wr_wren;
-	
-	wire	[                32-1:0]	bram_diff_wr_din;
-	wire	[WORD_COUNTER_WIDTH-1:0]	bram_diff_wr_addr;
-	wire										bram_diff_wr_wren;
-	
-	wire	[                32-1:0]	bram_sum0_rd_dout;
-	reg	[WORD_COUNTER_WIDTH-1:0]	bram_sum0_rd_addr;
-	
-	wire	[                32-1:0]	bram_sum1_rd_dout;
-	reg	[WORD_COUNTER_WIDTH-1:0]	bram_sum1_rd_addr;
-
-	wire	[                32-1:0]	bram_diff_rd_dout;
-	reg	[WORD_COUNTER_WIDTH-1:0]	bram_diff_rd_addr;
-
-	
-	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
-	bram_sum0_inst
-	(
-		.clk		(clk),
-
-		.a_addr	(bram_sum0_wr_addr),
-		.a_wr		(bram_sum0_wr_wren),
-		.a_in		(bram_sum0_wr_din),
-		.a_out	(),
-
-		.b_addr	(bram_sum0_rd_addr),
-		.b_out	(bram_sum0_rd_dout)
-	);
-	
-	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
-	bram_sum1_inst
-	(
-		.clk		(clk),
-
-		.a_addr	(bram_sum1_wr_addr),
-		.a_wr		(bram_sum1_wr_wren),
-		.a_in		(bram_sum1_wr_din),
-		.a_out	(),
-
-		.b_addr	(bram_sum1_rd_addr),
-		.b_out	(bram_sum1_rd_dout)
-	);
-	
-	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
-	bram_diff_inst
-	(
-		.clk		(clk),
-
-		.a_addr	(bram_diff_wr_addr),
-		.a_wr		(bram_diff_wr_wren),
-		.a_in		(bram_diff_wr_din),
-		.a_out	(),
-
-		.b_addr	(bram_diff_rd_addr),
-		.b_out	(bram_diff_rd_dout)
-	);
-	
-	
-	wire	[WORD_COUNTER_WIDTH-1:0]	adder0_ab_addr;
-	wire	[WORD_COUNTER_WIDTH-1:0]	adder1_ab_addr;
-	wire	[WORD_COUNTER_WIDTH-1:0]	subtractor_ab_addr;
-	
-	reg	[                32-1:0]	adder0_a_din;
-	reg	[                32-1:0]	adder0_b_din;
-	
-	reg	[                32-1:0]	adder1_a_din;
-	reg	[                32-1:0]	adder1_b_din;
-	
-	reg	[                32-1:0]	subtractor_a_din;
-	reg	[                32-1:0]	subtractor_b_din;
-	
-	// n_addr - only 1 output, because all modules are in sync
-	
-	modular_adder #
-	(
-		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
-		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH)
-	)
-	adder_inst0
-	(
-		.clk			(clk),
-		.rst_n		(rst_n),
-		
-		.ena			(reduce_start),
-		.rdy			(reduce_adder0_done),
-		
-		.ab_addr		(adder0_ab_addr),
-		.n_addr		(),
-		.s_addr		(bram_sum0_wr_addr),
-		.s_wren		(bram_sum0_wr_wren),
-		
-		.a_din		(adder0_a_din),
-		.b_din		(adder0_b_din),
-		.n_din		(n_din),
-		.s_dout		(bram_sum0_wr_din)
-	);
-	
-	modular_adder #
-	(
-		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
-		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH)
-	)
-	adder_inst1
-	(
-		.clk			(clk),
-		.rst_n		(rst_n),
-		
-		.ena			(reduce_start),
-		.rdy			(reduce_adder1_done),
-		
-		.ab_addr		(adder1_ab_addr),
-		.n_addr		(),
-		.s_addr		(bram_sum1_wr_addr),
-		.s_wren		(bram_sum1_wr_wren),
-		
-		.a_din		(adder1_a_din),
-		.b_din		(adder1_b_din),
-		.n_din		(n_din),
-		.s_dout		(bram_sum1_wr_din)
-	);
-	
-	modular_subtractor #
-	(
-		.OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
-		.WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH)
-	)
-	subtractor_inst
-	(
-		.clk			(clk),
-		.rst_n		(rst_n),
-		
-		.ena			(reduce_start),
-		.rdy			(reduce_subtractor_done),
-		
-		.ab_addr		(subtractor_ab_addr),
-		.n_addr		(n_addr),
-		.d_addr		(bram_diff_wr_addr),
-		.d_wren		(bram_diff_wr_wren),
-		
-		.a_din		(subtractor_a_din),
-		.b_din		(subtractor_b_din),
-		.n_din		(n_din),
-		.d_dout		(bram_diff_wr_din)
-	);
-	
-	
-		//
-		// Address (Operand) Selector
-		//
-	always @(*)
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			//
-			5'b10000: begin
-				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[2]	= adder0_ab_addr;
-				reduce_z_addr[3]	= adder1_ab_addr;
-				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[6]	= subtractor_ab_addr;
-				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum0_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
-			end
-			//
-			5'b01000: begin
-				reduce_z_addr[1]	= adder0_ab_addr;
-				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[4]	= adder1_ab_addr;
-				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[7]	= subtractor_ab_addr;
-				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum0_rd_addr	= adder0_ab_addr;
-				bram_sum1_rd_addr	= adder1_ab_addr;
-				bram_diff_rd_addr = subtractor_ab_addr;
-			end
-			//
-			5'b00100: begin
-				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[5]	= adder0_ab_addr;
-				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[8]	= subtractor_ab_addr;
-				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum0_rd_addr	= adder0_ab_addr;
-				bram_sum1_rd_addr	= adder1_ab_addr;
-				bram_diff_rd_addr = subtractor_ab_addr;
-			end
-			//
-			5'b00010: begin
-				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[9]	= subtractor_ab_addr;
-				bram_sum0_rd_addr	= adder0_ab_addr;
-				bram_sum1_rd_addr	= adder0_ab_addr;
-				bram_diff_rd_addr = subtractor_ab_addr;
-			end
-			//
-			5'b00001: begin
-				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum0_rd_addr	= adder0_ab_addr;
-				bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_diff_rd_addr = adder0_ab_addr;
-			end			
-			//
-			default: begin
-				reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
-				reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum0_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
-				bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
-			end
-			//
-		endcase
-
-	
-		//
-		// adder 0
-		//
-	always @(*) begin
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			5'b10000:	adder0_a_din = reduce_z_dout[2];
-			5'b01000:	adder0_a_din = bram_sum0_rd_dout;
-			5'b00100:	adder0_a_din = bram_sum0_rd_dout;
-			5'b00010:	adder0_a_din = bram_sum0_rd_dout;
-			5'b00001:	adder0_a_din = bram_sum0_rd_dout;
-			default:		adder0_a_din = {32{1'bX}};
-		endcase
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			5'b10000:	adder0_b_din = reduce_z_dout[2];
-			5'b01000:	adder0_b_din = reduce_z_dout[1];
-			5'b00100:	adder0_b_din = reduce_z_dout[5];
-			5'b00010:	adder0_b_din = bram_sum1_rd_dout;
-			5'b00001:	adder0_b_din = bram_diff_rd_dout;
-			default:		adder0_b_din = {32{1'bX}};
-		endcase
-		//
-	end
-	
-		//
-		// adder 1
-		//
-	always @(*) begin
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			5'b10000:	adder1_a_din = reduce_z_dout[3];
-			5'b01000:	adder1_a_din = bram_sum1_rd_dout;
-			5'b00100:	adder1_a_din = bram_sum1_rd_dout;
-			5'b00010:	adder1_a_din = {32{1'bX}};
-			5'b00001:	adder1_a_din = {32{1'bX}};
-			default:		adder1_a_din = {32{1'bX}};
-		endcase
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			5'b10000:	adder1_b_din = reduce_z_dout[3];
-			5'b01000:	adder1_b_din = reduce_z_dout[4];
-			5'b00100:	adder1_b_din = {32{1'b0}};
-			5'b00010:	adder1_b_din = {32{1'bX}};
-			5'b00001:	adder1_b_din = {32{1'bX}};
-			default:		adder1_b_din = {32{1'bX}};
-		endcase
-		//
-	end
-	
-	
-		//
-		// subtractor
-		//
-	always @(*) begin
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			5'b10000:	subtractor_a_din = {32{1'b0}};
-			5'b01000:	subtractor_a_din = bram_diff_rd_dout;
-			5'b00100:	subtractor_a_din = bram_diff_rd_dout;
-			5'b00010:	subtractor_a_din = bram_diff_rd_dout;
-			5'b00001:	subtractor_a_din = {32{1'bX}};
-			default:		subtractor_a_din = {32{1'bX}};
-		endcase
-		//
-		case (fsm_shreg_reduce_stage_stop)
-			5'b10000:	subtractor_b_din = reduce_z_dout[6];
-			5'b01000:	subtractor_b_din = reduce_z_dout[7];
-			5'b00100:	subtractor_b_din = reduce_z_dout[8];
-			5'b00010:	subtractor_b_din = reduce_z_dout[9];
-			5'b00001:	subtractor_b_din = {32{1'bX}};
-			default:		subtractor_b_din = {32{1'bX}};
-		endcase
-		//
-	end
-
-
-		//
-		// Address Mapping
-		//
-	assign p_addr	= bram_sum0_wr_addr;
-	assign p_wren	= bram_sum0_wr_wren & store_p;
-	assign p_dout	= bram_sum0_wr_din;
-		
-		
-endmodule
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+//
+// modular_reductor_256.v
+// -----------------------------------------------------------------------------
+// Modular reductor.
+//
+// Authors: Pavel Shatov
+//
+// Copyright (c) 2015-2016, NORDUnet A/S
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+
+module modular_reductor_256
+  (
+   clk, rst_n,
+   ena, rdy,
+   x_addr, n_addr, p_addr, p_wren,
+   x_din, n_din, p_dout
+   );
+
+   //
+   // Constants
+   //
+   localparam	OPERAND_NUM_WORDS	= 8;	// words per operand (data ports are 32 bits wide)
+   localparam	WORD_COUNTER_WIDTH	= 3;	// width of an operand word index
+
+
+   //
+   // Handy Numbers (one bit wider than a word index: X has 2 * OPERAND_NUM_WORDS words)
+   //
+   localparam	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_ZERO	= 0;	// lowest X word index
+   localparam	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_LAST	= 2 * OPERAND_NUM_WORDS - 1;	// highest X word index
+
+
+   //
+   // Handy Functions
+   //
+   function	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_PREVIOUS_OR_LAST;	// returns the index one below the argument, wrapping around
+      input	[WORD_COUNTER_WIDTH:0]	WORD_INDEX_CURRENT;	// index to step down from
+      begin
+	 WORD_INDEX_PREVIOUS_OR_LAST = (WORD_INDEX_CURRENT > WORD_INDEX_ZERO) ?	// above zero: decrement
+				       WORD_INDEX_CURRENT - 1'b1 : WORD_INDEX_LAST;	// at zero: wrap to WORD_INDEX_LAST
+      end
+   endfunction
+
+
+   //
+   // Ports
+   //
+   input		wire				clk;	// system clock
+   input		wire				rst_n;	// active-low async reset
+
+   input		wire				ena;	// enable input
+   output	wire 					rdy;	// ready output
+
+   output	wire [WORD_COUNTER_WIDTH-0:0] 		x_addr;	// index of current X word
+   output	wire [WORD_COUNTER_WIDTH-1:0] 		n_addr;	// index of current N word
+   output	wire [WORD_COUNTER_WIDTH-1:0] 		p_addr;	// index of current P word
+   output	wire 					p_wren;	// store current P word now
+
+   input		wire [                  31:0] 	x_din;	// X
+   input		wire [                  31:0] 	n_din;	// N (must be P-256!)
+   output	wire [                  31:0] 		p_dout;	// P = X mod N
+
+
+   //
+   // Word Indices
+   //
+   reg [WORD_COUNTER_WIDTH:0] 				index_x;
+
+
+   /* map registers to output ports */
+   assign x_addr	= index_x;
+
+
+   //
+   // FSM
+   //
+   localparam	FSM_SHREG_WIDTH	= (2 * OPERAND_NUM_WORDS + 1) + (5 * 2) + 1;
+   // The FSM is a shift register that is shifted right, towards bit 0.
+   reg [FSM_SHREG_WIDTH-1:0] 				fsm_shreg;
+   // Bit 0 is the ready flag.
+   assign rdy = fsm_shreg[0];
+   // Bit windows: two overlapping 2 * OPERAND_NUM_WORDS wide windows at the top (the second is one bit lower), then 5 stages * 2 bits just above bit 0.
+   wire [2 * OPERAND_NUM_WORDS - 1:0] 			fsm_shreg_inc_index_x	= fsm_shreg[FSM_SHREG_WIDTH - 0*OPERAND_NUM_WORDS - 1 -: 2 * OPERAND_NUM_WORDS];
+   wire [2 * OPERAND_NUM_WORDS - 1:0] 			fsm_shreg_store_word_z	= fsm_shreg[FSM_SHREG_WIDTH - 0*OPERAND_NUM_WORDS - 2 -: 2 * OPERAND_NUM_WORDS];
+   wire [2 *                 5 - 1:0] 			fsm_shreg_reduce_stages	= fsm_shreg[                                        1 +: 2 *                 5];
+   // Per-stage start/stop flags, extracted from fsm_shreg_reduce_stages by the generate loop below.
+   wire [5-1:0] 					fsm_shreg_reduce_stage_start;
+   wire [5-1:0] 					fsm_shreg_reduce_stage_stop;
+
+   genvar 						s;	// stage counter, 0..4
+   generate for (s=0; s<5; s=s+1)	// each stage owns an adjacent bit pair of fsm_shreg_reduce_stages
+     begin : gen_fsm_shreg_reduce_stages
+	assign fsm_shreg_reduce_stage_start[5 - (s + 1)]	= fsm_shreg_reduce_stages[2 * (5 - s) - 1];	// upper bit of the pair
+	assign fsm_shreg_reduce_stage_stop[5 - (s + 1)]		= fsm_shreg_reduce_stages[2 * (5 - s) - 2];	// lower bit of the pair
+     end
+   endgenerate
+
+   wire inc_index_x	= |fsm_shreg_inc_index_x;	// any bit set in the index-stepping window
+   wire store_word_z	= |fsm_shreg_store_word_z;	// any bit set in the z-word store window
+   wire reduce_start	= |fsm_shreg_reduce_stage_start;	// any stage's start bit set (drives ena of the adders/subtractor)
+   wire reduce_stop	= |fsm_shreg_reduce_stage_stop;	// any stage's stop bit set (the FSM holds here until reduce_done_all)
+   wire store_p		=  fsm_shreg_reduce_stage_stop[0];	// stop bit with index 0 only
+
+   // rdy outputs of adder_inst0, adder_inst1 and subtractor_inst
+   wire	reduce_adder0_done;
+   wire	reduce_adder1_done;
+   wire	reduce_subtractor_done;
+   // high only when all three units report ready
+   wire	reduce_done_all = reduce_adder0_done & reduce_adder1_done & reduce_subtractor_done;
+
+   always @(posedge clk or negedge rst_n)
+     // asynchronous active-low reset
+     if (rst_n == 1'b0)
+       // reset state: only bit 0 set, so rdy is asserted
+       fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
+   //
+     else begin
+	// while ready: wait for ena
+	if (rdy)
+	  // ena = 1 sets the top bit and clears bit 0; ena = 0 keeps bit 0 set
+	  fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
+	// while busy: hold on a stop bit until all three units are done
+	else if (!reduce_stop || reduce_done_all)
+	  // shift right by one, filling with zero from the top
+	  fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
+	//
+     end
+
+
+   //
+   // Word Index Increment Logic
+   //
+   always @(posedge clk)
+     // while ready, park the index on the highest X word
+     if (rdy)
+       //
+       index_x <= WORD_INDEX_LAST;
+   // otherwise step down by one word (wrapping to WORD_INDEX_LAST after zero)
+     else if (inc_index_x)
+       //
+       index_x	<= WORD_INDEX_PREVIOUS_OR_LAST(index_x);
+
+
+   //
+   // Look-up Table
+   //
+
+   //
+   // Take a look at the corresponding C model for more information
+   // on how exactly the math behind reduction works. The first step
+   // is to assemble nine 256-bit values ("z-words") from 32-bit parts
+   // of the full 512-bit product ("c-word"). The problem with z5 is
+   // that it contains c13 two times. This implementation scans from
+   // c15 to c0 and writes current part of c-word into corresponding
+   // parts of z-words. Since those 32-bit parts are stored in block
+   // memories, one source word can only be written to one location in
+   // every z-word at a time. The trick is to delay c13 and then write
+   // the delayed value at the corresponding location in z5 instead of
+   // the next c12. "z_save" flag is used to indicate that the current
+   // word should be delayed and written once again during the next cycle.
+   //
+
+   reg	[9*WORD_COUNTER_WIDTH-1:0]	z_addr;	//
+   reg [9                   -1:0] 	z_wren;	//
+   reg [9                   -1:0] 	z_mask;	// mask input to store zero word
+   reg [9                   -1:0] 	z_save;	// save previous word once again
+
+   always @(posedge clk)
+     // Write-address lookup: one 3-bit address per z-word memory, selected by the X word index.
+     if (inc_index_x)
+       // Don't-care slots are written as 3'dx: a decimal literal may hold only a single x digit.
+       case (index_x)
+	 // Slot order in the concatenation (MSB first):
+	 //                     s9     s8     s7     s6     s5     s4     s3     s2     s1
+	 //                     ||     ||     ||     ||     ||     ||     ||     ||     ||
+	 4'd00:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd00};
+	 4'd01:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd01};
+	 4'd02:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd02};
+	 4'd03:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd03};
+	 4'd04:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd04};
+	 4'd05:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd05};
+	 4'd06:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd06};
+	 4'd07:	z_addr <= {3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'dx,  3'd07};
+	 4'd08:	z_addr <= {3'd02, 3'd03, 3'd04, 3'd06, 3'd07, 3'd00, 3'd00, 3'd00, 3'dx};
+	 4'd09:	z_addr <= {3'd03, 3'd04, 3'd06, 3'd03, 3'd00, 3'd01, 3'd01, 3'd01, 3'dx};
+	 4'd10:	z_addr <= {3'd04, 3'd05, 3'd05, 3'd07, 3'd01, 3'd02, 3'd02, 3'd02, 3'dx};
+	 4'd11:	z_addr <= {3'd05, 3'd06, 3'd07, 3'd00, 3'd02, 3'd03, 3'd07, 3'd03, 3'dx};
+	 4'd12:	z_addr <= {3'd06, 3'd07, 3'd00, 3'd01, 3'd06, 3'd04, 3'd03, 3'd04, 3'dx};
+	 4'd13:	z_addr <= {3'd07, 3'd00, 3'd01, 3'd02, 3'd03, 3'd05, 3'd04, 3'd05, 3'dx};
+	 4'd14:	z_addr <= {3'd00, 3'd01, 3'd02, 3'd04, 3'd04, 3'd06, 3'd05, 3'd06, 3'dx};
+	 4'd15:	z_addr <= {3'd01, 3'd02, 3'd03, 3'd05, 3'd05, 3'd07, 3'd06, 3'd07, 3'dx};
+	 // All 16 index values are listed; the default only propagates X for an unknown index.
+         default:	z_addr <= {9*WORD_COUNTER_WIDTH{1'bX}};
+	 //
+       endcase
+
+   always @(posedge clk)
+     // Write-enable lookup, one bit per z-word memory; updated on every clock (not gated by inc_index_x).
+     case (index_x)
+       // Indices 0..7 enable memory 1 only; indices 8..15 enable memories 2..9.
+       //                     9     8     7     6     5     4     3     2     1
+       //                     |     |     |     |     |     |     |     |     |
+       4'd00:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd01:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd02:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd03:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd04:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd05:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd06:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd07:	z_wren <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1};
+       4'd08:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd09:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd10:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd11:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd12:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd13:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd14:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       4'd15:	z_wren <= {1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b1, 1'b0};
+       // Unknown index: disable all writes.
+       default:	z_wren <= {9{1'b0}};
+       //
+     endcase
+
+   always @(posedge clk)
+     // Mask lookup, one bit per z-word memory: a set bit makes that memory store an all-zero word.
+     if (inc_index_x)
+       // Updated only while the index is being stepped.
+       case (index_x)
+	 // Indices 0..7 never mask; indices 8..15 mask selected memories.
+	 //                     9     8     7     6     5     4     3     2     1
+	 //                     |     |     |     |     |     |     |     |     |
+	 4'd00:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd01:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd02:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd03:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd04:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd05:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd06:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd07:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd08:	z_mask <= {1'b1, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+	 4'd09:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+	 4'd10:	z_mask <= {1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0};
+	 4'd11:	z_mask <= {1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b1, 1'b1, 1'b0, 1'b0};
+	 4'd12:	z_mask <= {1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
+	 4'd13:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0};
+	 4'd14:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd15:	z_mask <= {1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 // All 16 index values are listed; the default only propagates X for an unknown index.
+         default:	z_mask <= {9{1'bX}};
+	 //
+       endcase
+
+   always @(posedge clk)
+     // Save lookup, one bit per z-word memory: a set bit makes that memory store x_din_dly instead of x_din.
+     if (inc_index_x)
+       // Updated only while the index is being stepped.
+       case (index_x)
+	 // Only index 12 sets a bit, and only for memory 5.
+	 //                     9     8     7     6     5     4     3     2     1
+	 //                     |     |     |     |     |     |     |     |     |
+	 4'd00:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd01:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd02:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd03:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd04:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd05:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd06:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd07:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd08:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd09:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd10:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd11:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd12:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b1, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd13:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd14:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 4'd15:	z_save <= {1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0, 1'b0};
+	 // All 16 index values are listed; the default only propagates X for an unknown index.
+         default:	z_save <= {9{1'bX}};
+	 //
+       endcase
+
+
+   //
+   // Intermediate Numbers
+   //
+   reg [WORD_COUNTER_WIDTH-1:0] 	reduce_z_addr[1:9];
+   wire [                32-1:0] 	reduce_z_dout[1:9];
+
+   reg [31: 0] 				x_din_dly;
+   always @(posedge clk)
+     // x_din delayed by one clock; written to a z-word memory when its z_save bit is set
+     x_din_dly <= x_din;
+
+
+   genvar 				z;
+   generate for (z=1; z<=9; z=z+1)
+     // One 32-bit wide memory per z-word (z = 1..9); bit/slice z-1 of each lookup register controls memory z.
+     begin : gen_z_bram
+	//
+	bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+	bram_c_inst
+		   (
+		    .clk		(clk),
+		    // port A: write side, driven by the lookup registers; a_out is unused
+		    .a_addr	(z_addr[(z-1) * WORD_COUNTER_WIDTH +: WORD_COUNTER_WIDTH]),
+		    .a_wr		(z_wren[z-1] & store_word_z),
+		    .a_in		(z_mask[z-1] ? {32{1'b0}} : (z_save[z-1] ? x_din_dly : x_din)),
+		    .a_out	(),
+		    // port B: read side, addressed through reduce_z_addr
+		    .b_addr	(reduce_z_addr[z]),
+		    .b_out	(reduce_z_dout[z])
+		    );
+	//
+     end
+      //
+   endgenerate
+
+
+
+
+   wire	[                32-1:0]	bram_sum0_wr_din;
+   wire [WORD_COUNTER_WIDTH-1:0] 	bram_sum0_wr_addr;
+   wire 				bram_sum0_wr_wren;
+
+   wire [                32-1:0] 	bram_sum1_wr_din;
+   wire [WORD_COUNTER_WIDTH-1:0] 	bram_sum1_wr_addr;
+   wire 				bram_sum1_wr_wren;
+
+   wire [                32-1:0] 	bram_diff_wr_din;
+   wire [WORD_COUNTER_WIDTH-1:0] 	bram_diff_wr_addr;
+   wire 				bram_diff_wr_wren;
+
+   wire [                32-1:0] 	bram_sum0_rd_dout;
+   reg [WORD_COUNTER_WIDTH-1:0] 	bram_sum0_rd_addr;
+
+   wire [                32-1:0] 	bram_sum1_rd_dout;
+   reg [WORD_COUNTER_WIDTH-1:0] 	bram_sum1_rd_addr;
+
+   wire [                32-1:0] 	bram_diff_rd_dout;
+   reg [WORD_COUNTER_WIDTH-1:0] 	bram_diff_rd_addr;
+
+
+   bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+   bram_sum0_inst
+     (
+      .clk		(clk),
+      // port A: written by adder_inst0 (its s_addr / s_wren / s_dout); a_out is unused
+      .a_addr	(bram_sum0_wr_addr),
+      .a_wr		(bram_sum0_wr_wren),
+      .a_in		(bram_sum0_wr_din),
+      .a_out	(),
+      // port B: read side, addressed through bram_sum0_rd_addr
+      .b_addr	(bram_sum0_rd_addr),
+      .b_out	(bram_sum0_rd_dout)
+      );
+
+   bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+   bram_sum1_inst
+     (
+      .clk		(clk),
+      // port A: written by adder_inst1 (its s_addr / s_wren / s_dout); a_out is unused
+      .a_addr	(bram_sum1_wr_addr),
+      .a_wr		(bram_sum1_wr_wren),
+      .a_in		(bram_sum1_wr_din),
+      .a_out	(),
+      // port B: read side, addressed through bram_sum1_rd_addr
+      .b_addr	(bram_sum1_rd_addr),
+      .b_out	(bram_sum1_rd_dout)
+      );
+
+   bram_1rw_1ro_readfirst # (.MEM_WIDTH(32), .MEM_ADDR_BITS(WORD_COUNTER_WIDTH))
+   bram_diff_inst
+     (
+      .clk		(clk),
+      // port A: written by subtractor_inst (its d_addr / d_wren / d_dout); a_out is unused
+      .a_addr	(bram_diff_wr_addr),
+      .a_wr		(bram_diff_wr_wren),
+      .a_in		(bram_diff_wr_din),
+      .a_out	(),
+      // port B: read side, addressed through bram_diff_rd_addr
+      .b_addr	(bram_diff_rd_addr),
+      .b_out	(bram_diff_rd_dout)
+      );
+
+
+   wire [WORD_COUNTER_WIDTH-1:0] 	adder0_ab_addr;
+   wire [WORD_COUNTER_WIDTH-1:0] 	adder1_ab_addr;
+   wire [WORD_COUNTER_WIDTH-1:0] 	subtractor_ab_addr;
+
+   reg [                32-1:0] 	adder0_a_din;
+   reg [                32-1:0] 	adder0_b_din;
+
+   reg [                32-1:0] 	adder1_a_din;
+   reg [                32-1:0] 	adder1_b_din;
+
+   reg [                32-1:0] 	subtractor_a_din;
+   reg [                32-1:0] 	subtractor_b_din;
+
+   // n_addr is taken from the subtractor only; the adders' n_addr outputs are left unconnected because all three modules run in sync
+
+   modular_adder #
+     (
+      .OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
+      .WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH)
+      )
+   adder_inst0
+     (
+      .clk			(clk),
+      .rst_n		(rst_n),
+      // handshake: ena <- reduce_start, rdy -> reduce_adder0_done
+      .ena			(reduce_start),
+      .rdy			(reduce_adder0_done),
+      // addresses: ab_addr goes to the Address Selector, s_addr/s_wren drive the write port of bram_sum0_inst, n_addr is unused
+      .ab_addr		(adder0_ab_addr),
+      .n_addr		(),
+      .s_addr		(bram_sum0_wr_addr),
+      .s_wren		(bram_sum0_wr_wren),
+      // data: operands come from the adder 0 multiplexer, the result is the write data of bram_sum0_inst
+      .a_din		(adder0_a_din),
+      .b_din		(adder0_b_din),
+      .n_din		(n_din),
+      .s_dout		(bram_sum0_wr_din)
+      );
+
+   modular_adder #
+     (
+      .OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
+      .WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH)
+      )
+   adder_inst1
+     (
+      .clk			(clk),
+      .rst_n		(rst_n),
+      // handshake: ena <- reduce_start, rdy -> reduce_adder1_done
+      .ena			(reduce_start),
+      .rdy			(reduce_adder1_done),
+      // addresses: ab_addr goes to the Address Selector, s_addr/s_wren drive the write port of bram_sum1_inst, n_addr is unused
+      .ab_addr		(adder1_ab_addr),
+      .n_addr		(),
+      .s_addr		(bram_sum1_wr_addr),
+      .s_wren		(bram_sum1_wr_wren),
+      // data: operands come from the adder 1 multiplexer, the result is the write data of bram_sum1_inst
+      .a_din		(adder1_a_din),
+      .b_din		(adder1_b_din),
+      .n_din		(n_din),
+      .s_dout		(bram_sum1_wr_din)
+      );
+
+   modular_subtractor #
+     (
+      .OPERAND_NUM_WORDS	(OPERAND_NUM_WORDS),
+      .WORD_COUNTER_WIDTH	(WORD_COUNTER_WIDTH)
+      )
+   subtractor_inst
+     (
+      .clk			(clk),
+      .rst_n		(rst_n),
+      // handshake: ena <- reduce_start, rdy -> reduce_subtractor_done
+      .ena			(reduce_start),
+      .rdy			(reduce_subtractor_done),
+      // addresses: ab_addr goes to the Address Selector, d_addr/d_wren drive the write port of bram_diff_inst; this instance drives n_addr
+      .ab_addr		(subtractor_ab_addr),
+      .n_addr		(n_addr),
+      .d_addr		(bram_diff_wr_addr),
+      .d_wren		(bram_diff_wr_wren),
+      // data: operands come from the subtractor multiplexer, the result is the write data of bram_diff_inst
+      .a_din		(subtractor_a_din),
+      .b_din		(subtractor_b_din),
+      .n_din		(n_din),
+      .d_dout		(bram_diff_wr_din)
+      );
+
+
+   //
+   // Address (Operand) Selector
+   //
+   always @(*)
+     // Per stage value: route each unit's ab_addr to the z words and result BRAMs it reads; every other address is X.
+     case (fsm_shreg_reduce_stage_stop)
+       // 10000: adder0 -> z[2], adder1 -> z[3], subtractor -> z[6]; BRAM read addresses are X
+       5'b10000: begin
+	  reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[2]	= adder0_ab_addr;
+	  reduce_z_addr[3]	= adder1_ab_addr;
+	  reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[6]	= subtractor_ab_addr;
+	  reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum0_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+       end
+       // 01000: adder0 -> z[1] and sum0, adder1 -> z[4] and sum1, subtractor -> z[7] and diff
+       5'b01000: begin
+	  reduce_z_addr[1]	= adder0_ab_addr;
+	  reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[4]	= adder1_ab_addr;
+	  reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[7]	= subtractor_ab_addr;
+	  reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum0_rd_addr	= adder0_ab_addr;
+	  bram_sum1_rd_addr	= adder1_ab_addr;
+	  bram_diff_rd_addr = subtractor_ab_addr;
+       end
+       // 00100: adder0 -> z[5] and sum0, adder1 -> sum1 only, subtractor -> z[8] and diff
+       5'b00100: begin
+	  reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[5]	= adder0_ab_addr;
+	  reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[8]	= subtractor_ab_addr;
+	  reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum0_rd_addr	= adder0_ab_addr;
+	  bram_sum1_rd_addr	= adder1_ab_addr;
+	  bram_diff_rd_addr = subtractor_ab_addr;
+       end
+       // 00010: adder0 addresses both sum0 and sum1, subtractor -> z[9] and diff
+       5'b00010: begin
+	  reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[9]	= subtractor_ab_addr;
+	  bram_sum0_rd_addr	= adder0_ab_addr;
+	  bram_sum1_rd_addr	= adder0_ab_addr;
+	  bram_diff_rd_addr = subtractor_ab_addr;
+       end
+       // 00001: adder0 addresses both sum0 and diff; no z word is addressed
+       5'b00001: begin
+	  reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum0_rd_addr	= adder0_ab_addr;
+	  bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_diff_rd_addr = adder0_ab_addr;
+       end
+       // any other value: every address is X
+       default: begin
+	  reduce_z_addr[1]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[2]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[3]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[4]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[5]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[6]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[7]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[8]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  reduce_z_addr[9]	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum0_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_sum1_rd_addr	= {WORD_COUNTER_WIDTH{1'bX}};
+	  bram_diff_rd_addr = {WORD_COUNTER_WIDTH{1'bX}};
+       end
+       //
+     endcase
+
+
+   //
+   // adder 0
+   //
+   always @(*) begin
+      // Operand A: z[2] in stage 10000, the value read back from bram_sum0 in every later stage.
+      case (fsm_shreg_reduce_stage_stop)
+	5'b10000:	adder0_a_din = reduce_z_dout[2];
+	5'b01000:	adder0_a_din = bram_sum0_rd_dout;
+	5'b00100:	adder0_a_din = bram_sum0_rd_dout;
+	5'b00010:	adder0_a_din = bram_sum0_rd_dout;
+	5'b00001:	adder0_a_din = bram_sum0_rd_dout;
+	default:		adder0_a_din = {32{1'bX}};
+      endcase
+      // Operand B: z[2] again (same word on both inputs), then z[1], z[5], bram_sum1 and finally bram_diff.
+      case (fsm_shreg_reduce_stage_stop)
+	5'b10000:	adder0_b_din = reduce_z_dout[2];
+	5'b01000:	adder0_b_din = reduce_z_dout[1];
+	5'b00100:	adder0_b_din = reduce_z_dout[5];
+	5'b00010:	adder0_b_din = bram_sum1_rd_dout;
+	5'b00001:	adder0_b_din = bram_diff_rd_dout;
+	default:		adder0_b_din = {32{1'bX}};
+      endcase
+      //
+   end
+
+   //
+   // adder 1
+   //
+   always @(*) begin
+      // Operand A: z[3] in stage 10000, the value read back from bram_sum1 in 01000 and 00100, X otherwise.
+      case (fsm_shreg_reduce_stage_stop)
+	5'b10000:	adder1_a_din = reduce_z_dout[3];
+	5'b01000:	adder1_a_din = bram_sum1_rd_dout;
+	5'b00100:	adder1_a_din = bram_sum1_rd_dout;
+	5'b00010:	adder1_a_din = {32{1'bX}};
+	5'b00001:	adder1_a_din = {32{1'bX}};
+	default:		adder1_a_din = {32{1'bX}};
+      endcase
+      // Operand B: z[3] again (same word on both inputs), then z[4], then constant zero; X in the last two stages.
+      case (fsm_shreg_reduce_stage_stop)
+	5'b10000:	adder1_b_din = reduce_z_dout[3];
+	5'b01000:	adder1_b_din = reduce_z_dout[4];
+	5'b00100:	adder1_b_din = {32{1'b0}};
+	5'b00010:	adder1_b_din = {32{1'bX}};
+	5'b00001:	adder1_b_din = {32{1'bX}};
+	default:		adder1_b_din = {32{1'bX}};
+      endcase
+      //
+   end
+
+
+   //
+   // subtractor
+   //
+   always @(*) begin
+      // Operand A: constant zero in stage 10000, the value read back from bram_diff in the next three stages, X otherwise.
+      case (fsm_shreg_reduce_stage_stop)
+	5'b10000:	subtractor_a_din = {32{1'b0}};
+	5'b01000:	subtractor_a_din = bram_diff_rd_dout;
+	5'b00100:	subtractor_a_din = bram_diff_rd_dout;
+	5'b00010:	subtractor_a_din = bram_diff_rd_dout;
+	5'b00001:	subtractor_a_din = {32{1'bX}};
+	default:		subtractor_a_din = {32{1'bX}};
+      endcase
+      // Operand B: z[6], z[7], z[8], z[9] in turn; X in stage 00001 and for any other value.
+      case (fsm_shreg_reduce_stage_stop)
+	5'b10000:	subtractor_b_din = reduce_z_dout[6];
+	5'b01000:	subtractor_b_din = reduce_z_dout[7];
+	5'b00100:	subtractor_b_din = reduce_z_dout[8];
+	5'b00010:	subtractor_b_din = reduce_z_dout[9];
+	5'b00001:	subtractor_b_din = {32{1'bX}};
+	default:		subtractor_b_din = {32{1'bX}};
+      endcase
+      //
+   end
+
+
+   //
+   // Output Mapping: the P port mirrors the bram_sum0 write port (driven by adder 0); p_wren is additionally gated by store_p
+   //
+   assign p_addr	= bram_sum0_wr_addr;
+   assign p_wren	= bram_sum0_wr_wren & store_p;
+   assign p_dout	= bram_sum0_wr_din;
+
+
+endmodule
+
+
+//------------------------------------------------------------------------------
+// End-of-File
+//------------------------------------------------------------------------------
diff --git a/rtl/modular/modular_subtractor.v b/rtl/modular/modular_subtractor.v
deleted file mode 100644
index 322aec4..0000000
--- a/rtl/modular/modular_subtractor.v
+++ /dev/null
@@ -1,292 +0,0 @@
-//------------------------------------------------------------------------------
-//
-// modular_subtractor.v
-// -----------------------------------------------------------------------------
-// Modular subtractor.
-//
-// Authors: Pavel Shatov
-//
-// Copyright (c) 2016, NORDUnet A/S
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// - Redistributions of source code must retain the above copyright notice,
-//   this list of conditions and the following disclaimer.
-//
-// - Redistributions in binary form must reproduce the above copyright notice,
-//   this list of conditions and the following disclaimer in the documentation
-//   and/or other materials provided with the distribution.
-//
-// - Neither the name of the NORDUnet nor the names of its contributors may be
-//   used to endorse or promote products derived from this software without
-//   specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-//------------------------------------------------------------------------------
-
-module modular_subtractor
-	(
-		clk, rst_n,
-		ena, rdy,
-		ab_addr, n_addr, d_addr, d_wren,
-		a_din, b_din, n_din, d_dout
-	);
-
-
-		//
-		// Parameters
-		//
-	parameter	OPERAND_NUM_WORDS		= 8;
-	parameter	WORD_COUNTER_WIDTH	= 3;
-	
-	
-		//
-		// Handy Numbers
-		//
-	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_ZERO	= 0;
-	localparam	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_LAST	= OPERAND_NUM_WORDS - 1;
-	
-	
-		//
-		// Handy Functions
-		//
-	function	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_NEXT_OR_ZERO;
-		input	[WORD_COUNTER_WIDTH-1:0]	WORD_INDEX_CURRENT;
-		begin
-			WORD_INDEX_NEXT_OR_ZERO = (WORD_INDEX_CURRENT < WORD_INDEX_LAST) ?
-				WORD_INDEX_CURRENT + 1'b1 : WORD_INDEX_ZERO;
-		end
-	endfunction
-	
-	
-		//
-		// Ports
-		//
-	input		wire										clk;			// system clock
-	input		wire										rst_n;		// active-low async reset
-	
-	input		wire										ena;			// enable input
-	output	wire										rdy;			// ready output
-	
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	ab_addr;		// index of current A and B words
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	n_addr;		// index of current N word
-	output	wire	[WORD_COUNTER_WIDTH-1:0]	d_addr;		// index of current D word
-	output	wire										d_wren;		// store current D word now
-	
-	input		wire	[                  31:0]	a_din;		// A
-	input		wire	[                  31:0]	b_din;		// B
-	input		wire	[                  31:0]	n_din;		// N
-	output	wire	[                  31:0]	d_dout;		// D = (A - B) mod N
-	
-	
-		//
-		// Word Indices
-		//
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_ab;
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_n;
-	reg	[WORD_COUNTER_WIDTH-1:0]	index_d;
-		
-		/* map registers to output ports */
-	assign ab_addr	= index_ab;
-	assign n_addr	= index_n;
-	assign d_addr	= index_d;
-
-
-		//
-		// Subtractor
-		//
-	wire	[31: 0]	sub32_d;
-	wire				sub32_b_in;
-	wire				sub32_b_out;
-	
-	subtractor32_wrapper subtractor32
-	(
-		.clk		(clk),
-		.a			(a_din),
-		.b			(b_din),
-		.d			(sub32_d),
-		.b_in		(sub32_b_in),
-		.b_out	(sub32_b_out)
-	);
-	
-	
-		//
-		// Adder
-		//
-	wire	[31: 0]	add32_s;
-	wire				add32_c_in;
-	wire				add32_c_out;
-	
-	adder32_wrapper adder32
-	(
-		.clk		(clk),
-		.a			(sub32_d),
-		.b			(n_din),
-		.s			(add32_s),
-		.c_in		(add32_c_in),
-		.c_out	(add32_c_out)
-	);
-	
-	
-		//
-		// FSM
-		//
-		
-	localparam FSM_SHREG_WIDTH = 2*OPERAND_NUM_WORDS + 5;
-	
-	reg	[FSM_SHREG_WIDTH-1:0]	fsm_shreg;
-	
-	assign rdy = fsm_shreg[0];
-	
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 1) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 0)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_n		= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 2) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 1)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_dif_ab	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 3) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_dif_ab_n	= fsm_shreg[FSM_SHREG_WIDTH - (0 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 3)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_store_data_d	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 4) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 3)];
-	wire [OPERAND_NUM_WORDS-1:0] fsm_shreg_inc_index_d		= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 5) : FSM_SHREG_WIDTH - (2 * OPERAND_NUM_WORDS + 4)];
-	
-	wire fsm_latch_msb_borrow	= fsm_shreg[FSM_SHREG_WIDTH - (1 * OPERAND_NUM_WORDS + 2)];
-	
-	wire inc_index_ab		= |fsm_shreg_inc_index_ab;
-	wire inc_index_n		= |fsm_shreg_inc_index_n;
-	wire store_dif_ab		= |fsm_shreg_store_dif_ab;
-	wire store_dif_ab_n	= |fsm_shreg_store_dif_ab_n;
-	wire store_data_d		= |fsm_shreg_store_data_d;
-	wire inc_index_d		= |fsm_shreg_inc_index_d;
-	
-	always @(posedge clk or negedge rst_n)
-		//
-		if (rst_n == 1'b0)
-			//
-			fsm_shreg <= {{FSM_SHREG_WIDTH-1{1'b0}}, 1'b1};
-			//
-		else begin
-			//
-			if (rdy)	fsm_shreg <= {ena, {FSM_SHREG_WIDTH-2{1'b0}}, ~ena};
-			//
-			else		fsm_shreg <= {1'b0, fsm_shreg[FSM_SHREG_WIDTH-1:1]};
-			//
-		end
-		
-	
-		//
-		// Borrow & Carry Masking Logic
-		//
-	reg	sub32_b_mask;
-	reg	add32_c_mask;
-	
-	
-	always @(posedge clk) begin
-		//
-		sub32_b_mask <= (index_ab == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
-		add32_c_mask <= (index_n  == WORD_INDEX_ZERO) ? 1'b1 : 1'b0;
-		//
-	end
-	
-	assign sub32_b_in = sub32_b_out & ~sub32_b_mask;	
-	assign add32_c_in = add32_c_out & ~add32_c_mask;
-	
-	
-	
-		//
-		// Borrow & Carry Latch Logic
-		//
-	reg sub32_borrow_latch;
-	
-	always @(posedge clk) begin
-		//
-		if (fsm_latch_msb_borrow) sub32_borrow_latch <= sub32_b_out;
-		//
-	end
-
-		
-		//
-		// Intermediate Results
-		//
-	reg	[32*OPERAND_NUM_WORDS-1:0]		d_ab;
-	reg	[32*OPERAND_NUM_WORDS-1:0]		d_ab_n;
-	
-	always @(posedge clk)
-		//
-		if (store_data_d) begin
-			//
-			d_ab		<= {{32{1'bX}}, d_ab[32*OPERAND_NUM_WORDS-1:32]};
-			d_ab_n	<= {{32{1'bX}}, d_ab_n[32*OPERAND_NUM_WORDS-1:32]};		
-			//
-		end else begin
-			//
-			if (store_dif_ab) d_ab <= {sub32_d, d_ab[32*OPERAND_NUM_WORDS-1:32]};
-			if (store_dif_ab_n) d_ab_n <= {add32_s, d_ab_n[32*OPERAND_NUM_WORDS-1:32]};
-			//
-		end
-	
-	
-		//
-		// Word Index Increment Logic
-		//
-	always @(posedge clk)
-		//
-		if (rdy) begin
-			//
-			index_ab		<= WORD_INDEX_ZERO;
-			index_n		<= WORD_INDEX_ZERO;
-			index_d		<= WORD_INDEX_ZERO;
-			//
-		end else begin
-			//
-			if (inc_index_ab) index_ab <= WORD_INDEX_NEXT_OR_ZERO(index_ab);
-			if (inc_index_n)	index_n	<= WORD_INDEX_NEXT_OR_ZERO(index_n);
-			if (inc_index_d)	index_d	<= WORD_INDEX_NEXT_OR_ZERO(index_d);
-			//
-		end
-	
-	
-			//
-			// Output Sum Selector
-			//
-	wire	mux_select_ab_n = sub32_borrow_latch;
-			
-	
-			//
-			// Output Data and Write Enable Logic
-			//
-	reg				d_wren_reg;
-	reg	[31: 0]	d_dout_reg;
-	wire	[31: 0]	d_dout_mux = mux_select_ab_n ? d_ab_n[31:0] : d_ab[31:0];
-	
-	assign d_wren = d_wren_reg;
-	assign d_dout = d_dout_reg;
-	
-	always @(posedge clk)
-		//
-		if (rdy) begin
-			//
-			d_wren_reg	<= 1'b0;
-			d_dout_reg	<= {32{1'bX}};
-			//
-		end else begin
-			//
-			d_wren_reg <= store_data_d;
-			d_dout_reg <= store_data_d ? d_dout_mux : {32{1'bX}};
-			//
-		end			
-
-	
-endmodule
-
-
-//------------------------------------------------------------------------------
-// End-of-File
-//------------------------------------------------------------------------------
-- 
cgit v1.2.3