path: root/streebog_hash/streebog_core_lps.v



`timescale 1ns / 1ps

module streebog_core_lps
	(
		clk,
		ena, rdy, last,
		din, dout
	);


		/*
		 * LPS core with a parametrized internal pipeline.
		 *
		 * P (byte permutation) and S (byte substitution through a lookup table) are merged into
		 * a single PS step that walks over the input word by word. L (multiplication by a fixed
		 * matrix) walks over the bits of every word selected by the PS step. One pass keeps rdy
		 * low for PS_PIPELINE_STAGES*L_PIPELINE_STAGES cycles: 2*2=4 cycles in the fastest
		 * configuration, 8*64=512 cycles in the slowest one.
		 *
		 */


		//
		// Parameters
		//
	parameter	PS_PIPELINE_STAGES	=  8;	// 2, 4, 8
	parameter	L_PIPELINE_STAGES		=  8;	// 2, 4, 8, 16, 32, 64


		//
		// Ports
		//
	input		wire				clk;		// core clock
	input		wire				ena;		// start request, only honoured while rdy is high
	output	wire				rdy;		// high while idle (dout holds the last result)
	output	wire				last;		// high during the final busy cycle, rdy rises on the next clock edge
	input		wire	[511:0]	din;		// data to transform; it is read again at every preload,
												// so it presumably has to be held stable while busy (TODO confirm)
	output	wire	[511:0]	dout;		// transformed data


		//
		// Slice Sizes
		//

		/*
		 * The 512-bit input consists of 512/64=8 words of 64 bits each. Every PS stage
		 * handles 8/PS_PIPELINE_STAGES words in parallel (4, 2 or 1) and every L stage
		 * produces 64/L_PIPELINE_STAGES bits of each of those words (32, 16, 8, 4, 2 or 1).
		 *
		 */
	localparam	PS_WORDS_AT_ONCE	=  8 / PS_PIPELINE_STAGES;
	localparam	L_BITS_AT_ONCE		= 64 / L_PIPELINE_STAGES;


		//
		// Counter Width Helpers
		//

		/*
		 * Both functions translate a stage count into the number of bits its counter needs.
		 * An unsupported stage count matches none of the branches, so the return value is
		 * never assigned. This is deliberate: the counter width then stays undefined and the
		 * design is expected to be rejected instead of silently building a broken pipeline.
		 *
		 */

	function	integer	PS_NUM_COUNT_BITS;
		input	integer	x;
		begin
			if			(x == 2)	PS_NUM_COUNT_BITS = 1;
			else if	(x == 4)	PS_NUM_COUNT_BITS = 2;
			else if	(x == 8)	PS_NUM_COUNT_BITS = 3;
		end
	endfunction

	function	integer	L_NUM_COUNT_BITS;
		input	integer	y;
		begin
			if			(y ==  2)	L_NUM_COUNT_BITS = 1;
			else if	(y ==  4)	L_NUM_COUNT_BITS = 2;
			else if	(y ==  8)	L_NUM_COUNT_BITS = 3;
			else if	(y == 16)	L_NUM_COUNT_BITS = 4;
			else if	(y == 32)	L_NUM_COUNT_BITS = 5;
			else if	(y == 64)	L_NUM_COUNT_BITS = 6;
		end
	endfunction

	localparam	PS_CNT_BITS	= PS_NUM_COUNT_BITS(PS_PIPELINE_STAGES);	// width of PS counter
	localparam	L_CNT_BITS	= L_NUM_COUNT_BITS(L_PIPELINE_STAGES);		// width of L counter


		//
		// Input Multiplexor (P Transformation)
		//

		/*
		 * The input is viewed as an 8x8 matrix of bytes, one 64-bit word per row. P is a
		 * transposition of that matrix, so word number `col` of din_mux collects byte `col`
		 * of every input row. This is pure wiring.
		 *
		 */
	wire	[63: 0]	din_mux[0:7];		// eight 64-bit words (columns of the input matrix)

	genvar col, row;
	generate for (col=0; col<8; col=col+1)
		begin: gen_din_mux_i
			for (row=0; row<8; row=row+1) begin: gen_din_mux_j
				assign din_mux[col][8*row +: 8] = din[64*row + 8*col +: 8];
			end
		end
	endgenerate


		//
		// Output Multiplexor
		//

		/*
		 * The result is accumulated in eight 64-bit registers, dout is simply their
		 * concatenation with word 0 in the least significant position.
		 *
		 */
	reg	[63: 0]	dout_mux[0:7];		// eight 64-bit words

	genvar w;
	generate for (w=0; w<8; w=w+1)
		begin: gen_dout_mux
			assign dout[64*w +: 64] = dout_mux[w];
		end
	endgenerate


		//
		// PS and L Counters
		//

		/*
		 * ps_count selects the group of words being processed, l_count selects the group of
		 * bits within those words. l_count is the fast counter, ps_count advances whenever
		 * l_count wraps. Example for 2 PS stages and 4 L stages (the first column is the idle
		 * cycle in which ena is sampled, rdy is low during the remaining eight columns):
		 *
		 * +-----+---+---+---+---+---+---+---+---+---+
		 * | RDY | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
		 * +-----+---+---+---+---+---+---+---+---+---+
		 * |  PS | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 |
		 * +-----+---+---+---+---+---+---+---+---+---+
		 * |   L | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 |
		 * +-----+---+---+---+---+---+---+---+---+---+
		 *         ^               ^               ^
		 *         |               |               +--> last cycle, both counters are back at zero
		 *         |               |
		 *         +---------------+------------------> new word(s) are preloaded into the S table(s)
		 *
		 */
	reg	[PS_CNT_BITS-1:0]	ps_count	= {PS_CNT_BITS{1'b0}};
	reg	[ L_CNT_BITS-1:0]	l_count	= { L_CNT_BITS{1'b0}};


		//
		// Handy Flags
		//

		/*
		 * Short names for the "counter is all ones" and "counter is all zeroes" tests.
		 *
		 */
	wire	ps_count_done	= (ps_count == {PS_CNT_BITS{1'b1}});
	wire	 l_count_done	= ( l_count == { L_CNT_BITS{1'b1}});

	wire	ps_count_zero	= (ps_count == {PS_CNT_BITS{1'b0}});
	wire	 l_count_zero	= ( l_count == { L_CNT_BITS{1'b0}});


		//
		// Preload Flags
		//

		/*
		 * Clock enables of the S lookup tables: the first one fires when a transformation
		 * is accepted, the other one every time the L counter has wrapped to a new word group.
		 *
		 */
	wire	ps_preload_first	= (rdy && ena);
	wire	ps_preload_next	= (!rdy && l_count_zero && !ps_count_zero);


		//
		// Last Flag
		//

		/*
		 * While busy both counters are at zero only in the very last cycle.
		 *
		 */
	assign last = !rdy && l_count_zero && ps_count_zero;


		//
		// Counter Logic
		//

		/*
		 * The L counter makes its first step when a transformation is accepted and then keeps
		 * running until both counters have wrapped back to zero. The PS counter moves on to
		 * the next word(s) each time the L counter reaches its maximum.
		 *
		 */
	always @(posedge clk)
		//
		if (ps_preload_first || (!rdy && (!ps_count_zero || !l_count_zero)))
			l_count <= l_count + 1'b1;

	always @(posedge clk)
		//
		if (!rdy && l_count_done)
			ps_count <= ps_count + 1'b1;


		//
		// Ready Output Register
		//
	reg rdy_reg = 1'b1;
	assign rdy = rdy_reg;

		/*
		 * Ready is cleared when a transformation is accepted and set again after the last cycle.
		 * The two conditions exclude each other, because one needs rdy high and the other rdy low.
		 *
		 */
	always @(posedge clk)
		//
		if (ps_preload_first)	rdy_reg <= 1'b0;	// start of transformation
		else if (last)				rdy_reg <= 1'b1;	// end of transformation


		//
		// S Table Indices
		//

		/*
		 * Word number n of the current group is taken from din_mux[ps_count*PS_WORDS_AT_ONCE + n].
		 *
		 */
	wire	[ 2: 0]	s_in_offset	[0:PS_WORDS_AT_ONCE-1];		// indices of words being transformed
	wire	[63: 0]	s_out			[0:PS_WORDS_AT_ONCE-1];		// output words of S transformation

	assign s_in_offset[0] = ps_count * PS_WORDS_AT_ONCE;

	genvar b;
	generate for (w=1; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_s_in_offset
			assign s_in_offset[w] = ps_count * PS_WORDS_AT_ONCE + w;
		end
	endgenerate


		//
		// S Lookup Table
		//

		/*
		 * One table instance per byte of every word processed in parallel. The table module
		 * is defined elsewhere; it is expected to register its output when ena is high.
		 *
		 */
	generate for (w=0; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_s_out_word
			for (b=0; b<8; b=b+1) begin: gen_s_out_byte
				//
				(* ROM_STYLE="BLOCK" *)
				//
				streebog_rom_s_table s_table
				(
					.clk		(clk),
					.ena		(ps_preload_first | ps_preload_next),
					.din		(din_mux[s_in_offset[w]][8*b +: 8]),
					.dout		(s_out[w][8*b +: 8])
				);
				//
			end
		end
	endgenerate


		//
		// A Matrix Indices
		//

		/*
		 * Bit number n of the current group uses matrix entry l_count*L_BITS_AT_ONCE + n.
		 *
		 */
	wire	[ 5: 0]	l_in_offset	[0:L_BITS_AT_ONCE-1];	// indices of bits being transformed
	wire	[63: 0]	l_out			[0:L_BITS_AT_ONCE-1];	// matrix words fetched for those bits

	assign l_in_offset[0] = l_count * L_BITS_AT_ONCE;

	generate for (b=1; b<L_BITS_AT_ONCE; b=b+1)
		begin: gen_l_in_offset
			assign l_in_offset[b] = l_count * L_BITS_AT_ONCE + b;
		end
	endgenerate


		//
		// A Matrix
		//

		/*
		 * One matrix ROM per bit produced in parallel. The ROM module is defined elsewhere
		 * and is always enabled.
		 *
		 */
	generate for (b=0; b<L_BITS_AT_ONCE; b=b+1)
		begin: gen_l_out
			//
			(* ROM_STYLE="BLOCK" *)
			//
			streebog_rom_a_matrix a_matrix
			(
				.clk		(clk),
				.din		(l_in_offset[b]),
				.dout		(l_out[b])
			);
			//
		end
	endgenerate


		//
		// Multiplication Logic
		//

		/*
		 * The specification describes the multiplication as a sum of matrix rows selected by
		 * the bits of the source vector. Here the product is formed column by column instead:
		 * every output bit is the parity of a substituted word masked by one 64-bit matrix word.
		 *
		 */
	wire	[L_BITS_AT_ONCE-1:0]	l_out_part[0:PS_WORDS_AT_ONCE-1];

	generate for (w=0; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_l_out_part
			for (b=0; b<L_BITS_AT_ONCE; b=b+1) begin: gen_l_out_bit
				//
				assign l_out_part[w][b] = ^(s_out[w] & l_out[b]);
				//
			end
		end
	endgenerate


		//
		// Delayed Counters
		//

		/*
		 * The lookup tables answer one cycle after they were addressed, so the position
		 * to write in the output registers is derived from one-cycle-old counter values.
		 *
		 */
	reg	[PS_CNT_BITS-1:0]	ps_count_dly	= 0;	// delayed PS counter
	reg	[ L_CNT_BITS-1:0]	 l_count_dly	= 0;	// delayed L counter

	always @(posedge clk) begin
		//
		ps_count_dly	<= ps_count;
		 l_count_dly	<=  l_count;
		//
	end


		//
		// Output Offset Tables
		//

		/*
		 * Same layout as the input offset tables, but based on the delayed counters.
		 *
		 */
	wire	[ 2: 0]	dout_offset_word	[0:PS_WORDS_AT_ONCE-1];
	wire	[ 5: 0]	dout_offset_bit	[0:L_BITS_AT_ONCE  -1];

	assign dout_offset_word[0] = ps_count_dly * PS_WORDS_AT_ONCE;
	assign dout_offset_bit[0]  =  l_count_dly * L_BITS_AT_ONCE;

	generate for (w=1; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_dout_offset_word
			assign dout_offset_word[w] = ps_count_dly * PS_WORDS_AT_ONCE + w;
		end
	endgenerate

	generate for (b=1; b<L_BITS_AT_ONCE; b=b+1)
		begin: gen_dout_offset_bit
			assign dout_offset_bit[b] = l_count_dly * L_BITS_AT_ONCE + b;
		end
	endgenerate


		//
		// Output Logic
		//

		/*
		 * During every busy cycle the freshly computed bits are stored at the word and bit
		 * positions given by the delayed counters. The output registers hold their value
		 * while the core is idle.
		 *
		 */
	integer wi, bi;

	always @(posedge clk) begin
		//
		if (!rdy) begin
			//
			for (wi=0; wi<PS_WORDS_AT_ONCE; wi=wi+1) begin
				for (bi=0; bi<L_BITS_AT_ONCE; bi=bi+1) begin
					dout_mux[dout_offset_word[wi]][dout_offset_bit[bi]] <= l_out_part[wi][bi];
				end
			end
			//
		end
		//
	end


endmodule
`timescale 1ns / 1ps

module streebog_core_lps
	(
		clk,
		ena, rdy, last,
		din, dout
	);


		/*
		 * LPS core with a parametrized internal pipeline.
		 *
		 * P (byte permutation) and S (byte substitution through a lookup table) are merged into
		 * a single PS step that walks over the input word by word. L (multiplication by a fixed
		 * matrix) walks over the bits of every word selected by the PS step. One pass keeps rdy
		 * low for PS_PIPELINE_STAGES*L_PIPELINE_STAGES cycles: 2*2=4 cycles in the fastest
		 * configuration, 8*64=512 cycles in the slowest one.
		 *
		 */


		//
		// Parameters
		//
	parameter	PS_PIPELINE_STAGES	=  8;	// 2, 4, 8
	parameter	L_PIPELINE_STAGES		=  8;	// 2, 4, 8, 16, 32, 64


		//
		// Ports
		//
	input		wire				clk;		// core clock
	input		wire				ena;		// start request, only honoured while rdy is high
	output	wire				rdy;		// high while idle (dout holds the last result)
	output	wire				last;		// high during the final busy cycle, rdy rises on the next clock edge
	input		wire	[511:0]	din;		// data to transform; it is read again at every preload,
												// so it presumably has to be held stable while busy (TODO confirm)
	output	wire	[511:0]	dout;		// transformed data


		//
		// Slice Sizes
		//

		/*
		 * The 512-bit input consists of 512/64=8 words of 64 bits each. Every PS stage
		 * handles 8/PS_PIPELINE_STAGES words in parallel (4, 2 or 1) and every L stage
		 * produces 64/L_PIPELINE_STAGES bits of each of those words (32, 16, 8, 4, 2 or 1).
		 *
		 */
	localparam	PS_WORDS_AT_ONCE	=  8 / PS_PIPELINE_STAGES;
	localparam	L_BITS_AT_ONCE		= 64 / L_PIPELINE_STAGES;


		//
		// Counter Width Helpers
		//

		/*
		 * Both functions translate a stage count into the number of bits its counter needs.
		 * An unsupported stage count matches none of the branches, so the return value is
		 * never assigned. This is deliberate: the counter width then stays undefined and the
		 * design is expected to be rejected instead of silently building a broken pipeline.
		 *
		 */

	function	integer	PS_NUM_COUNT_BITS;
		input	integer	x;
		begin
			if			(x == 2)	PS_NUM_COUNT_BITS = 1;
			else if	(x == 4)	PS_NUM_COUNT_BITS = 2;
			else if	(x == 8)	PS_NUM_COUNT_BITS = 3;
		end
	endfunction

	function	integer	L_NUM_COUNT_BITS;
		input	integer	y;
		begin
			if			(y ==  2)	L_NUM_COUNT_BITS = 1;
			else if	(y ==  4)	L_NUM_COUNT_BITS = 2;
			else if	(y ==  8)	L_NUM_COUNT_BITS = 3;
			else if	(y == 16)	L_NUM_COUNT_BITS = 4;
			else if	(y == 32)	L_NUM_COUNT_BITS = 5;
			else if	(y == 64)	L_NUM_COUNT_BITS = 6;
		end
	endfunction

	localparam	PS_CNT_BITS	= PS_NUM_COUNT_BITS(PS_PIPELINE_STAGES);	// width of PS counter
	localparam	L_CNT_BITS	= L_NUM_COUNT_BITS(L_PIPELINE_STAGES);		// width of L counter


		//
		// Input Multiplexor (P Transformation)
		//

		/*
		 * The input is viewed as an 8x8 matrix of bytes, one 64-bit word per row. P is a
		 * transposition of that matrix, so word number `col` of din_mux collects byte `col`
		 * of every input row. This is pure wiring.
		 *
		 */
	wire	[63: 0]	din_mux[0:7];		// eight 64-bit words (columns of the input matrix)

	genvar col, row;
	generate for (col=0; col<8; col=col+1)
		begin: gen_din_mux_i
			for (row=0; row<8; row=row+1) begin: gen_din_mux_j
				assign din_mux[col][8*row +: 8] = din[64*row + 8*col +: 8];
			end
		end
	endgenerate


		//
		// Output Multiplexor
		//

		/*
		 * The result is accumulated in eight 64-bit registers, dout is simply their
		 * concatenation with word 0 in the least significant position.
		 *
		 */
	reg	[63: 0]	dout_mux[0:7];		// eight 64-bit words

	genvar w;
	generate for (w=0; w<8; w=w+1)
		begin: gen_dout_mux
			assign dout[64*w +: 64] = dout_mux[w];
		end
	endgenerate


		//
		// PS and L Counters
		//

		/*
		 * ps_count selects the group of words being processed, l_count selects the group of
		 * bits within those words. l_count is the fast counter, ps_count advances whenever
		 * l_count wraps. Example for 2 PS stages and 4 L stages (the first column is the idle
		 * cycle in which ena is sampled, rdy is low during the remaining eight columns):
		 *
		 * +-----+---+---+---+---+---+---+---+---+---+
		 * | RDY | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
		 * +-----+---+---+---+---+---+---+---+---+---+
		 * |  PS | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 |
		 * +-----+---+---+---+---+---+---+---+---+---+
		 * |   L | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 |
		 * +-----+---+---+---+---+---+---+---+---+---+
		 *         ^               ^               ^
		 *         |               |               +--> last cycle, both counters are back at zero
		 *         |               |
		 *         +---------------+------------------> new word(s) are preloaded into the S table(s)
		 *
		 */
	reg	[PS_CNT_BITS-1:0]	ps_count	= {PS_CNT_BITS{1'b0}};
	reg	[ L_CNT_BITS-1:0]	l_count	= { L_CNT_BITS{1'b0}};


		//
		// Handy Flags
		//

		/*
		 * Short names for the "counter is all ones" and "counter is all zeroes" tests.
		 *
		 */
	wire	ps_count_done	= (ps_count == {PS_CNT_BITS{1'b1}});
	wire	 l_count_done	= ( l_count == { L_CNT_BITS{1'b1}});

	wire	ps_count_zero	= (ps_count == {PS_CNT_BITS{1'b0}});
	wire	 l_count_zero	= ( l_count == { L_CNT_BITS{1'b0}});


		//
		// Preload Flags
		//

		/*
		 * Clock enables of the S lookup tables: the first one fires when a transformation
		 * is accepted, the other one every time the L counter has wrapped to a new word group.
		 *
		 */
	wire	ps_preload_first	= (rdy && ena);
	wire	ps_preload_next	= (!rdy && l_count_zero && !ps_count_zero);


		//
		// Last Flag
		//

		/*
		 * While busy both counters are at zero only in the very last cycle.
		 *
		 */
	assign last = !rdy && l_count_zero && ps_count_zero;


		//
		// Counter Logic
		//

		/*
		 * The L counter makes its first step when a transformation is accepted and then keeps
		 * running until both counters have wrapped back to zero. The PS counter moves on to
		 * the next word(s) each time the L counter reaches its maximum.
		 *
		 */
	always @(posedge clk)
		//
		if (ps_preload_first || (!rdy && (!ps_count_zero || !l_count_zero)))
			l_count <= l_count + 1'b1;

	always @(posedge clk)
		//
		if (!rdy && l_count_done)
			ps_count <= ps_count + 1'b1;


		//
		// Ready Output Register
		//
	reg rdy_reg = 1'b1;
	assign rdy = rdy_reg;

		/*
		 * Ready is cleared when a transformation is accepted and set again after the last cycle.
		 * The two conditions exclude each other, because one needs rdy high and the other rdy low.
		 *
		 */
	always @(posedge clk)
		//
		if (ps_preload_first)	rdy_reg <= 1'b0;	// start of transformation
		else if (last)				rdy_reg <= 1'b1;	// end of transformation


		//
		// S Table Indices
		//

		/*
		 * Word number n of the current group is taken from din_mux[ps_count*PS_WORDS_AT_ONCE + n].
		 *
		 */
	wire	[ 2: 0]	s_in_offset	[0:PS_WORDS_AT_ONCE-1];		// indices of words being transformed
	wire	[63: 0]	s_out			[0:PS_WORDS_AT_ONCE-1];		// output words of S transformation

	assign s_in_offset[0] = ps_count * PS_WORDS_AT_ONCE;

	genvar b;
	generate for (w=1; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_s_in_offset
			assign s_in_offset[w] = ps_count * PS_WORDS_AT_ONCE + w;
		end
	endgenerate


		//
		// S Lookup Table
		//

		/*
		 * One table instance per byte of every word processed in parallel. The table module
		 * is defined elsewhere; it is expected to register its output when ena is high.
		 *
		 */
	generate for (w=0; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_s_out_word
			for (b=0; b<8; b=b+1) begin: gen_s_out_byte
				//
				(* ROM_STYLE="BLOCK" *)
				//
				streebog_rom_s_table s_table
				(
					.clk		(clk),
					.ena		(ps_preload_first | ps_preload_next),
					.din		(din_mux[s_in_offset[w]][8*b +: 8]),
					.dout		(s_out[w][8*b +: 8])
				);
				//
			end
		end
	endgenerate


		//
		// A Matrix Indices
		//

		/*
		 * Bit number n of the current group uses matrix entry l_count*L_BITS_AT_ONCE + n.
		 *
		 */
	wire	[ 5: 0]	l_in_offset	[0:L_BITS_AT_ONCE-1];	// indices of bits being transformed
	wire	[63: 0]	l_out			[0:L_BITS_AT_ONCE-1];	// matrix words fetched for those bits

	assign l_in_offset[0] = l_count * L_BITS_AT_ONCE;

	generate for (b=1; b<L_BITS_AT_ONCE; b=b+1)
		begin: gen_l_in_offset
			assign l_in_offset[b] = l_count * L_BITS_AT_ONCE + b;
		end
	endgenerate


		//
		// A Matrix
		//

		/*
		 * One matrix ROM per bit produced in parallel. The ROM module is defined elsewhere
		 * and is always enabled.
		 *
		 */
	generate for (b=0; b<L_BITS_AT_ONCE; b=b+1)
		begin: gen_l_out
			//
			(* ROM_STYLE="BLOCK" *)
			//
			streebog_rom_a_matrix a_matrix
			(
				.clk		(clk),
				.din		(l_in_offset[b]),
				.dout		(l_out[b])
			);
			//
		end
	endgenerate


		//
		// Multiplication Logic
		//

		/*
		 * The specification describes the multiplication as a sum of matrix rows selected by
		 * the bits of the source vector. Here the product is formed column by column instead:
		 * every output bit is the parity of a substituted word masked by one 64-bit matrix word.
		 *
		 */
	wire	[L_BITS_AT_ONCE-1:0]	l_out_part[0:PS_WORDS_AT_ONCE-1];

	generate for (w=0; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_l_out_part
			for (b=0; b<L_BITS_AT_ONCE; b=b+1) begin: gen_l_out_bit
				//
				assign l_out_part[w][b] = ^(s_out[w] & l_out[b]);
				//
			end
		end
	endgenerate


		//
		// Delayed Counters
		//

		/*
		 * The lookup tables answer one cycle after they were addressed, so the position
		 * to write in the output registers is derived from one-cycle-old counter values.
		 *
		 */
	reg	[PS_CNT_BITS-1:0]	ps_count_dly	= 0;	// delayed PS counter
	reg	[ L_CNT_BITS-1:0]	 l_count_dly	= 0;	// delayed L counter

	always @(posedge clk) begin
		//
		ps_count_dly	<= ps_count;
		 l_count_dly	<=  l_count;
		//
	end


		//
		// Output Offset Tables
		//

		/*
		 * Same layout as the input offset tables, but based on the delayed counters.
		 *
		 */
	wire	[ 2: 0]	dout_offset_word	[0:PS_WORDS_AT_ONCE-1];
	wire	[ 5: 0]	dout_offset_bit	[0:L_BITS_AT_ONCE  -1];

	assign dout_offset_word[0] = ps_count_dly * PS_WORDS_AT_ONCE;
	assign dout_offset_bit[0]  =  l_count_dly * L_BITS_AT_ONCE;

	generate for (w=1; w<PS_WORDS_AT_ONCE; w=w+1)
		begin: gen_dout_offset_word
			assign dout_offset_word[w] = ps_count_dly * PS_WORDS_AT_ONCE + w;
		end
	endgenerate

	generate for (b=1; b<L_BITS_AT_ONCE; b=b+1)
		begin: gen_dout_offset_bit
			assign dout_offset_bit[b] = l_count_dly * L_BITS_AT_ONCE + b;
		end
	endgenerate


		//
		// Output Logic
		//

		/*
		 * During every busy cycle the freshly computed bits are stored at the word and bit
		 * positions given by the delayed counters. The output registers hold their value
		 * while the core is idle.
		 *
		 */
	integer wi, bi;

	always @(posedge clk) begin
		//
		if (!rdy) begin
			//
			for (wi=0; wi<PS_WORDS_AT_ONCE; wi=wi+1) begin
				for (bi=0; bi<L_BITS_AT_ONCE; bi=bi+1) begin
					dout_mux[dout_offset_word[wi]][dout_offset_bit[bi]] <= l_out_part[wi][bi];
				end
			end
			//
		end
		//
	end


endmodule