From 83f8779a661202183f5866a4e80ef36f24b9e1ea Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Thu, 16 Jan 2020 14:45:26 +0300 Subject: Had to rework the general worker module to reach 180 MHz core clock. The module is responsible for doing certain supporting operations (mostly moving operands between banks and doing some simple math operations, such as modular subtraction and regular addition). Depending on the particular operation, one of three bank address space sweep patterns was used: * one-pass (for things like carry propagation) * two-pass (for things like modular subtraction that produce intermediate values in the process) * one-pass interleaved (for copying when only either CRT_?.X or CRT_?.Y is rewritten: we can only write to X and Y simultaneously, so we have to interleave reads from the source bank with reads from the destination bank and overwrite the destination with its just-read value, otherwise the second destination operand is lost) I initially coded three FSMs, one for each of the address space sweeps and triggered one of them depending on the opcode, but that turned out to be too complicated. There's now only one FSM that always does the "one-pass interleaved" pattern, whereas the second read (from the destination bank) is inhibited when not needed by the opcode. 
--- rtl/modexpng_general_worker.v | 1898 ++++++++++++++++++----------------------- 1 file changed, 839 insertions(+), 1059 deletions(-) (limited to 'rtl') diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v index eadd284..0620bd6 100644 --- a/rtl/modexpng_general_worker.v +++ b/rtl/modexpng_general_worker.v @@ -127,67 +127,46 @@ module modexpng_general_worker // // FSM Declaration // - localparam [5:0] WRK_FSM_STATE_IDLE = 6'h00; - - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1 = 6'h01; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2 = 6'h02; - localparam [5:0] WRK_FSM_STATE_BUSY = 6'h03; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST1 = 6'h05; // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug! - localparam [5:0] WRK_FSM_STATE_LATENCY_POST2 = 6'h06; - - localparam [5:0] WRK_FSM_STATE_STOP = 6'h07; - - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M1 = 6'h10; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M2 = 6'h11; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M1 = 6'h12; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M2 = 6'h13; - localparam [5:0] WRK_FSM_STATE_BUSY_M1 = 6'h14; - localparam [5:0] WRK_FSM_STATE_BUSY_M2 = 6'h15; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 6'h16; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 6'h17; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 6'h18; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 6'h19; - - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_TP = 6'h20; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_TP = 6'h21; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE3_TP = 6'h22; - localparam [5:0] WRK_FSM_STATE_LATENCY_PRE4_TP = 6'h23; - localparam [5:0] WRK_FSM_STATE_BUSY_TP = 6'h24; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_TP = 6'h25; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_TP = 6'h26; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST3_TP = 6'h27; - localparam [5:0] WRK_FSM_STATE_LATENCY_POST4_TP = 6'h28; - localparam [5:0] WRK_FSM_STATE_HOLDOFF_TP = 6'h29; - - 
reg [5:0] wrk_fsm_state = WRK_FSM_STATE_IDLE; - reg [5:0] wrk_fsm_state_next_one_pass; // single address space sweep - reg [5:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y) - reg [5:0] wrk_fsm_state_next_two_pass; // two address space sweeps - reg wrk_fsm_two_pass_pass; // 0=first pass, 1=second pass - reg wrk_fsm_two_pass_pass_dly; // 0=first pass, 1=second pass - - - // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps) + + localparam [3:0] WRK_FSM_STATE_IDLE = 4'h0; + + localparam [3:0] WRK_FSM_STATE_LATENCY_PRE1 = 4'h1; + localparam [3:0] WRK_FSM_STATE_LATENCY_PRE2 = 4'h2; + localparam [3:0] WRK_FSM_STATE_LATENCY_PRE3 = 4'h3; + localparam [3:0] WRK_FSM_STATE_LATENCY_PRE4 = 4'h4; + + localparam [3:0] WRK_FSM_STATE_BUSY1 = 4'hA; + localparam [3:0] WRK_FSM_STATE_BUSY2 = 4'hB; + + localparam [3:0] WRK_FSM_STATE_LATENCY_POST1 = 4'h5; + localparam [3:0] WRK_FSM_STATE_LATENCY_POST2 = 4'h6; + localparam [3:0] WRK_FSM_STATE_LATENCY_POST3 = 4'h7; + localparam [3:0] WRK_FSM_STATE_LATENCY_POST4 = 4'h8; + localparam [3:0] WRK_FSM_STATE_STOP = 4'hF; + + reg [3:0] wrk_fsm_state = WRK_FSM_STATE_IDLE; + reg [3:0] wrk_fsm_state_next; + // // Control Signals // - reg rd_wide_xy_ena_x = 1'b0; - reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_x; - reg [ OP_ADDR_W -1:0] rd_wide_xy_addr_x; + reg rd_wide_ena_x = 1'b0; + reg [BANK_ADDR_W -1:0] rd_wide_bank_x; + reg [ OP_ADDR_W -1:0] rd_wide_addr_x; - reg rd_narrow_xy_ena_x = 1'b0; - reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_x; - reg [ OP_ADDR_W -1:0] rd_narrow_xy_addr_x; + reg rd_narrow_ena_x = 1'b0; + reg [BANK_ADDR_W -1:0] rd_narrow_bank_x; + reg [ OP_ADDR_W -1:0] rd_narrow_addr_x; - reg rd_wide_xy_ena_y = 1'b0; - reg [BANK_ADDR_W -1:0] rd_wide_xy_bank_y; - reg [ OP_ADDR_W -1:0] rd_wide_xy_addr_y; + reg rd_wide_ena_y = 1'b0; + reg [BANK_ADDR_W -1:0] rd_wide_bank_y; + reg [ OP_ADDR_W 
-1:0] rd_wide_addr_y; - reg rd_narrow_xy_ena_y = 1'b0; - reg [BANK_ADDR_W -1:0] rd_narrow_xy_bank_y; - reg [ OP_ADDR_W -1:0] rd_narrow_xy_addr_y; + reg rd_narrow_ena_y = 1'b0; + reg [BANK_ADDR_W -1:0] rd_narrow_bank_y; + reg [ OP_ADDR_W -1:0] rd_narrow_addr_y; reg wr_wide_xy_ena_x = 1'b0; reg [BANK_ADDR_W -1:0] wr_wide_xy_bank_x; @@ -217,21 +196,21 @@ module modexpng_general_worker // // Mapping // - assign wrk_rd_wide_xy_ena_x = rd_wide_xy_ena_x; - assign wrk_rd_wide_xy_bank_x = rd_wide_xy_bank_x; - assign wrk_rd_wide_xy_addr_x = rd_wide_xy_addr_x; + assign wrk_rd_wide_xy_ena_x = rd_wide_ena_x; + assign wrk_rd_wide_xy_bank_x = rd_wide_bank_x; + assign wrk_rd_wide_xy_addr_x = rd_wide_addr_x; - assign wrk_rd_narrow_xy_ena_x = rd_narrow_xy_ena_x; - assign wrk_rd_narrow_xy_bank_x = rd_narrow_xy_bank_x; - assign wrk_rd_narrow_xy_addr_x = rd_narrow_xy_addr_x; + assign wrk_rd_narrow_xy_ena_x = rd_narrow_ena_x; + assign wrk_rd_narrow_xy_bank_x = rd_narrow_bank_x; + assign wrk_rd_narrow_xy_addr_x = rd_narrow_addr_x; - assign wrk_rd_wide_xy_ena_y = rd_wide_xy_ena_y; - assign wrk_rd_wide_xy_bank_y = rd_wide_xy_bank_y; - assign wrk_rd_wide_xy_addr_y = rd_wide_xy_addr_y; + assign wrk_rd_wide_xy_ena_y = rd_wide_ena_y; + assign wrk_rd_wide_xy_bank_y = rd_wide_bank_y; + assign wrk_rd_wide_xy_addr_y = rd_wide_addr_y; - assign wrk_rd_narrow_xy_ena_y = rd_narrow_xy_ena_y; - assign wrk_rd_narrow_xy_bank_y = rd_narrow_xy_bank_y; - assign wrk_rd_narrow_xy_addr_y = rd_narrow_xy_addr_y; + assign wrk_rd_narrow_xy_ena_y = rd_narrow_ena_y; + assign wrk_rd_narrow_xy_bank_y = rd_narrow_bank_y; + assign wrk_rd_narrow_xy_addr_y = rd_narrow_addr_y; assign wrk_wr_wide_xy_ena_x = wr_wide_xy_ena_x; assign wrk_wr_wide_xy_bank_x = wr_wide_xy_bank_x; @@ -260,172 +239,111 @@ module modexpng_general_worker // // Delays - // - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly1; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly2; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly3; - reg [OP_ADDR_W -1:0] 
rd_wide_xy_addr_x_dly4; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly1; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly2; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly3; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly4; - - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly1; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly2; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly3; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly4; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly1; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly2; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly3; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly4; - - reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly1; - reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly2; - reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly3; - //reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly4; - - reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly1; - reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly2; - reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly3; - //reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly4; - - reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly1; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly2; - - reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly1; - reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly2; + // + reg [OP_ADDR_W -1:0] rd_narrow_addr_x_dly[0:3]; + reg [OP_ADDR_W -1:0] rd_narrow_addr_y_dly[0:3]; + + reg [OP_ADDR_W -1:0] rd_wide_addr_x_dly[0:3]; + reg [OP_ADDR_W -1:0] rd_wide_addr_y_dly[0:3]; + + reg [WORD_EXT_W -1:0] rd_wide_x_din_x_dly1; + reg [WORD_EXT_W -1:0] rd_wide_y_din_x_dly1; + reg [WORD_EXT_W -1:0] rd_wide_x_din_y_dly1; + reg [WORD_EXT_W -1:0] rd_wide_y_din_y_dly1; + reg [WORD_EXT_W -1:0] rd_narrow_x_din_x_dly1; + reg [WORD_EXT_W -1:0] 
rd_narrow_y_din_x_dly1; + reg [WORD_EXT_W -1:0] rd_narrow_x_din_y_dly1; + reg [WORD_EXT_W -1:0] rd_narrow_y_din_y_dly1; always @(posedge clk) begin // - {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x}; - {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y}; + {rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x}; + {rd_wide_y_din_x_dly1} <= {wrk_rd_wide_y_din_x}; + {rd_wide_x_din_y_dly1} <= {wrk_rd_wide_x_din_y}; + {rd_wide_y_din_y_dly1} <= {wrk_rd_wide_y_din_y}; // - {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x}; - {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y}; + {rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x}; + {rd_narrow_y_din_x_dly1} <= {wrk_rd_narrow_y_din_x}; + {rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y}; + {rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y}; // - {/*wrk_rd_wide_x_din_x_dly4,*/ wrk_rd_wide_x_din_x_dly3, wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1} <= {/*wrk_rd_wide_x_din_x_dly3,*/ wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1, wrk_rd_wide_x_din_x}; - {/*wrk_rd_wide_x_din_y_dly4,*/ wrk_rd_wide_x_din_y_dly3, wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1} <= {/*wrk_rd_wide_x_din_y_dly3,*/ wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1, wrk_rd_wide_x_din_y}; + {rd_narrow_addr_x_dly[3], rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], rd_narrow_addr_x_dly[0]} <= {rd_narrow_addr_x_dly[2], rd_narrow_addr_x_dly[1], 
rd_narrow_addr_x_dly[0], rd_narrow_addr_x}; + {rd_narrow_addr_y_dly[3], rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0]} <= {rd_narrow_addr_y_dly[2], rd_narrow_addr_y_dly[1], rd_narrow_addr_y_dly[0], rd_narrow_addr_y}; // - {wrk_rd_narrow_x_din_x_dly3, wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1, wrk_rd_narrow_x_din_x}; - {wrk_rd_narrow_y_din_x_dly2, wrk_rd_narrow_y_din_x_dly1} <= {wrk_rd_narrow_y_din_x_dly1, wrk_rd_narrow_y_din_x}; - {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y}; - {wrk_rd_narrow_y_din_y_dly2, wrk_rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y_dly1, wrk_rd_narrow_y_din_y}; + {rd_wide_addr_x_dly[3], rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0]} <= {rd_wide_addr_x_dly[2], rd_wide_addr_x_dly[1], rd_wide_addr_x_dly[0], rd_wide_addr_x}; + {rd_wide_addr_y_dly[3], rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0]} <= {rd_wide_addr_y_dly[2], rd_wide_addr_y_dly[1], rd_wide_addr_y_dly[0], rd_wide_addr_y}; // end - - + + // // Source Read Enable Logic // + task _update_wide_rd_en; input _en; {rd_wide_ena_x, rd_wide_ena_y } <= {2{_en}}; endtask + task _update_narrow_rd_en; input _en; {rd_narrow_ena_x, rd_narrow_ena_y} <= {2{_en}}; endtask - task _update_wide_xy_rd_en; input _en; {rd_wide_xy_ena_x, rd_wide_xy_ena_y } <= {2{_en}}; endtask - task _update_narrow_xy_rd_en; input _en; {rd_narrow_xy_ena_x, rd_narrow_xy_ena_y} <= {2{_en}}; endtask + task enable_wide_rd_en; _update_wide_rd_en(1'b1); endtask + task disable_wide_rd_en; _update_wide_rd_en(1'b0); endtask - task enable_wide_xy_rd_en; _update_wide_xy_rd_en(1'b1); endtask - task disable_wide_xy_rd_en; _update_wide_xy_rd_en(1'b0); endtask - - task enable_narrow_xy_rd_en; _update_narrow_xy_rd_en(1'b1); endtask - task disable_narrow_xy_rd_en; 
_update_narrow_xy_rd_en(1'b0); endtask + task enable_narrow_rd_en; _update_narrow_rd_en(1'b1); endtask + task disable_narrow_rd_en; _update_narrow_rd_en(1'b0); endtask always @(posedge clk or negedge rst_n) // if (!rst_n) begin // - disable_wide_xy_rd_en; - disable_narrow_xy_rd_en; + disable_wide_rd_en; + disable_narrow_rd_en; // end else begin // - disable_wide_xy_rd_en; - disable_narrow_xy_rd_en; - // - // one_pass + disable_wide_rd_en; + disable_narrow_rd_en; // - case (wrk_fsm_state_next_one_pass) + case (opcode) // - WRK_FSM_STATE_LATENCY_PRE1, - WRK_FSM_STATE_LATENCY_PRE2, - WRK_FSM_STATE_BUSY: + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_OUTPUT_FROM_NARROW, + UOP_OPCODE_MODULAR_REDUCE_INIT, + UOP_OPCODE_MODULAR_SUBTRACT_X: // - case (opcode) - // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_MODULAR_REDUCE_INIT: - // - enable_narrow_xy_rd_en; - // - UOP_OPCODE_COPY_CRT_Y2X: begin - // - enable_wide_xy_rd_en; - enable_narrow_xy_rd_en; - // - end - // - UOP_OPCODE_MERGE_LH: - // - enable_wide_xy_rd_en; - // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1, + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: enable_narrow_rd_en; endcase // - endcase - // - // one_pass_meander - // - case (wrk_fsm_state_next_one_pass_meander) - // - WRK_FSM_STATE_LATENCY_PRE1_M1, - WRK_FSM_STATE_LATENCY_PRE1_M2, - WRK_FSM_STATE_LATENCY_PRE2_M1, - WRK_FSM_STATE_LATENCY_PRE2_M2, - WRK_FSM_STATE_BUSY_M1, - WRK_FSM_STATE_BUSY_M2: + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_MODULAR_SUBTRACT_Y, + UOP_OPCODE_MODULAR_SUBTRACT_Z, + UOP_OPCODE_REGULAR_ADD_UNEVEN: // - case (opcode) - // - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - // - enable_wide_xy_rd_en; - enable_narrow_xy_rd_en; - // - end - // - UOP_OPCODE_REGULAR_ADD_UNEVEN: - // - enable_narrow_xy_rd_en; - // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1, + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: begin enable_wide_rd_en; enable_narrow_rd_en; end 
endcase // - endcase - // - // two_pass - // - case (wrk_fsm_state_next_two_pass) - // - WRK_FSM_STATE_LATENCY_PRE1_TP, - WRK_FSM_STATE_LATENCY_PRE2_TP, - WRK_FSM_STATE_LATENCY_PRE3_TP, - WRK_FSM_STATE_LATENCY_PRE4_TP, - WRK_FSM_STATE_BUSY_TP: + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: // - case (opcode) - UOP_OPCODE_MODULAR_SUBTRACT: - // - if (!wrk_fsm_two_pass_pass) begin - enable_wide_xy_rd_en; - enable_narrow_xy_rd_en; - end else - enable_narrow_xy_rd_en; - // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1, + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_BUSY2: begin enable_wide_rd_en; enable_narrow_rd_en; end endcase // + UOP_OPCODE_MERGE_LH: + // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1, + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: enable_wide_rd_en; + endcase + // endcase // end @@ -435,490 +353,330 @@ module modexpng_general_worker // Destination Write Enable Logic // - task _update_wide_xy_wr_en; input _en; {wr_wide_xy_ena_x, wr_wide_xy_ena_y } <= {2{_en}}; endtask - task _update_narrow_xy_wr_en; input _en; {wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{_en}}; endtask + task _update_wide_wr_en; input _en; {wr_wide_xy_ena_x, wr_wide_xy_ena_y } <= {2{_en}}; endtask + task _update_narrow_wr_en; input _en; {wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{_en}}; endtask - task enable_wide_xy_wr_en; _update_wide_xy_wr_en(1'b1); endtask - task disable_wide_xy_wr_en; _update_wide_xy_wr_en(1'b0); endtask + task enable_wide_wr_en; _update_wide_wr_en(1'b1); endtask + task disable_wide_wr_en; _update_wide_wr_en(1'b0); endtask - task enable_narrow_xy_wr_en; _update_narrow_xy_wr_en(1'b1); endtask - task disable_narrow_xy_wr_en; _update_narrow_xy_wr_en(1'b0); endtask + task enable_narrow_wr_en; _update_narrow_wr_en(1'b1); endtask + task disable_narrow_wr_en; _update_narrow_wr_en(1'b0); endtask always @(posedge clk or negedge rst_n) // if 
(!rst_n) begin // - disable_wide_xy_wr_en; - disable_narrow_xy_wr_en; + disable_wide_wr_en; + disable_narrow_wr_en; // end else begin // - disable_wide_xy_wr_en; - disable_narrow_xy_wr_en; + disable_wide_wr_en; + disable_narrow_wr_en; // - // one_pass - // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_MODULAR_SUBTRACT_X, + UOP_OPCODE_MERGE_LH, + UOP_OPCODE_REGULAR_ADD_UNEVEN: // - case (opcode) - // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_MERGE_LH: - // - enable_narrow_xy_wr_en; - // - UOP_OPCODE_COPY_CRT_Y2X: begin - // - enable_wide_xy_wr_en; - enable_narrow_xy_wr_en; - // - end - // - UOP_OPCODE_MODULAR_REDUCE_INIT: - // - enable_wide_xy_wr_en; - // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: enable_narrow_wr_en; endcase // - endcase - // - // one_pass_meander - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2, - WRK_FSM_STATE_LATENCY_POST2_M2: + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y, + UOP_OPCODE_MODULAR_SUBTRACT_Z: // - case (opcode) - // - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - // - enable_wide_xy_wr_en; - enable_narrow_xy_wr_en; - // - end - // - UOP_OPCODE_REGULAR_ADD_UNEVEN: - // - enable_narrow_xy_wr_en; - // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: begin enable_wide_wr_en; enable_narrow_wr_en; end endcase // - endcase - // - // two_pass - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY_TP, - WRK_FSM_STATE_LATENCY_POST1_TP, - WRK_FSM_STATE_LATENCY_POST2_TP, - WRK_FSM_STATE_LATENCY_POST3_TP, - WRK_FSM_STATE_LATENCY_POST4_TP: + UOP_OPCODE_MODULAR_REDUCE_INIT, + UOP_OPCODE_MODULAR_SUBTRACT_Y: // - case (opcode) - // - UOP_OPCODE_MODULAR_SUBTRACT: - // - if (!wrk_fsm_two_pass_pass) 
- enable_narrow_xy_wr_en; - else begin - enable_wide_xy_wr_en; - enable_narrow_xy_wr_en; - end - // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: enable_wide_wr_en; endcase - // + // endcase // end - + // - // Source to Destination Data Logic + // Source Read Address Logic // + reg [OP_ADDR_W -1:0] rd_wide_addr_next; + reg [OP_ADDR_W -1:0] rd_narrow_addr_next; + + reg rd_wide_addr_is_last = 1'b0; + reg rd_narrow_addr_is_last = 1'b0; + + reg rd_wide_addr_is_last_half = 1'b0; + reg rd_narrow_addr_is_last_half = 1'b0; + + reg rd_wide_addr_next_is_last = 1'b0; + reg rd_narrow_addr_next_is_last = 1'b0; + reg rd_wide_addr_next_is_last_half = 1'b0; + reg rd_narrow_addr_next_is_last_half = 1'b0; + + reg [3:0] rd_wide_addr_is_last_half_dly = 4'h0; + reg [3:0] rd_narrow_addr_is_last_half_dly = 4'h0; + always @(posedge clk) begin // - update_wide_dout (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC); - update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC); - // - // one_pass - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: - // - case (opcode) - // - UOP_OPCODE_PROPAGATE_CARRIES: - // - update_narrow_dout(rd_narrow_x_din_x_w_cry_reduced, - rd_narrow_y_din_x_w_cry_reduced, - rd_narrow_x_din_y_w_cry_reduced, - rd_narrow_y_din_y_w_cry_reduced); - // - UOP_OPCODE_COPY_CRT_Y2X: begin - // - update_wide_dout(wrk_rd_wide_x_din_y, - wrk_rd_wide_y_din_y, - wrk_rd_wide_x_din_y, - wrk_rd_wide_y_din_y); - // - update_narrow_dout(wrk_rd_narrow_x_din_y, - wrk_rd_narrow_y_din_y, - wrk_rd_narrow_x_din_y, - wrk_rd_narrow_y_din_y); - // - end - // - UOP_OPCODE_MODULAR_REDUCE_INIT: - // - update_wide_dout(wrk_rd_narrow_x_din_x, - wrk_rd_narrow_y_din_x, - wrk_rd_narrow_x_din_y, - wrk_rd_narrow_y_din_y); - // - UOP_OPCODE_MERGE_LH: - // - update_narrow_dout(wrk_rd_wide_x_din_x, - wrk_rd_wide_y_din_x, - wrk_rd_wide_x_din_y, - 
wrk_rd_wide_y_din_y); - // - endcase - // - endcase - // - // one_pass_meander - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2, - WRK_FSM_STATE_LATENCY_POST2_M2: - // - case (opcode) - // - UOP_OPCODE_COPY_LADDERS_X2Y: begin - // - update_wide_dout(wrk_rd_wide_x_din_x_dly3, - wrk_rd_wide_x_din_x_dly2, - wrk_rd_wide_x_din_y_dly3, - wrk_rd_wide_x_din_y_dly2); - // - update_narrow_dout(wrk_rd_narrow_x_din_x_dly3, - wrk_rd_narrow_x_din_x_dly2, - wrk_rd_narrow_x_din_y_dly3, - wrk_rd_narrow_x_din_y_dly2); - // - end - // - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - // - update_wide_dout(wrk_rd_wide_x_din_x_dly3, - wrk_rd_wide_x_din_y_dly2, - wrk_rd_wide_x_din_y_dly3, - wrk_rd_wide_x_din_x_dly2); - // - update_narrow_dout(wrk_rd_narrow_x_din_x_dly3, - wrk_rd_narrow_x_din_y_dly2, - wrk_rd_narrow_x_din_y_dly3, - wrk_rd_narrow_x_din_x_dly2); - // - end - // - UOP_OPCODE_REGULAR_ADD_UNEVEN: begin - // - update_narrow_dout(regadd_x_x_trunc, - regadd_y_x_trunc, - regadd_x_y_trunc, - regadd_y_y_trunc); - // - end - // - endcase - // - endcase - // - // two_pass - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY_TP, - WRK_FSM_STATE_LATENCY_POST1_TP, - WRK_FSM_STATE_LATENCY_POST2_TP, - WRK_FSM_STATE_LATENCY_POST3_TP, - WRK_FSM_STATE_LATENCY_POST4_TP: - // - case (opcode) - // - UOP_OPCODE_MODULAR_SUBTRACT: - // - if (!wrk_fsm_two_pass_pass) - update_narrow_dout(modsub_x_ab_dly_trunc, modsub_x_abn_trunc, modsub_y_ab_dly_trunc, modsub_y_abn_trunc); - else begin - update_wide_dout (modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux); - update_narrow_dout(modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux); - end - // - endcase - // - endcase + rd_wide_addr_is_last_half_dly <= {rd_wide_addr_is_last_half_dly[2:0], rd_wide_addr_is_last_half}; + rd_narrow_addr_is_last_half_dly <= {rd_narrow_addr_is_last_half_dly[2:0], rd_narrow_addr_is_last_half}; // end - - // - // Source Read Address Logic - // - - reg [OP_ADDR_W -1:0] 
rd_wide_xy_addr_xy_next; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_xy_next; - - reg rd_wide_xy_addr_xy_next_last_seen; - reg rd_wide_xy_addr_xy_next_last_seen_dly1; - reg rd_wide_xy_addr_xy_next_last_seen_dly2; - - wire rd_wide_xy_addr_xy_next_is_last = rd_wide_xy_addr_xy_next == word_index_last_half; - wire rd_narrow_xy_addr_xy_next_is_last = rd_narrow_xy_addr_xy_next == word_index_last; + task preset_rd_wide_bank_addr; + input [BANK_ADDR_W -1:0] bank; + input [ OP_ADDR_W -1:0] addr; + begin + {rd_wide_bank_x, rd_wide_addr_x} <= {bank, addr}; + {rd_wide_bank_y, rd_wide_addr_y} <= {bank, addr}; + rd_wide_addr_is_last <= 1'b0; + rd_wide_addr_is_last_half <= 1'b0; + end + endtask - task update_rd_wide_bank_addr; + task preset_rd_narrow_bank_addr; input [BANK_ADDR_W -1:0] bank; input [ OP_ADDR_W -1:0] addr; begin - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, addr}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, addr}; + {rd_narrow_bank_x, rd_narrow_addr_x} <= {bank, addr}; + {rd_narrow_bank_y, rd_narrow_addr_y} <= {bank, addr}; + rd_narrow_addr_is_last <= 1'b0; + rd_narrow_addr_is_last_half <= 1'b0; + end + endtask + + task preset_rd_wide_addr_next; + input [OP_ADDR_W -1:0] addr; + begin + rd_wide_addr_next <= addr; + rd_wide_addr_next_is_last <= 1'b0; + rd_wide_addr_next_is_last_half <= 1'b0; end endtask - task update_rd_wide_bank; - input [BANK_ADDR_W -1:0] bank; + task preset_rd_narrow_addr_next; + input [OP_ADDR_W -1:0] addr; + begin + rd_narrow_addr_next <= addr; + rd_narrow_addr_next_is_last <= 1'b0; + rd_narrow_addr_next_is_last_half <= 1'b0; + end + endtask + + task keep_rd_wide_bank; begin - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, rd_wide_xy_addr_x}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, rd_wide_xy_addr_y}; + {rd_wide_bank_x} <= {rd_wide_bank_x}; + {rd_wide_bank_y} <= {rd_wide_bank_y}; end endtask - task update_rd_narrow_bank_addr; + task switch_rd_wide_bank; input [BANK_ADDR_W -1:0] bank; - input [ OP_ADDR_W -1:0] addr; begin - 
{rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, addr}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, addr}; + {rd_wide_bank_x} <= {bank}; + {rd_wide_bank_y} <= {bank}; + end + endtask + + task keep_rd_wide_addr; + begin + {rd_wide_addr_x} <= {rd_wide_addr_x}; + {rd_wide_addr_y} <= {rd_wide_addr_y}; + end + endtask + + task advance_rd_wide_addr; + begin + {rd_wide_addr_x} <= {rd_wide_addr_next}; + {rd_wide_addr_y} <= {rd_wide_addr_next}; + rd_wide_addr_is_last <= rd_wide_addr_next == word_index_last; + rd_wide_addr_is_last_half <= rd_wide_addr_next == word_index_last_half; + end + endtask + + task keep_rd_narrow_bank; + begin + {rd_narrow_bank_x} <= {rd_narrow_bank_x}; + {rd_narrow_bank_y} <= {rd_narrow_bank_y}; end endtask - task update_rd_narrow_bank; + task switch_rd_narrow_bank; input [BANK_ADDR_W -1:0] bank; begin - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, rd_narrow_xy_addr_x}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, rd_narrow_xy_addr_y}; + {rd_narrow_bank_x} <= {bank}; + {rd_narrow_bank_y} <= {bank}; end endtask - task update_rd_wide_addr_next; - input [OP_ADDR_W -1:0] addr; - rd_wide_xy_addr_xy_next <= addr; + task keep_rd_narrow_addr; + begin + {rd_narrow_addr_x} <= {rd_narrow_addr_x}; + {rd_narrow_addr_y} <= {rd_narrow_addr_y}; + end + endtask + + task advance_rd_narrow_addr; + begin + {rd_narrow_addr_x} <= {rd_narrow_addr_next}; + {rd_narrow_addr_y} <= {rd_narrow_addr_next}; + rd_narrow_addr_is_last <= rd_narrow_addr_next == word_index_last; + rd_narrow_addr_is_last_half <= rd_narrow_addr_next == word_index_last_half; + end + endtask + + task update_rd_wide_addr_flags; + begin + rd_wide_addr_next_is_last <= rd_wide_addr_next == (word_index_last - 1'b1); + rd_wide_addr_next_is_last_half <= rd_wide_addr_next == (word_index_last_half - 1'b1); + end endtask - task update_rd_narrow_addr_next; - input [OP_ADDR_W -1:0] addr; - rd_narrow_xy_addr_xy_next <= addr; + task update_rd_narrow_addr_flags; + begin + 
rd_narrow_addr_next_is_last <= rd_narrow_addr_next == (word_index_last - 1'b1); + rd_narrow_addr_next_is_last_half <= rd_narrow_addr_next == (word_index_last_half - 1'b1); + end endtask task advance_rd_wide_addr_next; - rd_wide_xy_addr_xy_next <= !rd_wide_xy_addr_xy_next_is_last ? rd_wide_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; + begin + rd_wide_addr_next <= !rd_wide_addr_next_is_last ? rd_wide_addr_next + 1'b1 : OP_ADDR_ZERO; + update_rd_wide_addr_flags; + end endtask task advance_rd_narrow_addr_next; - rd_narrow_xy_addr_xy_next <= !rd_narrow_xy_addr_xy_next_is_last ? rd_narrow_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; + begin + rd_narrow_addr_next <= !rd_narrow_addr_next_is_last ? rd_narrow_addr_next + 1'b1 : OP_ADDR_ZERO; + update_rd_narrow_addr_flags; + end + endtask + + task advance_rd_wide_addr_next_half; + begin + rd_wide_addr_next <= !rd_wide_addr_next_is_last_half ? rd_wide_addr_next + 1'b1 : OP_ADDR_ZERO; + update_rd_wide_addr_flags; + end + endtask + + task advance_rd_narrow_addr_next_half; + begin + rd_narrow_addr_next <= !rd_narrow_addr_next_is_last_half ? 
rd_narrow_addr_next + 1'b1 : OP_ADDR_ZERO; + update_rd_narrow_addr_flags; + end endtask - - always @(posedge clk) - // - case (opcode) - UOP_OPCODE_MERGE_LH: - case (wrk_fsm_state_next_one_pass) - WRK_FSM_STATE_LATENCY_PRE1: - rd_wide_xy_addr_xy_next_last_seen <= 1'b0; - WRK_FSM_STATE_BUSY: - if (!rd_wide_xy_addr_xy_next_last_seen && rd_wide_xy_addr_xy_next_is_last) - rd_wide_xy_addr_xy_next_last_seen <= 1'b1; - endcase - UOP_OPCODE_REGULAR_ADD_UNEVEN: - case (wrk_fsm_state_next_one_pass_meander) - WRK_FSM_STATE_LATENCY_PRE1_M1: begin - rd_wide_xy_addr_xy_next_last_seen <= 1'b0; - rd_wide_xy_addr_xy_next_last_seen_dly1 <= 1'b0; - rd_wide_xy_addr_xy_next_last_seen_dly2 <= 1'b0; - end - WRK_FSM_STATE_BUSY_M1: begin - if (!rd_wide_xy_addr_xy_next_last_seen && rd_wide_xy_addr_xy_next_is_last) - rd_wide_xy_addr_xy_next_last_seen <= 1'b1; - rd_wide_xy_addr_xy_next_last_seen_dly1 <= rd_wide_xy_addr_xy_next_last_seen; - rd_wide_xy_addr_xy_next_last_seen_dly2 <= rd_wide_xy_addr_xy_next_last_seen_dly1; - end - endcase - endcase always @(posedge clk) begin // - update_rd_wide_bank_addr (BANK_DNC, OP_ADDR_DNC); - update_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC); + preset_rd_wide_bank_addr (BANK_DNC, OP_ADDR_DNC); + preset_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC); // - // one_pass - // - case (wrk_fsm_state_next_one_pass) + case (opcode) // - WRK_FSM_STATE_LATENCY_PRE1: - // - case (opcode) - // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_MODULAR_REDUCE_INIT: begin - // - update_rd_wide_bank_addr (sel_wide_in, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); - update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); - // - end - // - UOP_OPCODE_MERGE_LH: begin - update_rd_wide_bank_addr (BANK_WIDE_L, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); - update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); - end - // - 
endcase - // - WRK_FSM_STATE_LATENCY_PRE2, - WRK_FSM_STATE_BUSY: + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_OUTPUT_FROM_NARROW, + UOP_OPCODE_MODULAR_SUBTRACT_X: // - case (opcode) - // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_COPY_CRT_Y2X: begin - // - update_rd_wide_bank_addr (sel_wide_in, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; - update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - // - end - // - UOP_OPCODE_MODULAR_REDUCE_INIT: begin - // - update_rd_wide_bank_addr (sel_wide_in, rd_wide_xy_addr_xy_next ); advance_rd_wide_addr_next ; - update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - // - end - // - UOP_OPCODE_MERGE_LH: begin - // - if (!rd_wide_xy_addr_xy_next_last_seen) update_rd_wide_bank_addr (BANK_WIDE_L, rd_wide_xy_addr_xy_next ); - else update_rd_wide_bank_addr (BANK_WIDE_H, rd_wide_xy_addr_xy_next ); - advance_rd_wide_addr_next ; - update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - // - end - // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: begin keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2: keep_rd_narrow_bank; endcase // - endcase - // - // one_pass_meander - // - case (wrk_fsm_state_next_one_pass_meander) - // - WRK_FSM_STATE_LATENCY_PRE1_M1: - case (opcode) - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - update_rd_wide_bank_addr (sel_wide_out, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); - update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); - end - UOP_OPCODE_REGULAR_ADD_UNEVEN: begin - 
update_rd_wide_bank_addr (sel_wide_in, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); - update_rd_narrow_bank_addr(sel_wide_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); - end + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_MODULAR_SUBTRACT_Z, + UOP_OPCODE_REGULAR_ADD_UNEVEN: + // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr (sel_wide_in, OP_ADDR_ZERO); preset_rd_wide_addr_next (OP_ADDR_ONE); + preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: begin keep_rd_wide_bank; advance_rd_wide_addr; advance_rd_wide_addr_next; + keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2: begin keep_rd_wide_bank; keep_rd_narrow_bank; end endcase // - WRK_FSM_STATE_LATENCY_PRE2_M1, - WRK_FSM_STATE_BUSY_M1: - case (opcode) - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - update_rd_wide_bank_addr (sel_wide_out, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; - update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - end - UOP_OPCODE_REGULAR_ADD_UNEVEN: begin - update_rd_wide_bank_addr (sel_wide_in, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; - update_rd_narrow_bank_addr(sel_wide_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - end + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr (BANK_DNC, OP_ADDR_ZERO); preset_rd_wide_addr_next (OP_ADDR_ONE); + preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: begin advance_rd_wide_addr; advance_rd_wide_addr_next_half; + keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end + 
WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2: keep_rd_narrow_bank; endcase // - WRK_FSM_STATE_LATENCY_PRE1_M2, - WRK_FSM_STATE_LATENCY_PRE2_M2, - WRK_FSM_STATE_BUSY_M2: - case (opcode) - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - update_rd_wide_bank (sel_wide_in ); - update_rd_narrow_bank(sel_narrow_in); - end - UOP_OPCODE_REGULAR_ADD_UNEVEN: begin - update_rd_wide_bank (sel_narrow_in); - update_rd_narrow_bank(sel_narrow_in); - end + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: + // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr (sel_wide_in, OP_ADDR_ZERO); preset_rd_wide_addr_next (OP_ADDR_ONE); + preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end + WRK_FSM_STATE_LATENCY_PRE2: begin switch_rd_wide_bank (sel_wide_out); keep_rd_wide_addr; + switch_rd_narrow_bank(sel_narrow_out); keep_rd_narrow_addr; end + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: begin advance_rd_wide_addr; advance_rd_wide_addr_next; switch_rd_wide_bank(sel_wide_in); + advance_rd_narrow_addr; advance_rd_narrow_addr_next; switch_rd_narrow_bank(sel_narrow_in); end + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2: begin keep_rd_wide_addr; switch_rd_wide_bank (sel_wide_out); + keep_rd_narrow_addr; switch_rd_narrow_bank(sel_narrow_out); end endcase // - endcase - // - // two_pass - // - case (wrk_fsm_state_next_two_pass) - // - WRK_FSM_STATE_LATENCY_PRE1_TP: + UOP_OPCODE_MODULAR_SUBTRACT_Y: // - case (opcode) - // - UOP_OPCODE_MODULAR_SUBTRACT: - // - if (!wrk_fsm_two_pass_pass) begin - update_rd_wide_bank_addr (BANK_WIDE_N, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); - update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); - end else begin - update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); - end - // + case 
(wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr (BANK_WIDE_N, OP_ADDR_ZERO); preset_rd_wide_addr_next (OP_ADDR_ONE); + preset_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end + WRK_FSM_STATE_LATENCY_PRE3, + WRK_FSM_STATE_BUSY1: begin keep_rd_wide_bank; advance_rd_wide_addr; advance_rd_wide_addr_next; + keep_rd_narrow_bank; advance_rd_narrow_addr; advance_rd_narrow_addr_next; end + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2: begin keep_rd_wide_bank; keep_rd_narrow_bank; end endcase + // + UOP_OPCODE_MERGE_LH: // - WRK_FSM_STATE_LATENCY_PRE2_TP, - WRK_FSM_STATE_LATENCY_PRE3_TP, - WRK_FSM_STATE_LATENCY_PRE4_TP, - WRK_FSM_STATE_BUSY_TP: - // - case (opcode) - // - UOP_OPCODE_MODULAR_SUBTRACT: - // - if (!wrk_fsm_two_pass_pass) begin - update_rd_wide_bank_addr (BANK_WIDE_N, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; - update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - end else begin - update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; - end - // + case (wrk_fsm_state_next) + WRK_FSM_STATE_LATENCY_PRE1: begin preset_rd_wide_bank_addr (BANK_WIDE_L, OP_ADDR_ZERO); preset_rd_wide_addr_next (OP_ADDR_ONE); + preset_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_ZERO); preset_rd_narrow_addr_next(OP_ADDR_ONE); end + WRK_FSM_STATE_LATENCY_PRE3: begin keep_rd_wide_bank; advance_rd_wide_addr; advance_rd_wide_addr_next_half; + advance_rd_narrow_addr; advance_rd_narrow_addr_next; end + WRK_FSM_STATE_BUSY1: begin if (!rd_wide_addr_is_last_half_dly[0]) keep_rd_wide_bank; + else switch_rd_wide_bank(BANK_WIDE_H); + advance_rd_wide_addr; advance_rd_wide_addr_next_half; + advance_rd_narrow_addr; advance_rd_narrow_addr_next; end + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2: keep_rd_wide_bank; endcase - // + // endcase // end @@ -927,13 
+685,21 @@ module modexpng_general_worker // // Destination Write Address Logic // - - wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half; - wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half; - - wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_x = uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H; - wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_y = uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H; + reg modular_reduce_init_first_half_x; + reg modular_reduce_init_first_half_y; + reg [BANK_ADDR_W -1:0] modular_reduce_init_sel_wide_out_x; + reg [BANK_ADDR_W -1:0] modular_reduce_init_sel_wide_out_y; + always @(posedge clk) begin + // + modular_reduce_init_first_half_x <= rd_narrow_addr_x_dly[1] <= word_index_last_half; + modular_reduce_init_first_half_y <= rd_narrow_addr_y_dly[1] <= word_index_last_half; + // + modular_reduce_init_sel_wide_out_x <= modular_reduce_init_first_half_x ? BANK_WIDE_L : BANK_WIDE_H; + modular_reduce_init_sel_wide_out_y <= modular_reduce_init_first_half_y ? 
BANK_WIDE_L : BANK_WIDE_H; + // + end + task update_wr_wide_bank_addr; input [BANK_ADDR_W -1:0] x_bank; input [BANK_ADDR_W -1:0] y_bank; @@ -955,120 +721,351 @@ module modexpng_general_worker {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {y_bank, y_addr}; end endtask - + always @(posedge clk) begin // update_wr_wide_bank_addr (BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC); update_wr_narrow_bank_addr(BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC); // - // one_pass - // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_MODULAR_SUBTRACT_X, + UOP_OPCODE_MERGE_LH, + UOP_OPCODE_REGULAR_ADD_UNEVEN: // - case (opcode) - // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_COPY_CRT_Y2X: begin - update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); - update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); - end - // - UOP_OPCODE_MODULAR_REDUCE_INIT: - update_wr_wide_bank_addr(uop_modular_reduce_init_bank_x, uop_modular_reduce_init_bank_y, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_y_dly2); - // - UOP_OPCODE_MERGE_LH: - update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); - // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_addr_x_dly[3], rd_narrow_addr_y_dly[3]); endcase - // - endcase - // - // one_pass_meander - // - case (wrk_fsm_state) // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2, - WRK_FSM_STATE_LATENCY_POST2_M2: - // - case (opcode) - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y: begin - update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); - update_wr_narrow_bank_addr(sel_narrow_out, 
sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); - end - UOP_OPCODE_REGULAR_ADD_UNEVEN: - update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y, + UOP_OPCODE_MODULAR_SUBTRACT_Z: + // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: begin update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_addr_x_dly[3], rd_narrow_addr_y_dly[3]); + update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_wide_addr_x_dly[3], rd_wide_addr_y_dly[3] ); end endcase + // + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: update_wr_wide_bank_addr(modular_reduce_init_sel_wide_out_x, modular_reduce_init_sel_wide_out_y, rd_wide_addr_x_dly[3], rd_wide_addr_y_dly[3]); + endcase + // + UOP_OPCODE_MODULAR_SUBTRACT_Y: // + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: update_wr_wide_bank_addr(sel_wide_out, sel_wide_out, rd_wide_addr_x_dly[3], rd_wide_addr_y_dly[3]); + endcase + // endcase // - // two_pass + end + + + // + // UOP_OPCODE_PROPAGATE_CARRIES + // + reg [CARRY_W -1:0] propagate_carries_x_x_cry_r; + reg [CARRY_W -1:0] propagate_carries_y_x_cry_r; + reg [CARRY_W -1:0] propagate_carries_x_y_cry_r; + reg [CARRY_W -1:0] propagate_carries_y_y_cry_r; + + wire [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry = rd_narrow_x_din_x_dly1 + {{WORD_W{1'b0}}, propagate_carries_x_x_cry_r}; + wire [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry = rd_narrow_y_din_x_dly1 + {{WORD_W{1'b0}}, propagate_carries_y_x_cry_r}; + wire [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry = rd_narrow_x_din_y_dly1 + {{WORD_W{1'b0}}, propagate_carries_x_y_cry_r}; + wire [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry = rd_narrow_y_din_y_dly1 
+ {{WORD_W{1'b0}}, propagate_carries_y_y_cry_r}; + + reg [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry_r; + reg [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry_r; + reg [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry_r; + reg [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry_r; + + wire [CARRY_W -1:0] propagate_carries_x_x_w_cry_msb = propagate_carries_x_x_w_cry_r[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] propagate_carries_y_x_w_cry_msb = propagate_carries_y_x_w_cry_r[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] propagate_carries_x_y_w_cry_msb = propagate_carries_x_y_w_cry_r[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] propagate_carries_y_y_w_cry_msb = propagate_carries_y_y_w_cry_r[WORD_EXT_W -1:WORD_W]; + + wire [WORD_W -1:0] propagate_carries_x_x_w_cry_lsb = propagate_carries_x_x_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] propagate_carries_y_x_w_cry_lsb = propagate_carries_y_x_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] propagate_carries_x_y_w_cry_lsb = propagate_carries_x_y_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] propagate_carries_y_y_w_cry_lsb = propagate_carries_y_y_w_cry_r[WORD_W -1:0]; + + wire [WORD_EXT_W -1:0] propagate_carries_x_x_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_x_x_w_cry_lsb}; + wire [WORD_EXT_W -1:0] propagate_carries_y_x_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_y_x_w_cry_lsb}; + wire [WORD_EXT_W -1:0] propagate_carries_x_y_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_x_y_w_cry_lsb}; + wire [WORD_EXT_W -1:0] propagate_carries_y_y_w_cry_reduced = {{CARRY_W{1'b0}}, propagate_carries_y_y_w_cry_lsb}; + + task _propagate_carries_update_cry; + input [CARRY_W-1:0] x_x_cry, y_x_cry, x_y_cry, y_y_cry; + { propagate_carries_x_x_cry_r, propagate_carries_y_x_cry_r, propagate_carries_x_y_cry_r, propagate_carries_y_y_cry_r} <= + { x_x_cry, y_x_cry, x_y_cry, y_y_cry}; + endtask + + task propagate_carries_clear_cry; _propagate_carries_update_cry( CARRY_ZERO, CARRY_ZERO, CARRY_ZERO, CARRY_ZERO); endtask + task 
propagate_carries_store_cry; _propagate_carries_update_cry(propagate_carries_x_x_w_cry_msb, propagate_carries_y_x_w_cry_msb, propagate_carries_x_y_w_cry_msb, propagate_carries_y_y_w_cry_msb); endtask + + task _propagate_carries_update_sum_w_cry; + input [WORD_EXT_W-1:0] x_x_sum_w_cry, y_x_sum_w_cry, x_y_sum_w_cry, y_y_sum_w_cry; + { propagate_carries_x_x_w_cry_r, propagate_carries_y_x_w_cry_r, propagate_carries_x_y_w_cry_r, propagate_carries_y_y_w_cry_r} <= + { x_x_sum_w_cry, y_x_sum_w_cry, x_y_sum_w_cry, y_y_sum_w_cry}; + endtask + + task propagate_carries_store_sum_w_cry; _propagate_carries_update_sum_w_cry(propagate_carries_x_x_w_cry, propagate_carries_y_x_w_cry, propagate_carries_x_y_w_cry, propagate_carries_y_y_w_cry); endtask + + always @(posedge clk) // - case (wrk_fsm_state) + if (opcode == UOP_OPCODE_PROPAGATE_CARRIES) + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE3: propagate_carries_clear_cry; + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1: propagate_carries_store_cry; + // + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2, + WRK_FSM_STATE_LATENCY_POST2: propagate_carries_store_sum_w_cry; + // + endcase + + + // + // UOP_OPCODE_MODULAR_SUBTRACT_X + // UOP_OPCODE_MODULAR_SUBTRACT_Y + // + reg modular_subtract_x_brw_r; + reg modular_subtract_y_brw_r; + + reg modular_subtract_x_cry_r; + reg modular_subtract_y_cry_r; + + wire [WORD_W:0] modular_subtract_x_w_brw = rd_narrow_x_din_x_dly1[WORD_W:0] - rd_narrow_y_din_x_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_x_brw_r}; + wire [WORD_W:0] modular_subtract_y_w_brw = rd_narrow_x_din_y_dly1[WORD_W:0] - rd_narrow_y_din_y_dly1[WORD_W:0] - {{WORD_W{1'b0}}, modular_subtract_y_brw_r}; + + wire [WORD_W:0] modular_subtract_x_w_cry = rd_narrow_x_din_x_dly1[WORD_W:0] + rd_wide_x_din_x_dly1[WORD_W:0] + {{WORD_W{1'b0}}, modular_subtract_x_cry_r}; + wire [WORD_W:0] modular_subtract_y_w_cry = rd_narrow_x_din_y_dly1[WORD_W:0] + rd_wide_x_din_y_dly1[WORD_W:0] + {{WORD_W{1'b0}},
modular_subtract_y_cry_r}; // Y lane chains its own carry register, same as the X lane + + reg [WORD_W:0] modular_subtract_x_w_brw_r; + reg [WORD_W:0] modular_subtract_y_w_brw_r; + + reg [WORD_W:0] modular_subtract_x_w_cry_r; + reg [WORD_W:0] modular_subtract_y_w_cry_r; + + wire modular_subtract_x_w_brw_msb = modular_subtract_x_w_brw_r[WORD_W]; + wire modular_subtract_y_w_brw_msb = modular_subtract_y_w_brw_r[WORD_W]; + + wire modular_subtract_x_w_cry_msb = modular_subtract_x_w_cry_r[WORD_W]; + wire modular_subtract_y_w_cry_msb = modular_subtract_y_w_cry_r[WORD_W]; + + wire [WORD_W -1:0] modular_subtract_x_w_brw_lsb = modular_subtract_x_w_brw_r[WORD_W -1:0]; + wire [WORD_W -1:0] modular_subtract_y_w_brw_lsb = modular_subtract_y_w_brw_r[WORD_W -1:0]; + + wire [WORD_W -1:0] modular_subtract_x_w_cry_lsb = modular_subtract_x_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] modular_subtract_y_w_cry_lsb = modular_subtract_y_w_cry_r[WORD_W -1:0]; + + wire [WORD_EXT_W -1:0] modular_subtract_x_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_brw_lsb}; + wire [WORD_EXT_W -1:0] modular_subtract_y_w_brw_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_brw_lsb}; + + wire [WORD_EXT_W -1:0] modular_subtract_x_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_w_cry_lsb}; + wire [WORD_EXT_W -1:0] modular_subtract_y_w_cry_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_w_cry_lsb}; + + reg [WORD_EXT_W -1:0] modular_subtract_x_mux; + reg [WORD_EXT_W -1:0] modular_subtract_y_mux; + + wire [WORD_EXT_W -1:0] modular_subtract_x_mux_reduced = {{CARRY_W{1'b0}}, modular_subtract_x_mux[WORD_W-1:0]}; + wire [WORD_EXT_W -1:0] modular_subtract_y_mux_reduced = {{CARRY_W{1'b0}}, modular_subtract_y_mux[WORD_W-1:0]}; + + task _modular_subtract_update_brw; + input x_brw, y_brw; + {modular_subtract_x_brw_r, modular_subtract_y_brw_r} <= {x_brw, y_brw}; + endtask + + task _modular_subtract_update_cry; + input x_cry, y_cry; + {modular_subtract_x_cry_r, modular_subtract_y_cry_r} <= {x_cry, y_cry}; + endtask + + task modular_subtract_clear_brw;
_modular_subtract_update_brw( 1'b0, 1'b0); endtask + task modular_subtract_store_brw; _modular_subtract_update_brw(modular_subtract_x_w_brw_msb, modular_subtract_y_w_brw_msb); endtask + + task modular_subtract_clear_cry; _modular_subtract_update_cry( 1'b0, 1'b0); endtask + task modular_subtract_store_cry; _modular_subtract_update_cry(modular_subtract_x_w_cry_msb, modular_subtract_y_w_cry_msb); endtask + + task _modular_subtract_update_diff_w_brw; + input [WORD_W:0] x_diff_w_brw, y_diff_w_brw; + {modular_subtract_x_w_brw_r, modular_subtract_y_w_brw_r} <= {x_diff_w_brw, y_diff_w_brw}; + endtask + + task _modular_subtract_update_sum_w_cry; + input [WORD_W:0] x_sum_w_cry, y_sum_w_cry; + {modular_subtract_x_w_cry_r, modular_subtract_y_w_cry_r} <= {x_sum_w_cry, y_sum_w_cry}; + endtask + + task modular_subtract_store_diff_w_brw; _modular_subtract_update_diff_w_brw(modular_subtract_x_w_brw, modular_subtract_y_w_brw); endtask + + task modular_subtract_store_sum_w_cry; _modular_subtract_update_sum_w_cry(modular_subtract_x_w_cry, modular_subtract_y_w_cry); endtask + + always @(posedge clk) + // + case (opcode) // - WRK_FSM_STATE_BUSY_TP, - WRK_FSM_STATE_LATENCY_POST1_TP, - WRK_FSM_STATE_LATENCY_POST2_TP, - WRK_FSM_STATE_LATENCY_POST3_TP, - WRK_FSM_STATE_LATENCY_POST4_TP: + UOP_OPCODE_MODULAR_SUBTRACT_X: // - case (opcode) + case (wrk_fsm_state) // - UOP_OPCODE_MODULAR_SUBTRACT: - // - if (!wrk_fsm_two_pass_pass) begin - update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); - end else begin - update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); - update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); - end + WRK_FSM_STATE_LATENCY_PRE3: modular_subtract_clear_brw; + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: modular_subtract_store_brw; // we need the very last borrow here 
too! + // + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2, + WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_diff_w_brw; + // + endcase + // + UOP_OPCODE_MODULAR_SUBTRACT_Y: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE3: modular_subtract_clear_cry; + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1: modular_subtract_store_cry; + // + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2, + WRK_FSM_STATE_LATENCY_POST2: modular_subtract_store_sum_w_cry; + // + endcase + // + UOP_OPCODE_MODULAR_SUBTRACT_Z: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE4, + WRK_FSM_STATE_BUSY2, + WRK_FSM_STATE_LATENCY_POST2: // - endcase + begin modular_subtract_x_mux <= !modular_subtract_x_brw_r ? rd_narrow_x_din_x_dly1 : rd_wide_x_din_x_dly1; + modular_subtract_y_mux <= !modular_subtract_y_brw_r ? rd_narrow_x_din_y_dly1 : rd_wide_x_din_y_dly1; end + // + endcase + // + endcase + + + // + // UOP_OPCODE_REGULAR_ADD_UNEVEN + // + reg [CARRY_W -1:0] regular_add_uneven_x_x_cry_r; + reg [CARRY_W -1:0] regular_add_uneven_y_x_cry_r; + reg [CARRY_W -1:0] regular_add_uneven_x_y_cry_r; + reg [CARRY_W -1:0] regular_add_uneven_y_y_cry_r; + + wire [WORD_EXT_W -1:0] regular_add_uneven_x_x_msb_w_cry = rd_narrow_x_din_x_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_x_x_cry_r}; + wire [WORD_EXT_W -1:0] regular_add_uneven_y_x_msb_w_cry = rd_narrow_y_din_x_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_y_x_cry_r}; + wire [WORD_EXT_W -1:0] regular_add_uneven_x_y_msb_w_cry = rd_narrow_x_din_y_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_x_y_cry_r}; + wire [WORD_EXT_W -1:0] regular_add_uneven_y_y_msb_w_cry = rd_narrow_y_din_y_dly1 + {{WORD_W{1'b0}}, regular_add_uneven_y_y_cry_r}; + + wire [WORD_EXT_W -1:0] regular_add_uneven_x_x_lsb_w_cry = regular_add_uneven_x_x_msb_w_cry + rd_wide_x_din_x_dly1; + wire [WORD_EXT_W -1:0] regular_add_uneven_y_x_lsb_w_cry = regular_add_uneven_y_x_msb_w_cry + rd_wide_y_din_x_dly1; + wire [WORD_EXT_W -1:0] regular_add_uneven_x_y_lsb_w_cry = 
regular_add_uneven_x_y_msb_w_cry + rd_wide_x_din_y_dly1; + wire [WORD_EXT_W -1:0] regular_add_uneven_y_y_lsb_w_cry = regular_add_uneven_y_y_msb_w_cry + rd_wide_y_din_y_dly1; + + reg [WORD_EXT_W -1:0] regular_add_uneven_x_x_w_cry_r; + reg [WORD_EXT_W -1:0] regular_add_uneven_y_x_w_cry_r; + reg [WORD_EXT_W -1:0] regular_add_uneven_x_y_w_cry_r; + reg [WORD_EXT_W -1:0] regular_add_uneven_y_y_w_cry_r; + + wire [CARRY_W -1:0] regular_add_uneven_x_x_w_cry_msb = regular_add_uneven_x_x_w_cry_r[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] regular_add_uneven_y_x_w_cry_msb = regular_add_uneven_y_x_w_cry_r[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] regular_add_uneven_x_y_w_cry_msb = regular_add_uneven_x_y_w_cry_r[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] regular_add_uneven_y_y_w_cry_msb = regular_add_uneven_y_y_w_cry_r[WORD_EXT_W -1:WORD_W]; + + wire [WORD_W -1:0] regular_add_uneven_x_x_w_cry_lsb = regular_add_uneven_x_x_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] regular_add_uneven_y_x_w_cry_lsb = regular_add_uneven_y_x_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] regular_add_uneven_x_y_w_cry_lsb = regular_add_uneven_x_y_w_cry_r[WORD_W -1:0]; + wire [WORD_W -1:0] regular_add_uneven_y_y_w_cry_lsb = regular_add_uneven_y_y_w_cry_r[WORD_W -1:0]; + + wire [WORD_EXT_W -1:0] regular_add_uneven_x_x_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_x_x_w_cry_lsb}; + wire [WORD_EXT_W -1:0] regular_add_uneven_y_x_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_y_x_w_cry_lsb}; + wire [WORD_EXT_W -1:0] regular_add_uneven_x_y_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_x_y_w_cry_lsb}; + wire [WORD_EXT_W -1:0] regular_add_uneven_y_y_w_cry_reduced = {{CARRY_W{1'b0}}, regular_add_uneven_y_y_w_cry_lsb}; + + reg regular_add_uneven_store_lsb_now; + + task _regular_add_uneven_update_cry; + input [CARRY_W-1:0] x_x_cry, y_x_cry, x_y_cry, y_y_cry; + { regular_add_uneven_x_x_cry_r, regular_add_uneven_y_x_cry_r, regular_add_uneven_x_y_cry_r, regular_add_uneven_y_y_cry_r} <= + { 
x_x_cry, y_x_cry, x_y_cry, y_y_cry}; + endtask + + task regular_add_uneven_clear_cry; _regular_add_uneven_update_cry( CARRY_ZERO, CARRY_ZERO, CARRY_ZERO, CARRY_ZERO); endtask + task regular_add_uneven_store_cry; _regular_add_uneven_update_cry(regular_add_uneven_x_x_w_cry_msb, regular_add_uneven_y_x_w_cry_msb, regular_add_uneven_x_y_w_cry_msb, regular_add_uneven_y_y_w_cry_msb); endtask + + task _regular_add_uneven_update_sum_w_cry; + input [WORD_EXT_W-1:0] x_x_sum_w_cry, y_x_sum_w_cry, x_y_sum_w_cry, y_y_sum_w_cry; + { regular_add_uneven_x_x_w_cry_r, regular_add_uneven_y_x_w_cry_r, regular_add_uneven_x_y_w_cry_r, regular_add_uneven_y_y_w_cry_r} <= + { x_x_sum_w_cry, y_x_sum_w_cry, x_y_sum_w_cry, y_y_sum_w_cry}; + endtask + + task regular_add_uneven_store_sum_lsb_w_cry; _regular_add_uneven_update_sum_w_cry(regular_add_uneven_x_x_lsb_w_cry, regular_add_uneven_y_x_lsb_w_cry, regular_add_uneven_x_y_lsb_w_cry, regular_add_uneven_y_y_lsb_w_cry); endtask + + task regular_add_uneven_store_sum_msb_w_cry; _regular_add_uneven_update_sum_w_cry(regular_add_uneven_x_x_msb_w_cry, regular_add_uneven_y_x_msb_w_cry, regular_add_uneven_x_y_msb_w_cry, regular_add_uneven_y_y_msb_w_cry); endtask + + always @(posedge clk) + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE3: regular_add_uneven_store_lsb_now <= 1'b1; + WRK_FSM_STATE_BUSY1: if (rd_wide_addr_is_last_half_dly[3]) regular_add_uneven_store_lsb_now <= 1'b0; // endcase + + always @(posedge clk) // - end + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE3: regular_add_uneven_clear_cry; + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1: regular_add_uneven_store_cry; + // + WRK_FSM_STATE_LATENCY_PRE4: regular_add_uneven_store_sum_lsb_w_cry; + WRK_FSM_STATE_BUSY2: if (regular_add_uneven_store_lsb_now) regular_add_uneven_store_sum_lsb_w_cry; + else regular_add_uneven_store_sum_msb_w_cry; + WRK_FSM_STATE_LATENCY_POST2: regular_add_uneven_store_sum_msb_w_cry; + // + endcase // // FSM Process // - always @(posedge clk or 
negedge rst_n) // if (!rst_n) wrk_fsm_state <= WRK_FSM_STATE_IDLE; - else case (opcode) - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_MODULAR_REDUCE_INIT, - UOP_OPCODE_MERGE_LH: wrk_fsm_state <= wrk_fsm_state_next_one_pass; - UOP_OPCODE_COPY_LADDERS_X2Y, - UOP_OPCODE_CROSS_LADDERS_X2Y, - UOP_OPCODE_REGULAR_ADD_UNEVEN: wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander; - UOP_OPCODE_MODULAR_SUBTRACT: wrk_fsm_state <= wrk_fsm_state_next_two_pass; - default: wrk_fsm_state <= WRK_FSM_STATE_IDLE; - endcase - - + else wrk_fsm_state <= wrk_fsm_state_next; + + // // Busy Exit Logic - // - - reg wrk_fsm_done_one_pass = 1'b0; - reg wrk_fsm_done_one_pass_meander = 1'b0; - reg wrk_fsm_done_two_pass = 1'b0; + // + reg wrk_fsm_done = 1'b0; always @(posedge clk) begin // - wrk_fsm_done_one_pass <= 1'b0; - wrk_fsm_done_one_pass_meander <= 1'b0; - wrk_fsm_done_two_pass <= 1'b0; + wrk_fsm_done <= 1'b0; // case (opcode) // @@ -1076,47 +1073,22 @@ module modexpng_general_worker UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_OPCODE_COPY_CRT_Y2X, UOP_OPCODE_MODULAR_REDUCE_INIT, - UOP_OPCODE_MERGE_LH: - // - case (wrk_fsm_state) - WRK_FSM_STATE_BUSY: - if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass <= 1'b1; - endcase - // UOP_OPCODE_COPY_LADDERS_X2Y, UOP_OPCODE_CROSS_LADDERS_X2Y, + UOP_OPCODE_MODULAR_SUBTRACT_X, + UOP_OPCODE_MODULAR_SUBTRACT_Y, + UOP_OPCODE_MODULAR_SUBTRACT_Z, + UOP_OPCODE_MERGE_LH, UOP_OPCODE_REGULAR_ADD_UNEVEN: // case (wrk_fsm_state) - WRK_FSM_STATE_BUSY_M2: - if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; - WRK_FSM_STATE_BUSY_M1: - wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander; + WRK_FSM_STATE_BUSY1: + if (rd_narrow_addr_is_last) wrk_fsm_done <= 1'b1; endcase - // - UOP_OPCODE_MODULAR_SUBTRACT: - // - case (wrk_fsm_state) - WRK_FSM_STATE_BUSY_TP: - if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_two_pass <= 1'b1; - endcase - // // endcase // end - - - // 
- // FSM Helper Logic - // - always @(posedge clk) - // - case (wrk_fsm_state) - WRK_FSM_STATE_IDLE: if (ena) {wrk_fsm_two_pass_pass, wrk_fsm_two_pass_pass_dly} <= {1'b0, 1'b0}; - WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_two_pass_pass <= 1'b1; - WRK_FSM_STATE_HOLDOFF_TP: wrk_fsm_two_pass_pass_dly <= 1'b1; - endcase // @@ -1125,64 +1097,26 @@ module modexpng_general_worker always @* begin // case (wrk_fsm_state) - WRK_FSM_STATE_IDLE: wrk_fsm_state_next_one_pass = ena ? WRK_FSM_STATE_LATENCY_PRE1 : WRK_FSM_STATE_IDLE ; - WRK_FSM_STATE_LATENCY_PRE1: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_PRE2 ; - WRK_FSM_STATE_LATENCY_PRE2: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_BUSY ; - WRK_FSM_STATE_BUSY: wrk_fsm_state_next_one_pass = wrk_fsm_done_one_pass ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY ; - WRK_FSM_STATE_LATENCY_POST1: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_POST2 ; - WRK_FSM_STATE_LATENCY_POST2: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_STOP ; - WRK_FSM_STATE_STOP: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE ; - default: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE ; - endcase - // - end - - always @* begin - // - case (wrk_fsm_state) - WRK_FSM_STATE_IDLE: wrk_fsm_state_next_one_pass_meander = ena ? WRK_FSM_STATE_LATENCY_PRE1_M1 : WRK_FSM_STATE_IDLE ; - // - WRK_FSM_STATE_LATENCY_PRE1_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE1_M2 ; - WRK_FSM_STATE_LATENCY_PRE1_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M1 ; - WRK_FSM_STATE_LATENCY_PRE2_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M2 ; - WRK_FSM_STATE_LATENCY_PRE2_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M1 ; - WRK_FSM_STATE_BUSY_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M2 ; - WRK_FSM_STATE_BUSY_M2: wrk_fsm_state_next_one_pass_meander = wrk_fsm_done_one_pass_meander ? 
WRK_FSM_STATE_LATENCY_POST1_M1 : WRK_FSM_STATE_BUSY_M1 ; - WRK_FSM_STATE_LATENCY_POST1_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST1_M2 ; - WRK_FSM_STATE_LATENCY_POST1_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M1 ; - WRK_FSM_STATE_LATENCY_POST2_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M2 ; - WRK_FSM_STATE_LATENCY_POST2_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_STOP ; - // - WRK_FSM_STATE_STOP: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE ; - // - default: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE ; - endcase - // - end - - always @* begin - // - case (wrk_fsm_state) - WRK_FSM_STATE_IDLE: wrk_fsm_state_next_two_pass = ena ? WRK_FSM_STATE_LATENCY_PRE1_TP : WRK_FSM_STATE_IDLE; - WRK_FSM_STATE_LATENCY_PRE1_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE2_TP ; - WRK_FSM_STATE_LATENCY_PRE2_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE3_TP ; - WRK_FSM_STATE_LATENCY_PRE3_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE4_TP ; - WRK_FSM_STATE_LATENCY_PRE4_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_BUSY_TP ; - WRK_FSM_STATE_BUSY_TP: wrk_fsm_state_next_two_pass = wrk_fsm_done_two_pass ? WRK_FSM_STATE_LATENCY_POST1_TP : WRK_FSM_STATE_BUSY_TP; - WRK_FSM_STATE_LATENCY_POST1_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST2_TP ; - WRK_FSM_STATE_LATENCY_POST2_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST3_TP ; - WRK_FSM_STATE_LATENCY_POST3_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST4_TP ; - WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_HOLDOFF_TP ; - WRK_FSM_STATE_HOLDOFF_TP: wrk_fsm_state_next_two_pass = wrk_fsm_two_pass_pass_dly ? 
WRK_FSM_STATE_STOP : WRK_FSM_STATE_LATENCY_PRE1_TP; - WRK_FSM_STATE_STOP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_IDLE ; - default: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_IDLE ; + WRK_FSM_STATE_IDLE: wrk_fsm_state_next = ena ? WRK_FSM_STATE_LATENCY_PRE1 : WRK_FSM_STATE_IDLE ; + WRK_FSM_STATE_LATENCY_PRE1: wrk_fsm_state_next = WRK_FSM_STATE_LATENCY_PRE2 ; + WRK_FSM_STATE_LATENCY_PRE2: wrk_fsm_state_next = WRK_FSM_STATE_LATENCY_PRE3 ; + WRK_FSM_STATE_LATENCY_PRE3: wrk_fsm_state_next = WRK_FSM_STATE_LATENCY_PRE4 ; + WRK_FSM_STATE_LATENCY_PRE4: wrk_fsm_state_next = WRK_FSM_STATE_BUSY1 ; + WRK_FSM_STATE_BUSY1: wrk_fsm_state_next = WRK_FSM_STATE_BUSY2 ; + WRK_FSM_STATE_BUSY2: wrk_fsm_state_next = wrk_fsm_done ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY1 ; + WRK_FSM_STATE_LATENCY_POST1: wrk_fsm_state_next = WRK_FSM_STATE_LATENCY_POST2 ; + WRK_FSM_STATE_LATENCY_POST2: wrk_fsm_state_next = WRK_FSM_STATE_LATENCY_POST3 ; + WRK_FSM_STATE_LATENCY_POST3: wrk_fsm_state_next = WRK_FSM_STATE_LATENCY_POST4 ; + WRK_FSM_STATE_LATENCY_POST4: wrk_fsm_state_next = WRK_FSM_STATE_STOP ; + WRK_FSM_STATE_STOP: wrk_fsm_state_next = WRK_FSM_STATE_IDLE ; + default: wrk_fsm_state_next = WRK_FSM_STATE_IDLE ; endcase // end - - + + // - // Ready Logic + // Ready Flag Logic // reg rdy_reg = 1'b1; @@ -1198,321 +1132,167 @@ module modexpng_general_worker // - // UOP_OPCODE_PROPAGATE_CARRIES + // Source to Destination Data Logic // - reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r; - reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r; - reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r; - reg [CARRY_W -1:0] rd_narrow_y_din_y_cry_r; - - wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r}; - wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry = wrk_rd_narrow_y_din_x + {{WORD_W{1'b0}}, rd_narrow_y_din_x_cry_r}; - wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r}; - wire [WORD_EXT_W -1:0] 
rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r}; - - wire [CARRY_W -1:0] rd_narrow_x_din_x_w_cry_msb = rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W]; - wire [CARRY_W -1:0] rd_narrow_y_din_x_w_cry_msb = rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W]; - wire [CARRY_W -1:0] rd_narrow_x_din_y_w_cry_msb = rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W]; - wire [CARRY_W -1:0] rd_narrow_y_din_y_w_cry_msb = rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W]; - - wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]}; - wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]}; - wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]}; - wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]}; - + reg [WORD_EXT_W -1:0] rd_wide_x_din_x_dly2; + reg [WORD_EXT_W -1:0] rd_wide_y_din_x_dly2; + reg [WORD_EXT_W -1:0] rd_wide_x_din_y_dly2; + reg [WORD_EXT_W -1:0] rd_wide_y_din_y_dly2; + reg [WORD_EXT_W -1:0] rd_narrow_x_din_x_dly2; + reg [WORD_EXT_W -1:0] rd_narrow_y_din_x_dly2; + reg [WORD_EXT_W -1:0] rd_narrow_x_din_y_dly2; + reg [WORD_EXT_W -1:0] rd_narrow_y_din_y_dly2; + + always @(posedge clk) begin + {rd_wide_x_din_x_dly2, rd_wide_y_din_x_dly2, rd_wide_x_din_y_dly2, rd_wide_y_din_y_dly2 } <= {rd_wide_x_din_x_dly1, rd_wide_y_din_x_dly1, rd_wide_x_din_y_dly1, rd_wide_y_din_y_dly1 }; + {rd_narrow_x_din_x_dly2, rd_narrow_y_din_x_dly2, rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2} <= {rd_narrow_x_din_x_dly1, rd_narrow_y_din_x_dly1, rd_narrow_x_din_y_dly1, rd_narrow_y_din_y_dly1}; + end + task update_wide_dout; input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y; {wr_wide_x_dout_x, wr_wide_y_dout_x, wr_wide_x_dout_y, wr_wide_y_dout_y} <= - { x_x, y_x, x_y, y_y }; + { x_x, y_x, x_y, y_y}; endtask task update_narrow_dout; input 
[WORD_EXT_W-1:0] x_x, y_x, x_y, y_y; {wr_narrow_x_dout_x, wr_narrow_y_dout_x, wr_narrow_x_dout_y, wr_narrow_y_dout_y} <= - { x_x, y_x, x_y, y_y }; - endtask - - task update_narrow_carries; - input [CARRY_W-1:0] x_x_cry, y_x_cry, x_y_cry, y_y_cry; - {rd_narrow_x_din_x_cry_r, rd_narrow_y_din_x_cry_r, rd_narrow_x_din_y_cry_r, rd_narrow_y_din_y_cry_r} <= - { x_x_cry, y_x_cry, x_y_cry, y_y_cry }; + { x_x, y_x, x_y, y_y}; endtask - - always @(posedge clk) - // - if (opcode == UOP_OPCODE_PROPAGATE_CARRIES) - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_LATENCY_PRE2: - // - update_narrow_carries(CARRY_ZERO, CARRY_ZERO, CARRY_ZERO, CARRY_ZERO); - // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1: - // - update_narrow_carries(rd_narrow_x_din_x_w_cry_msb, - rd_narrow_y_din_x_w_cry_msb, - rd_narrow_x_din_y_w_cry_msb, - rd_narrow_y_din_y_w_cry_msb); - // - endcase - - - // - // UOP_OPCODE_MODULAR_SUBTRACT - // - - reg [WORD_W:0] modsub_x_ab; - reg [WORD_W:0] modsub_y_ab; - reg [WORD_W:0] modsub_x_ab_dly; - reg [WORD_W:0] modsub_y_ab_dly; - - reg [WORD_W:0] modsub_x_abn; - reg [WORD_W:0] modsub_y_abn; - - reg modsub_x_ab_mask_now; - reg modsub_y_ab_mask_now; - - reg modsub_x_abn_mask_now; - reg modsub_y_abn_mask_now; - - reg modsub_x_borrow_r; - reg modsub_y_borrow_r; - - wire modsub_x_ab_masked = modsub_x_ab_mask_now ? 1'b0 : modsub_x_ab[WORD_W]; - wire modsub_y_ab_masked = modsub_y_ab_mask_now ? 1'b0 : modsub_y_ab[WORD_W]; - - wire modsub_x_abn_masked = modsub_x_abn_mask_now ? 1'b0 : modsub_x_abn[WORD_W]; - wire modsub_y_abn_masked = modsub_y_abn_mask_now ? 
1'b0 : modsub_y_abn[WORD_W]; - - wire [WORD_W:0] modsub_x_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_x_din_x[WORD_W-1:0]}; - wire [WORD_W:0] modsub_y_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_y_din_x[WORD_W-1:0]}; - wire [WORD_W:0] modsub_x_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_x_din_y[WORD_W-1:0]}; - wire [WORD_W:0] modsub_y_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_y_din_y[WORD_W-1:0]}; - - wire [WORD_W:0] modsub_x_wide_x_lsb_pad = {1'b0, wrk_rd_wide_x_din_x_dly1[WORD_W-1:0]}; - wire [WORD_W:0] modsub_x_wide_y_lsb_pad = {1'b0, wrk_rd_wide_x_din_y_dly1[WORD_W-1:0]}; - - wire [WORD_EXT_W -1:0] modsub_x_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_x_ab_dly[WORD_W-1:0]}; - wire [WORD_EXT_W -1:0] modsub_y_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_y_ab_dly[WORD_W-1:0]}; - - wire [WORD_EXT_W -1:0] modsub_x_abn_trunc = {{CARRY_W{1'b0}}, modsub_x_abn[WORD_W-1:0]}; - wire [WORD_EXT_W -1:0] modsub_y_abn_trunc = {{CARRY_W{1'b0}}, modsub_y_abn[WORD_W-1:0]}; - - wire [WORD_EXT_W -1:0] modsub_x_mux = !modsub_x_borrow_r ? wrk_rd_narrow_x_din_x_dly2 : wrk_rd_narrow_y_din_x_dly2; - wire [WORD_EXT_W -1:0] modsub_y_mux = !modsub_y_borrow_r ? 
wrk_rd_narrow_x_din_y_dly2 : wrk_rd_narrow_y_din_y_dly2; - - wire [WORD_W:0] modsub_x_ab_lsb_pad = {1'b0, modsub_x_ab[WORD_W-1:0]}; - wire [WORD_W:0] modsub_y_ab_lsb_pad = {1'b0, modsub_y_ab[WORD_W-1:0]}; - - task update_modsub_ab; - begin - modsub_x_ab <= modsub_x_narrow_x_lsb_pad - modsub_y_narrow_x_lsb_pad - modsub_x_ab_masked; - modsub_y_ab <= modsub_x_narrow_y_lsb_pad - modsub_y_narrow_y_lsb_pad - modsub_y_ab_masked; - end - endtask - - task update_modsub_abn; - begin - modsub_x_abn <= modsub_x_ab_lsb_pad + modsub_x_wide_x_lsb_pad + modsub_x_abn_masked; - modsub_y_abn <= modsub_y_ab_lsb_pad + modsub_x_wide_y_lsb_pad + modsub_y_abn_masked; - end - endtask - - always @(posedge clk) - // - if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) - // - case (wrk_fsm_state) - WRK_FSM_STATE_LATENCY_POST4_TP: - if (!wrk_fsm_two_pass_pass) - {modsub_x_borrow_r, modsub_y_borrow_r} <= {modsub_x_ab_dly[WORD_W], modsub_y_ab_dly[WORD_W]}; - endcase - - always @(posedge clk) begin - modsub_x_ab_dly <= modsub_x_ab; - modsub_y_ab_dly <= modsub_y_ab; - end - always @(posedge clk) begin // - modsub_x_ab <= {1'bX, WORD_DNC}; - modsub_y_ab <= {1'bX, WORD_DNC}; - // - modsub_x_abn <= {1'bX, WORD_DNC}; - modsub_y_abn <= {1'bX, WORD_DNC}; + update_wide_dout (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC); + update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC); // - if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) + case (opcode) // - case (wrk_fsm_state) - // - WRK_FSM_STATE_LATENCY_PRE3_TP: - update_modsub_ab; - - WRK_FSM_STATE_LATENCY_PRE4_TP, - WRK_FSM_STATE_BUSY_TP, - WRK_FSM_STATE_LATENCY_POST1_TP, - WRK_FSM_STATE_LATENCY_POST2_TP: begin - update_modsub_ab; - update_modsub_abn; - end + UOP_OPCODE_PROPAGATE_CARRIES: // - WRK_FSM_STATE_LATENCY_POST3_TP: + case (wrk_fsm_state) // - update_modsub_abn; - // - endcase - // - end - - always @(posedge clk) begin - // - modsub_x_ab_mask_now <= 1'b0; - modsub_y_ab_mask_now <= 1'b0; - // - modsub_x_abn_mask_now <= 1'b0; - 
modsub_y_abn_mask_now <= 1'b0; - // - if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + update_narrow_dout(propagate_carries_x_x_w_cry_reduced, propagate_carries_y_x_w_cry_reduced, propagate_carries_x_y_w_cry_reduced, propagate_carries_y_y_w_cry_reduced); + // + endcase // - case (wrk_fsm_state) - // - WRK_FSM_STATE_LATENCY_PRE2_TP: begin - modsub_x_ab_mask_now <= 1'b1; - modsub_y_ab_mask_now <= 1'b1; - end + UOP_OPCODE_COPY_CRT_Y2X: // - WRK_FSM_STATE_LATENCY_PRE3_TP: begin - modsub_x_abn_mask_now <= 1'b1; - modsub_y_abn_mask_now <= 1'b1; - end + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + begin update_narrow_dout(rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2, rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2); + update_wide_dout (rd_wide_x_din_y_dly2, rd_wide_y_din_y_dly2, rd_wide_x_din_y_dly2, rd_wide_y_din_y_dly2); end + // + endcase + // + UOP_OPCODE_MODULAR_REDUCE_INIT: // - endcase - // - end - - - // - // UOP_OPCODE_ADD_UNEVEN - // - reg [WORD_W:0] regadd_x_x; - reg [WORD_W:0] regadd_y_x; - reg [WORD_W:0] regadd_x_y; - reg [WORD_W:0] regadd_y_y; - - reg regadd_x_x_cry; - reg regadd_y_x_cry; - reg regadd_x_y_cry; - reg regadd_y_y_cry; - - wire [WORD_EXT_W-1:0] regadd_x_x_trunc = {{CARRY_W{1'b0}}, regadd_x_x[WORD_W-1:0]}; - wire [WORD_EXT_W-1:0] regadd_y_x_trunc = {{CARRY_W{1'b0}}, regadd_y_x[WORD_W-1:0]}; - wire [WORD_EXT_W-1:0] regadd_x_y_trunc = {{CARRY_W{1'b0}}, regadd_x_y[WORD_W-1:0]}; - wire [WORD_EXT_W-1:0] regadd_y_y_trunc = {{CARRY_W{1'b0}}, regadd_y_y[WORD_W-1:0]}; - - //wire regadd_x_x_masked = regadd_xy_ab_x_mask_now ? 1'b0 : regadd_x_x[WORD_W]; - //wire regadd_y_x_masked = regadd_xy_ab_x_mask_now ? 1'b0 : regadd_y_x[WORD_W]; - //wire regadd_x_y_masked = regadd_xy_ab_y_mask_now ? 1'b0 : regadd_x_y[WORD_W]; - //wire regadd_y_y_masked = regadd_xy_ab_y_mask_now ? 
1'b0 : regadd_y_y[WORD_W]; - /**/ - reg [WORD_W:0] regadd_x_x_a_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_x_dly2[WORD_W-1:0]}; - reg [WORD_W:0] regadd_x_x_b_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_x_dly1[WORD_W-1:0]}; - reg [WORD_W:0] regadd_y_x_a_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_x_dly2[WORD_W-1:0]}; - reg [WORD_W:0] regadd_y_x_b_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_x_dly1[WORD_W-1:0]}; - reg [WORD_W:0] regadd_x_y_a_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_y_dly2[WORD_W-1:0]}; - reg [WORD_W:0] regadd_x_y_b_lsb_pad; //= {1'b0, wrk_rd_narrow_x_din_y_dly1[WORD_W-1:0]}; - reg [WORD_W:0] regadd_y_y_a_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_y_dly2[WORD_W-1:0]}; - reg [WORD_W:0] regadd_y_y_b_lsb_pad; //= {1'b0, wrk_rd_narrow_y_din_y_dly1[WORD_W-1:0]}; - /**/ - //WRK_FSM_STATE_BUSY_M1, - //WRK_FSM_STATE_LATENCY_POST1_M1, - //WRK_FSM_STATE_LATENCY_POST2_M1: - - always @(posedge clk) begin - // - regadd_x_x_a_lsb_pad <= {1'bX, WORD_DNC}; - regadd_x_x_b_lsb_pad <= {1'bX, WORD_DNC}; - regadd_y_x_a_lsb_pad <= {1'bX, WORD_DNC}; - regadd_y_x_b_lsb_pad <= {1'bX, WORD_DNC}; - regadd_x_y_a_lsb_pad <= {1'bX, WORD_DNC}; - regadd_x_y_b_lsb_pad <= {1'bX, WORD_DNC}; - regadd_y_y_a_lsb_pad <= {1'bX, WORD_DNC}; - regadd_y_y_b_lsb_pad <= {1'bX, WORD_DNC}; - // - if (opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN) + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + update_wide_dout(rd_narrow_x_din_x_dly2, rd_narrow_y_din_x_dly2, rd_narrow_x_din_y_dly2, rd_narrow_y_din_y_dly2); + // + endcase // - case (wrk_fsm_state) + UOP_OPCODE_COPY_LADDERS_X2Y: // - WRK_FSM_STATE_LATENCY_PRE2_M2, - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2: begin - regadd_x_x_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_x_din_x_dly1[WORD_W-1:0] : WORD_ZERO}; - regadd_x_x_b_lsb_pad <= {1'b0, wrk_rd_narrow_x_din_x [WORD_W-1:0] }; - regadd_y_x_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? 
wrk_rd_narrow_y_din_x_dly1[WORD_W-1:0] : WORD_ZERO}; - regadd_y_x_b_lsb_pad <= {1'b0, wrk_rd_narrow_y_din_x [WORD_W-1:0] }; - regadd_x_y_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_x_din_y_dly1[WORD_W-1:0] : WORD_ZERO}; - regadd_x_y_b_lsb_pad <= {1'b0, wrk_rd_narrow_x_din_y [WORD_W-1:0] }; - regadd_y_y_a_lsb_pad <= {1'b0, !rd_wide_xy_addr_xy_next_last_seen_dly2 ? wrk_rd_narrow_y_din_y_dly1[WORD_W-1:0] : WORD_ZERO}; - regadd_y_y_b_lsb_pad <= {1'b0, wrk_rd_narrow_y_din_y [WORD_W-1:0] }; - end + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + begin update_wide_dout (rd_wide_x_din_x_dly1, rd_wide_x_din_x_dly2, rd_wide_x_din_y_dly1, rd_wide_x_din_y_dly2); + update_narrow_dout(rd_narrow_x_din_x_dly1, rd_narrow_x_din_x_dly2, rd_narrow_x_din_y_dly1, rd_narrow_x_din_y_dly2); end + // + endcase + // + UOP_OPCODE_CROSS_LADDERS_X2Y: // - endcase - end - - always @(posedge clk) begin - // - regadd_x_x <= {1'bX, WORD_DNC}; - regadd_y_x <= {1'bX, WORD_DNC}; - regadd_x_y <= {1'bX, WORD_DNC}; - regadd_y_y <= {1'bX, WORD_DNC}; - // - if (opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN) + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + begin update_wide_dout (rd_wide_x_din_x_dly1, rd_wide_x_din_y_dly2, rd_wide_x_din_y_dly1, rd_wide_x_din_x_dly2); + update_narrow_dout(rd_narrow_x_din_x_dly1, rd_narrow_x_din_y_dly2, rd_narrow_x_din_y_dly1, rd_narrow_x_din_x_dly2); end + // + endcase // - case (wrk_fsm_state) + UOP_OPCODE_MODULAR_SUBTRACT_X: // - WRK_FSM_STATE_BUSY_M1, - WRK_FSM_STATE_LATENCY_POST1_M1, - WRK_FSM_STATE_LATENCY_POST2_M1: begin - regadd_x_x <= regadd_x_x_a_lsb_pad + regadd_x_x_b_lsb_pad + regadd_x_x_cry; - regadd_y_x <= regadd_y_x_a_lsb_pad + regadd_y_x_b_lsb_pad + regadd_y_x_cry; - regadd_x_y <= regadd_x_y_a_lsb_pad + regadd_x_y_b_lsb_pad + regadd_x_y_cry; - regadd_y_y <= regadd_y_y_a_lsb_pad + 
regadd_y_y_b_lsb_pad + regadd_y_y_cry; - end + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + update_narrow_dout(modular_subtract_x_w_brw_reduced, modular_subtract_x_w_brw_reduced, modular_subtract_y_w_brw_reduced, modular_subtract_y_w_brw_reduced); + // + endcase + // + UOP_OPCODE_MODULAR_SUBTRACT_Y: // - endcase - // - end - - always @(posedge clk) begin - // - regadd_x_x_cry <= 1'bX; - regadd_y_x_cry <= 1'bX; - regadd_x_y_cry <= 1'bX; - regadd_y_y_cry <= 1'bX; - // - if (opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN) + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + update_wide_dout(modular_subtract_x_w_cry_reduced, modular_subtract_x_w_cry_reduced, modular_subtract_y_w_cry_reduced, modular_subtract_y_w_cry_reduced); + // + endcase // - case (wrk_fsm_state) + UOP_OPCODE_MODULAR_SUBTRACT_Z: // - WRK_FSM_STATE_LATENCY_PRE2_M2: begin - regadd_x_x_cry <= 1'b0; - regadd_y_x_cry <= 1'b0; - regadd_x_y_cry <= 1'b0; - regadd_y_y_cry <= 1'b0; - end + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + begin update_wide_dout (modular_subtract_x_mux_reduced, modular_subtract_x_mux_reduced, modular_subtract_y_mux_reduced, modular_subtract_y_mux_reduced); + update_narrow_dout(modular_subtract_x_mux_reduced, modular_subtract_x_mux_reduced, modular_subtract_y_mux_reduced, modular_subtract_y_mux_reduced); end + // + endcase + // + UOP_OPCODE_MERGE_LH: // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2: begin - regadd_x_x_cry <= regadd_x_x[WORD_W]; - regadd_y_x_cry <= regadd_y_x[WORD_W]; - regadd_x_y_cry <= regadd_x_y[WORD_W]; - regadd_y_y_cry <= regadd_y_y[WORD_W]; - end + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + update_narrow_dout(rd_wide_x_din_x_dly2, rd_wide_y_din_x_dly2, rd_wide_x_din_y_dly2, 
rd_wide_y_din_y_dly2); + // + endcase + // + UOP_OPCODE_REGULAR_ADD_UNEVEN: // - endcase - // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY1, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST3: + // + update_narrow_dout(regular_add_uneven_x_x_w_cry_reduced, regular_add_uneven_y_x_w_cry_reduced, regular_add_uneven_x_y_w_cry_reduced, regular_add_uneven_y_y_w_cry_reduced); + // + endcase + endcase + // end + endmodule -- cgit v1.2.3