From 1e3303286bdb0d400d78d9d8b0aa90b29949c4a3 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Mon, 21 Oct 2019 12:44:33 +0300 Subject: Refactored general worker module Added modular subtraction micro-operation --- rtl/modexpng_core_top.v | 20 +- rtl/modexpng_general_worker.v | 1180 +++++++++++++++++++++++++---------------- rtl/modexpng_microcode.vh | 12 +- rtl/modexpng_uop_rom.v | 26 +- 4 files changed, 767 insertions(+), 471 deletions(-) (limited to 'rtl') diff --git a/rtl/modexpng_core_top.v b/rtl/modexpng_core_top.v index c78a969..dea7f0a 100644 --- a/rtl/modexpng_core_top.v +++ b/rtl/modexpng_core_top.v @@ -87,7 +87,9 @@ module modexpng_core_top wire uop_opcode_is_wrk = (uop_data_opcode == UOP_OPCODE_PROPAGATE_CARRIES ) || (uop_data_opcode == UOP_OPCODE_COPY_CRT_Y2X ) || (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_INIT ) || - (uop_data_opcode == UOP_OPCODE_COPY_LADDERS_X2Y ) ; + (uop_data_opcode == UOP_OPCODE_COPY_LADDERS_X2Y ) || + (uop_data_opcode == UOP_OPCODE_CROSS_LADDERS_X2Y ) || + (uop_data_opcode == UOP_OPCODE_MODULAR_SUBTRACT ) ; wire uop_loop_now; @@ -1113,8 +1115,15 @@ module modexpng_core_top wrk_sel_narrow_out <= uop_data_sel_narrow_out; end // + UOP_OPCODE_MODULAR_SUBTRACT: begin + wrk_sel_wide_out <= uop_data_sel_wide_out; + wrk_sel_narrow_in <= uop_data_sel_narrow_in; + wrk_sel_narrow_out <= uop_data_sel_narrow_out; + end + // UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_COPY_LADDERS_X2Y: begin + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin wrk_sel_wide_in <= uop_data_sel_wide_in; wrk_sel_wide_out <= uop_data_sel_wide_out; wrk_sel_narrow_in <= uop_data_sel_narrow_in; @@ -1157,7 +1166,8 @@ module modexpng_core_top // UOP_OPCODE_PROPAGATE_CARRIES, UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_COPY_LADDERS_X2Y: + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: wrk_word_index_last <= uop_npq_is_n ? word_index_last_n : word_index_last_pq; // UOP_OPCODE_MODULAR_REDUCE_INIT: begin @@ -1171,6 +1181,10 @@ module modexpng_core_top {rdct_word_index_last_x, rdct_word_index_last_y } <= {2{word_index_last_pq }}; end // + UOP_OPCODE_MODULAR_SUBTRACT: begin + wrk_word_index_last <= uop_npq_is_n ? word_index_last_n : word_index_last_pq; + end + // UOP_OPCODE_LADDER_INIT: begin io_mgr_word_index_last <= OP_ADDR_LADDER_LAST; io_mgr_ladder_steps <= crt_mode ? bit_index_last_pq : bit_index_last_n; diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v index 269ef98..74c939b 100644 --- a/rtl/modexpng_general_worker.v +++ b/rtl/modexpng_general_worker.v @@ -1,70 +1,22 @@ module modexpng_general_worker ( - clk, - rst, - - ena, - rdy, - - sel_narrow_in, - sel_narrow_out, - sel_wide_in, - sel_wide_out, - + clk, rst, + ena, rdy, + sel_narrow_in, sel_narrow_out, + sel_wide_in, sel_wide_out, opcode, - - word_index_last, - word_index_last_half, - - wrk_rd_wide_xy_ena_x, - wrk_rd_wide_xy_bank_x, - wrk_rd_wide_xy_addr_x, - wrk_rd_wide_x_din_x, - wrk_rd_wide_y_din_x, - - wrk_rd_narrow_xy_ena_x, - wrk_rd_narrow_xy_bank_x, - wrk_rd_narrow_xy_addr_x, - wrk_rd_narrow_x_din_x, - wrk_rd_narrow_y_din_x, - - wrk_rd_wide_xy_ena_y, - wrk_rd_wide_xy_bank_y, - wrk_rd_wide_xy_addr_y, - wrk_rd_wide_x_din_y, - wrk_rd_wide_y_din_y, - - wrk_rd_narrow_xy_ena_y, - wrk_rd_narrow_xy_bank_y, - wrk_rd_narrow_xy_addr_y, - wrk_rd_narrow_x_din_y, - wrk_rd_narrow_y_din_y, - - wrk_wr_wide_xy_ena_x, - wrk_wr_wide_xy_bank_x, - wrk_wr_wide_xy_addr_x, - wrk_wr_wide_x_dout_x, - wrk_wr_wide_y_dout_x, - - wrk_wr_narrow_xy_ena_x, - wrk_wr_narrow_xy_bank_x, - wrk_wr_narrow_xy_addr_x, - wrk_wr_narrow_x_dout_x, - wrk_wr_narrow_y_dout_x, - - wrk_wr_wide_xy_ena_y, - wrk_wr_wide_xy_bank_y, - wrk_wr_wide_xy_addr_y, - wrk_wr_wide_x_dout_y, - wrk_wr_wide_y_dout_y, - - wrk_wr_narrow_xy_ena_y, - wrk_wr_narrow_xy_bank_y, - wrk_wr_narrow_xy_addr_y, - wrk_wr_narrow_x_dout_y, - wrk_wr_narrow_y_dout_y + word_index_last, word_index_last_half, + wrk_rd_wide_xy_ena_x, wrk_rd_wide_xy_bank_x, wrk_rd_wide_xy_addr_x, wrk_rd_wide_x_din_x, wrk_rd_wide_y_din_x, + wrk_rd_narrow_xy_ena_x, wrk_rd_narrow_xy_bank_x, wrk_rd_narrow_xy_addr_x, wrk_rd_narrow_x_din_x, wrk_rd_narrow_y_din_x, + wrk_rd_wide_xy_ena_y, wrk_rd_wide_xy_bank_y, wrk_rd_wide_xy_addr_y, wrk_rd_wide_x_din_y, wrk_rd_wide_y_din_y, + wrk_rd_narrow_xy_ena_y, wrk_rd_narrow_xy_bank_y, wrk_rd_narrow_xy_addr_y, wrk_rd_narrow_x_din_y, wrk_rd_narrow_y_din_y, + wrk_wr_wide_xy_ena_x, wrk_wr_wide_xy_bank_x, wrk_wr_wide_xy_addr_x, wrk_wr_wide_x_dout_x, wrk_wr_wide_y_dout_x, + wrk_wr_narrow_xy_ena_x, wrk_wr_narrow_xy_bank_x, wrk_wr_narrow_xy_addr_x, wrk_wr_narrow_x_dout_x, wrk_wr_narrow_y_dout_x, + wrk_wr_wide_xy_ena_y, wrk_wr_wide_xy_bank_y, wrk_wr_wide_xy_addr_y, wrk_wr_wide_x_dout_y, wrk_wr_wide_y_dout_y, + wrk_wr_narrow_xy_ena_y, wrk_wr_narrow_xy_bank_y, wrk_wr_narrow_xy_addr_y, wrk_wr_narrow_x_dout_y, wrk_wr_narrow_y_dout_y ); + // // Headers // @@ -143,30 +95,44 @@ module modexpng_general_worker // // FSM Declaration // - localparam [4:0] WRK_FSM_STATE_IDLE = 5'h00; + localparam [5:0] WRK_FSM_STATE_IDLE = 6'h00; - localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1 = 5'h01; - localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2 = 5'h02; - localparam [4:0] WRK_FSM_STATE_BUSY = 5'h03; - localparam [4:0] WRK_FSM_STATE_LATENCY_POST1 = 5'h05; // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug! - localparam [4:0] WRK_FSM_STATE_LATENCY_POST2 = 5'h06; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1 = 6'h01; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2 = 6'h02; + localparam [5:0] WRK_FSM_STATE_BUSY = 6'h03; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST1 = 6'h05; // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug! + localparam [5:0] WRK_FSM_STATE_LATENCY_POST2 = 6'h06; - localparam [4:0] WRK_FSM_STATE_STOP = 5'h07; + localparam [5:0] WRK_FSM_STATE_STOP = 6'h07; - localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M1 = 5'h10; - localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M2 = 5'h11; - localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M1 = 5'h12; - localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M2 = 5'h13; - localparam [4:0] WRK_FSM_STATE_BUSY_M1 = 5'h14; - localparam [4:0] WRK_FSM_STATE_BUSY_M2 = 5'h15; - localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 5'h16; - localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 5'h17; - localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 5'h18; - localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 5'h19; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M1 = 6'h10; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_M2 = 6'h11; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M1 = 6'h12; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_M2 = 6'h13; + localparam [5:0] WRK_FSM_STATE_BUSY_M1 = 6'h14; + localparam [5:0] WRK_FSM_STATE_BUSY_M2 = 6'h15; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 6'h16; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 6'h17; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 6'h18; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 6'h19; + + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE1_TP = 6'h20; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE2_TP = 6'h21; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE3_TP = 6'h22; + localparam [5:0] WRK_FSM_STATE_LATENCY_PRE4_TP = 6'h23; + localparam [5:0] WRK_FSM_STATE_BUSY_TP = 6'h24; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST1_TP = 6'h25; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST2_TP = 6'h26; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST3_TP = 6'h27; + localparam [5:0] WRK_FSM_STATE_LATENCY_POST4_TP = 6'h28; + localparam [5:0] WRK_FSM_STATE_HOLDOFF_TP = 6'h29; - reg [4:0] wrk_fsm_state = WRK_FSM_STATE_IDLE; - reg [4:0] wrk_fsm_state_next_one_pass; // single address space sweep - reg [4:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y) + reg [5:0] wrk_fsm_state = WRK_FSM_STATE_IDLE; + reg [5:0] wrk_fsm_state_next_one_pass; // single address space sweep + reg [5:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y) + reg [5:0] wrk_fsm_state_next_two_pass; // two address space sweeps + reg wrk_fsm_two_pass_pass; // 0=first pass, 1=second pass + reg wrk_fsm_two_pass_pass_dly; // 0=first pass, 1=second pass // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps) @@ -292,37 +258,36 @@ module modexpng_general_worker reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1; reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2; reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly1; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_x_dly2; reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1; reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2; reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3; - + reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly1; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_y_din_y_dly2; always @(posedge clk) begin // - {rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x}; - {rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y}; - // - {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2}; - {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2}; - // - {rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x}; - {rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y}; + {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x}; + {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y}; // - {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2}; - {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2}; + {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x}; + {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y}; // {wrk_rd_wide_x_din_x_dly3, wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1, wrk_rd_wide_x_din_x}; {wrk_rd_wide_x_din_y_dly3, wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1} <= {wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1, wrk_rd_wide_x_din_y}; // {wrk_rd_narrow_x_din_x_dly3, wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1, wrk_rd_narrow_x_din_x}; - {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y}; + {wrk_rd_narrow_y_din_x_dly2, wrk_rd_narrow_y_din_x_dly1} <= {wrk_rd_narrow_y_din_x_dly1, wrk_rd_narrow_y_din_x}; + {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y}; + {wrk_rd_narrow_y_din_y_dly2, wrk_rd_narrow_y_din_y_dly1} <= {wrk_rd_narrow_y_din_y_dly1, wrk_rd_narrow_y_din_y}; // end // - // Read Enable Logic + // Source Read Enable Logic // task _update_wide_xy_rd_en; input _en; {rd_wide_xy_ena_x, rd_wide_xy_ena_y } <= {2{_en}}; endtask @@ -340,48 +305,54 @@ module modexpng_general_worker // disable_wide_xy_rd_en; disable_narrow_xy_rd_en; - /* - rd_wide_xy_ena_x <= 1'b0; - rd_wide_xy_ena_y <= 1'b0; - rd_narrow_xy_ena_x <= 1'b0; - rd_narrow_xy_ena_y <= 1'b0; - */ + // end else begin // disable_wide_xy_rd_en; disable_narrow_xy_rd_en; // - //rd_wide_xy_ena_x <= 1'b0; - //rd_wide_xy_ena_y <= 1'b0; - //rd_narrow_xy_ena_x <= 1'b0; - //rd_narrow_xy_ena_y <= 1'b0; + // one_pass // - case (opcode) + case (wrk_fsm_state_next_one_pass) // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_MODULAR_REDUCE_INIT: + WRK_FSM_STATE_LATENCY_PRE1, + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_BUSY: // - case (wrk_fsm_state_next_one_pass) + case (opcode) // - WRK_FSM_STATE_LATENCY_PRE1, - WRK_FSM_STATE_LATENCY_PRE2, - WRK_FSM_STATE_BUSY: + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_OUTPUT_FROM_NARROW, + UOP_OPCODE_MODULAR_REDUCE_INIT: // enable_narrow_xy_rd_en; - //{rd_narrow_xy_ena_x, rd_narrow_xy_ena_y} <= {2{1'b1}}; // + UOP_OPCODE_COPY_CRT_Y2X: begin + // + enable_wide_xy_rd_en; + enable_narrow_xy_rd_en; + // + end // endcase - // // - UOP_OPCODE_COPY_CRT_Y2X: + endcase + // + // one_pass_meander + // + case (wrk_fsm_state_next_one_pass_meander) + // + WRK_FSM_STATE_LATENCY_PRE1_M1, + WRK_FSM_STATE_LATENCY_PRE1_M2, + WRK_FSM_STATE_LATENCY_PRE2_M1, + WRK_FSM_STATE_LATENCY_PRE2_M2, + WRK_FSM_STATE_BUSY_M1, + WRK_FSM_STATE_BUSY_M2: // - case (wrk_fsm_state_next_one_pass) + case (opcode) // - WRK_FSM_STATE_LATENCY_PRE1, - WRK_FSM_STATE_LATENCY_PRE2, - WRK_FSM_STATE_BUSY: begin + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin // enable_wide_xy_rd_en; enable_narrow_xy_rd_en; @@ -389,24 +360,29 @@ module modexpng_general_worker end // endcase + // + endcase + // + // two_pass + // + case (wrk_fsm_state_next_two_pass) + // + WRK_FSM_STATE_LATENCY_PRE1_TP, + WRK_FSM_STATE_LATENCY_PRE2_TP, + WRK_FSM_STATE_LATENCY_PRE3_TP, + WRK_FSM_STATE_LATENCY_PRE4_TP, + WRK_FSM_STATE_BUSY_TP: // - UOP_OPCODE_COPY_LADDERS_X2Y: - // - case (wrk_fsm_state_next_one_pass_meander) - // - WRK_FSM_STATE_LATENCY_PRE1_M1, - WRK_FSM_STATE_LATENCY_PRE1_M2, - WRK_FSM_STATE_LATENCY_PRE2_M1, - WRK_FSM_STATE_LATENCY_PRE2_M2, - WRK_FSM_STATE_BUSY_M1, - WRK_FSM_STATE_BUSY_M2: begin - // - enable_wide_xy_rd_en; - enable_narrow_xy_rd_en; + case (opcode) + UOP_OPCODE_MODULAR_SUBTRACT: // - end + if (!wrk_fsm_two_pass_pass) begin + enable_wide_xy_rd_en; + enable_narrow_xy_rd_en; + end else + enable_narrow_xy_rd_en; // - endcase + endcase // endcase // @@ -414,7 +390,7 @@ module modexpng_general_worker // - // Write Enable Logic + // Destination Write Enable Logic // task _update_wide_xy_wr_en; input _en; {wr_wide_xy_ena_x, wr_wide_xy_ena_y } <= {2{_en}}; endtask @@ -432,71 +408,53 @@ module modexpng_general_worker // disable_wide_xy_wr_en; disable_narrow_xy_wr_en; - //wr_wide_xy_ena_x <= 1'b0; - //wr_wide_xy_ena_y <= 1'b0; - //wr_narrow_xy_ena_x <= 1'b0; - //wr_narrow_xy_ena_y <= 1'b0; // end else begin // disable_wide_xy_wr_en; disable_narrow_xy_wr_en; // - //wr_wide_xy_ena_x <= 1'b0; - //wr_wide_xy_ena_y <= 1'b0; - //wr_narrow_xy_ena_x <= 1'b0; - //wr_narrow_xy_ena_y <= 1'b0; + // one_pass // - case (opcode) + case (wrk_fsm_state) // - UOP_OPCODE_PROPAGATE_CARRIES: + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST2: // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: + UOP_OPCODE_PROPAGATE_CARRIES: // enable_narrow_xy_wr_en; // - // - endcase - // - UOP_OPCODE_COPY_CRT_Y2X: - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: begin + UOP_OPCODE_COPY_CRT_Y2X: begin // enable_wide_xy_wr_en; - enable_narrow_xy_wr_en; + enable_narrow_xy_wr_en; // end // - endcase - // - UOP_OPCODE_MODULAR_REDUCE_INIT: - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: + UOP_OPCODE_MODULAR_REDUCE_INIT: // enable_wide_xy_wr_en; - // // endcase + // + endcase + // + // one_pass_meander + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_M2, + WRK_FSM_STATE_LATENCY_POST1_M2, + WRK_FSM_STATE_LATENCY_POST2_M2: // - UOP_OPCODE_COPY_LADDERS_X2Y: - // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2, - WRK_FSM_STATE_LATENCY_POST2_M2: begin + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin // enable_wide_xy_wr_en; enable_narrow_xy_wr_en; @@ -507,12 +465,42 @@ module modexpng_general_worker // endcase // + // two_pass + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_TP, + WRK_FSM_STATE_LATENCY_POST1_TP, + WRK_FSM_STATE_LATENCY_POST2_TP, + WRK_FSM_STATE_LATENCY_POST3_TP, + WRK_FSM_STATE_LATENCY_POST4_TP: + // + case (opcode) + // + UOP_OPCODE_MODULAR_SUBTRACT: + // + if (!wrk_fsm_two_pass_pass) + enable_narrow_xy_wr_en; + else begin + enable_wide_xy_wr_en; + enable_narrow_xy_wr_en; + end + // + endcase + // + endcase + // end // - // Data Logic + // Source to Destination Data Logic + // + + // + // UOP_OPCODE_PROPAGATE_CARRIES // + reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r; reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r; reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r; @@ -523,112 +511,300 @@ module modexpng_general_worker wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r}; wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r}; + wire [CARRY_W -1:0] rd_narrow_x_din_x_w_cry_msb = rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] rd_narrow_y_din_x_w_cry_msb = rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] rd_narrow_x_din_y_w_cry_msb = rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W]; + wire [CARRY_W -1:0] rd_narrow_y_din_y_w_cry_msb = rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W]; + wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]}; wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]}; wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]}; wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]}; + task update_wide_dout; + input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y; + {wr_wide_x_dout_x, wr_wide_y_dout_x, wr_wide_x_dout_y, wr_wide_y_dout_y} <= + { x_x, y_x, x_y, y_y }; + endtask + + task update_narrow_dout; + input [WORD_EXT_W-1:0] x_x, y_x, x_y, y_y; + {wr_narrow_x_dout_x, wr_narrow_y_dout_x, wr_narrow_x_dout_y, wr_narrow_y_dout_y} <= + { x_x, y_x, x_y, y_y }; + endtask + + task update_narrow_carries; + input [CARRY_W-1:0] x_x_cry, y_x_cry, x_y_cry, y_y_cry; + {rd_narrow_x_din_x_cry_r, rd_narrow_y_din_x_cry_r, rd_narrow_x_din_y_cry_r, rd_narrow_y_din_y_cry_r} <= + { x_x_cry, y_x_cry, x_y_cry, y_y_cry }; + endtask + + + always @(posedge clk) + // + if (opcode == UOP_OPCODE_PROPAGATE_CARRIES) + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE2: + // + update_narrow_carries(CARRY_ZERO, CARRY_ZERO, CARRY_ZERO, CARRY_ZERO); + // + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1: + // + update_narrow_carries(rd_narrow_x_din_x_w_cry_msb, + rd_narrow_y_din_x_w_cry_msb, + rd_narrow_x_din_y_w_cry_msb, + rd_narrow_y_din_y_w_cry_msb); + // + endcase + + + // + // UOP_OPCODE_MODULAR_SUBTRACT + // + + reg [WORD_W:0] modsub_x_ab; + reg [WORD_W:0] modsub_y_ab; + + reg [WORD_W:0] modsub_x_ab_dly; + reg [WORD_W:0] modsub_y_ab_dly; + + reg [WORD_W:0] modsub_x_abn; + reg [WORD_W:0] modsub_y_abn; + + reg modsub_x_ab_mask_now; + reg modsub_y_ab_mask_now; + + reg modsub_x_abn_mask_now; + reg modsub_y_abn_mask_now; + + reg modsub_x_borrow_r; + reg modsub_y_borrow_r; + + wire modsub_x_ab_masked = modsub_x_ab_mask_now ? 1'b0 : modsub_x_ab[WORD_W]; + wire modsub_y_ab_masked = modsub_y_ab_mask_now ? 1'b0 : modsub_y_ab[WORD_W]; + + wire modsub_x_abn_masked = modsub_x_abn_mask_now ? 1'b0 : modsub_x_abn[WORD_W]; + wire modsub_y_abn_masked = modsub_y_abn_mask_now ? 1'b0 : modsub_y_abn[WORD_W]; + + wire [WORD_W:0] modsub_x_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_x_din_x[WORD_W-1:0]}; + wire [WORD_W:0] modsub_y_narrow_x_lsb_pad = {1'b0, wrk_rd_narrow_y_din_x[WORD_W-1:0]}; + wire [WORD_W:0] modsub_x_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_x_din_y[WORD_W-1:0]}; + wire [WORD_W:0] modsub_y_narrow_y_lsb_pad = {1'b0, wrk_rd_narrow_y_din_y[WORD_W-1:0]}; + + wire [WORD_W:0] modsub_x_wide_x_lsb_pad = {1'b0, wrk_rd_wide_x_din_x_dly1[WORD_W-1:0]}; + wire [WORD_W:0] modsub_x_wide_y_lsb_pad = {1'b0, wrk_rd_wide_x_din_y_dly1[WORD_W-1:0]}; + + wire [WORD_EXT_W -1:0] modsub_x_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_x_ab_dly[WORD_W-1:0]}; + wire [WORD_EXT_W -1:0] modsub_y_ab_dly_trunc = {{CARRY_W{1'b0}}, modsub_y_ab_dly[WORD_W-1:0]}; + + wire [WORD_EXT_W -1:0] modsub_x_abn_trunc = {{CARRY_W{1'b0}}, modsub_x_abn[WORD_W-1:0]}; + wire [WORD_EXT_W -1:0] modsub_y_abn_trunc = {{CARRY_W{1'b0}}, modsub_y_abn[WORD_W-1:0]}; + + wire [WORD_EXT_W -1:0] modsub_x_mux = !modsub_x_borrow_r ? wrk_rd_narrow_x_din_x_dly2 : wrk_rd_narrow_y_din_x_dly2; + wire [WORD_EXT_W -1:0] modsub_y_mux = !modsub_y_borrow_r ? wrk_rd_narrow_x_din_y_dly2 : wrk_rd_narrow_y_din_y_dly2; + + wire [WORD_W:0] modsub_x_ab_lsb_pad = {1'b0, modsub_x_ab[WORD_W-1:0]}; + wire [WORD_W:0] modsub_y_ab_lsb_pad = {1'b0, modsub_y_ab[WORD_W-1:0]}; + + task update_modsub_ab; + begin + modsub_x_ab <= modsub_x_narrow_x_lsb_pad - modsub_y_narrow_x_lsb_pad - modsub_x_ab_masked; + modsub_y_ab <= modsub_x_narrow_y_lsb_pad - modsub_y_narrow_y_lsb_pad - modsub_y_ab_masked; + end + endtask + + task update_modsub_abn; + begin + modsub_x_abn <= modsub_x_ab_lsb_pad + modsub_x_wide_x_lsb_pad + modsub_x_abn_masked; + modsub_y_abn <= modsub_y_ab_lsb_pad + modsub_x_wide_y_lsb_pad + modsub_y_abn_masked; + end + endtask + + always @(posedge clk) + // + if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) + // + case (wrk_fsm_state) + WRK_FSM_STATE_LATENCY_POST4_TP: + if (!wrk_fsm_two_pass_pass) + {modsub_x_borrow_r, modsub_y_borrow_r} <= {modsub_x_ab_dly[WORD_W], modsub_y_ab_dly[WORD_W]}; + endcase + + always @(posedge clk) begin + modsub_x_ab_dly <= modsub_x_ab; + modsub_y_ab_dly <= modsub_y_ab; + end + always @(posedge clk) begin // - wr_wide_x_dout_x <= WORD_EXT_DNC; - wr_wide_y_dout_x <= WORD_EXT_DNC; - wr_wide_x_dout_y <= WORD_EXT_DNC; - wr_wide_y_dout_y <= WORD_EXT_DNC; - wr_narrow_x_dout_x <= WORD_EXT_DNC; - wr_narrow_y_dout_x <= WORD_EXT_DNC; - wr_narrow_x_dout_y <= WORD_EXT_DNC; - wr_narrow_y_dout_y <= WORD_EXT_DNC; + modsub_x_ab <= {1'bX, WORD_DNC}; + modsub_y_ab <= {1'bX, WORD_DNC}; // - case (opcode) + modsub_x_abn <= {1'bX, WORD_DNC}; + modsub_y_abn <= {1'bX, WORD_DNC}; + // + if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) // - UOP_OPCODE_PROPAGATE_CARRIES: + case (wrk_fsm_state) // - case (wrk_fsm_state) + WRK_FSM_STATE_LATENCY_PRE3_TP: + update_modsub_ab; + + WRK_FSM_STATE_LATENCY_PRE4_TP, + WRK_FSM_STATE_BUSY_TP, + WRK_FSM_STATE_LATENCY_POST1_TP, + WRK_FSM_STATE_LATENCY_POST2_TP: begin + update_modsub_ab; + update_modsub_abn; + end + // + WRK_FSM_STATE_LATENCY_POST3_TP: // - WRK_FSM_STATE_LATENCY_PRE2: begin - rd_narrow_x_din_x_cry_r <= CARRY_ZERO; - rd_narrow_y_din_x_cry_r <= CARRY_ZERO; - rd_narrow_x_din_y_cry_r <= CARRY_ZERO; - rd_narrow_y_din_y_cry_r <= CARRY_ZERO; - end + update_modsub_abn; + // + endcase + // + end + + always @(posedge clk) begin + // + modsub_x_ab_mask_now <= 1'b0; + modsub_y_ab_mask_now <= 1'b0; + // + modsub_x_abn_mask_now <= 1'b0; + modsub_y_abn_mask_now <= 1'b0; + // + if (opcode == UOP_OPCODE_MODULAR_SUBTRACT) + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_LATENCY_PRE2_TP: begin + modsub_x_ab_mask_now <= 1'b1; + modsub_y_ab_mask_now <= 1'b1; + end + // + WRK_FSM_STATE_LATENCY_PRE3_TP: begin + modsub_x_abn_mask_now <= 1'b1; + modsub_y_abn_mask_now <= 1'b1; + end + // + endcase + // + end + + always @(posedge clk) begin + // + update_wide_dout (WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC); + update_narrow_dout(WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC, WORD_EXT_DNC); + // + // one_pass + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST2: + // + case (opcode) + // + UOP_OPCODE_PROPAGATE_CARRIES: + // + update_narrow_dout(rd_narrow_x_din_x_w_cry_reduced, + rd_narrow_y_din_x_w_cry_reduced, + rd_narrow_x_din_y_w_cry_reduced, + rd_narrow_y_din_y_w_cry_reduced); // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: begin // TODO: post2 doesn't need update of carry, since that's the last word + UOP_OPCODE_COPY_CRT_Y2X: begin // - rd_narrow_x_din_x_cry_r <= rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W]; - rd_narrow_y_din_x_cry_r <= rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W]; - rd_narrow_x_din_y_cry_r <= rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W]; - rd_narrow_y_din_y_cry_r <= rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W]; + update_wide_dout(wrk_rd_wide_x_din_y, + wrk_rd_wide_y_din_y, + wrk_rd_wide_x_din_y, + wrk_rd_wide_y_din_y); // - wr_narrow_x_dout_x <= rd_narrow_x_din_x_w_cry_reduced; - wr_narrow_y_dout_x <= rd_narrow_y_din_x_w_cry_reduced; - wr_narrow_x_dout_y <= rd_narrow_x_din_y_w_cry_reduced; - wr_narrow_y_dout_y <= rd_narrow_y_din_y_w_cry_reduced; + update_narrow_dout(wrk_rd_narrow_x_din_y, + wrk_rd_narrow_y_din_y, + wrk_rd_narrow_x_din_y, + wrk_rd_narrow_y_din_y); // end // + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + update_wide_dout(wrk_rd_narrow_x_din_x, + wrk_rd_narrow_y_din_x, + wrk_rd_narrow_x_din_y, + wrk_rd_narrow_y_din_y); + // endcase + // + endcase + // + // one_pass_meander + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_M2, + WRK_FSM_STATE_LATENCY_POST1_M2, + WRK_FSM_STATE_LATENCY_POST2_M2: // - UOP_OPCODE_COPY_CRT_Y2X: - // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: begin + UOP_OPCODE_COPY_LADDERS_X2Y: begin // - wr_wide_x_dout_x <= wrk_rd_wide_x_din_y; - wr_wide_y_dout_x <= wrk_rd_wide_y_din_y; - wr_wide_x_dout_y <= wrk_rd_wide_x_din_y; - wr_wide_y_dout_y <= wrk_rd_wide_y_din_y; + update_wide_dout(wrk_rd_wide_x_din_x_dly3, + wrk_rd_wide_x_din_x_dly2, + wrk_rd_wide_x_din_y_dly3, + wrk_rd_wide_x_din_y_dly2); // - wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_y; - wr_narrow_y_dout_x <= wrk_rd_narrow_y_din_y; - wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y; - wr_narrow_y_dout_y <= wrk_rd_narrow_y_din_y; + update_narrow_dout(wrk_rd_narrow_x_din_x_dly3, + wrk_rd_narrow_x_din_x_dly2, + wrk_rd_narrow_x_din_y_dly3, + wrk_rd_narrow_x_din_y_dly2); // end // - endcase - // - UOP_OPCODE_COPY_LADDERS_X2Y: - // - case (wrk_fsm_state) - // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2, - WRK_FSM_STATE_LATENCY_POST2_M2: begin + UOP_OPCODE_CROSS_LADDERS_X2Y: begin // - wr_wide_x_dout_x <= wrk_rd_wide_x_din_x_dly3; - wr_wide_y_dout_x <= wrk_rd_wide_x_din_x_dly2; - wr_wide_x_dout_y <= wrk_rd_wide_x_din_y_dly3; - wr_wide_y_dout_y <= wrk_rd_wide_x_din_y_dly2; + update_wide_dout(wrk_rd_wide_x_din_x_dly3, + wrk_rd_wide_x_din_y_dly2, + wrk_rd_wide_x_din_y_dly3, + wrk_rd_wide_x_din_x_dly2); // - wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_x_dly3; - wr_narrow_y_dout_x <= wrk_rd_narrow_x_din_x_dly2; - wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y_dly3; - wr_narrow_y_dout_y <= wrk_rd_narrow_x_din_y_dly2; + update_narrow_dout(wrk_rd_narrow_x_din_x_dly3, + wrk_rd_narrow_x_din_y_dly2, + wrk_rd_narrow_x_din_y_dly3, + wrk_rd_narrow_x_din_x_dly2); // end // endcase + // + endcase + // + // two_pass + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_TP, + WRK_FSM_STATE_LATENCY_POST1_TP, + WRK_FSM_STATE_LATENCY_POST2_TP, + WRK_FSM_STATE_LATENCY_POST3_TP, + WRK_FSM_STATE_LATENCY_POST4_TP: // - UOP_OPCODE_MODULAR_REDUCE_INIT: - // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: begin + UOP_OPCODE_MODULAR_SUBTRACT: // - wr_wide_x_dout_x <= wrk_rd_narrow_x_din_x; - wr_wide_y_dout_x <= wrk_rd_narrow_y_din_x; - wr_wide_x_dout_y <= wrk_rd_narrow_x_din_y; - wr_wide_y_dout_y <= wrk_rd_narrow_y_din_y; + if (!wrk_fsm_two_pass_pass) + update_narrow_dout(modsub_x_ab_dly_trunc, modsub_x_abn_trunc, modsub_y_ab_dly_trunc, modsub_y_abn_trunc); + else begin + update_wide_dout (modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux); + update_narrow_dout(modsub_x_mux, modsub_x_mux, modsub_y_mux, modsub_y_mux); + end // - end - // endcase // endcase @@ -637,254 +813,307 @@ module modexpng_general_worker // - // Write Address Logic + // Source Read Address Logic // - wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half; - wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half; + + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_xy_next; + reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_xy_next; + + wire rd_wide_xy_addr_xy_next_is_last = rd_wide_xy_addr_xy_next == word_index_last_half; + wire rd_narrow_xy_addr_xy_next_is_last = rd_narrow_xy_addr_xy_next == word_index_last; + + task update_rd_wide_bank_addr; + input [BANK_ADDR_W -1:0] bank; + input [ OP_ADDR_W -1:0] addr; + begin + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, addr}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, addr}; + end + endtask + + task update_rd_wide_bank; + input [BANK_ADDR_W -1:0] bank; + begin + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {bank, rd_wide_xy_addr_x}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {bank, rd_wide_xy_addr_y}; + end + endtask + + task update_rd_narrow_bank_addr; + input [BANK_ADDR_W -1:0] bank; + input [ OP_ADDR_W -1:0] addr; + begin + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, addr}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, addr}; + end + endtask + + task update_rd_narrow_bank; + input [BANK_ADDR_W -1:0] bank; + begin + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {bank, rd_narrow_xy_addr_x}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {bank, rd_narrow_xy_addr_y}; + end + endtask + + task update_rd_wide_addr_next; + input [OP_ADDR_W -1:0] addr; + rd_wide_xy_addr_xy_next <= addr; + endtask + + task update_rd_narrow_addr_next; + input [OP_ADDR_W -1:0] addr; + rd_narrow_xy_addr_xy_next <= addr; + endtask + + task advance_rd_wide_addr_next; + rd_wide_xy_addr_xy_next <= !rd_wide_xy_addr_xy_next_is_last ? rd_wide_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; + endtask + + task advance_rd_narrow_addr_next; + rd_narrow_xy_addr_xy_next <= !rd_narrow_xy_addr_xy_next_is_last ? rd_narrow_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; + endtask always @(posedge clk) begin // - {wr_wide_xy_bank_x, wr_wide_xy_addr_x } <= {BANK_DNC, OP_ADDR_DNC}; - {wr_wide_xy_bank_y, wr_wide_xy_addr_y } <= {BANK_DNC, OP_ADDR_DNC}; - {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC}; - {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC}; + update_rd_wide_bank_addr (BANK_DNC, OP_ADDR_DNC); + update_rd_narrow_bank_addr(BANK_DNC, OP_ADDR_DNC); // - case (opcode) + // one_pass + // + case (wrk_fsm_state_next_one_pass) // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_COPY_CRT_Y2X: + WRK_FSM_STATE_LATENCY_PRE1: // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: begin - // - {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly2}; - {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly2}; + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_OUTPUT_FROM_NARROW, + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_MODULAR_REDUCE_INIT: begin // - {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly2}; - {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly2}; + update_rd_wide_bank_addr (sel_wide_in, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); + update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); // end // endcase // - UOP_OPCODE_MODULAR_REDUCE_INIT: + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_BUSY: // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY, - WRK_FSM_STATE_LATENCY_POST1, - WRK_FSM_STATE_LATENCY_POST2: begin + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_OUTPUT_FROM_NARROW, + UOP_OPCODE_COPY_CRT_Y2X: begin + // + update_rd_wide_bank_addr (sel_wide_in, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; + update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; // - wr_wide_xy_bank_x <= uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H; - wr_wide_xy_bank_y <= uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H; + end + // + UOP_OPCODE_MODULAR_REDUCE_INIT: begin // - wr_wide_xy_addr_x <= rd_wide_xy_addr_x_dly2; - wr_wide_xy_addr_y <= rd_wide_xy_addr_y_dly2; + update_rd_wide_bank_addr (sel_wide_in, rd_wide_xy_addr_xy_next ); advance_rd_wide_addr_next ; + update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; // end // endcase + // + endcase + // + // one_pass_meander + // + case (wrk_fsm_state_next_one_pass_meander) + // + WRK_FSM_STATE_LATENCY_PRE1_M1: + case (opcode) + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin + update_rd_wide_bank_addr (sel_wide_out, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); + update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); + end + endcase + // + WRK_FSM_STATE_LATENCY_PRE2_M1, + WRK_FSM_STATE_BUSY_M1: + case (opcode) + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin + update_rd_wide_bank_addr (sel_wide_out, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; + update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; + // + end + // + endcase + // + WRK_FSM_STATE_LATENCY_PRE1_M2, + WRK_FSM_STATE_LATENCY_PRE2_M2, + WRK_FSM_STATE_BUSY_M2: + case (opcode) + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin + update_rd_wide_bank (sel_wide_in ); + update_rd_narrow_bank(sel_narrow_in); + end + endcase + // + endcase + // + // two_pass + // + case (wrk_fsm_state_next_two_pass) + // + WRK_FSM_STATE_LATENCY_PRE1_TP: // - UOP_OPCODE_COPY_LADDERS_X2Y: - // - case (wrk_fsm_state) + case (opcode) // - WRK_FSM_STATE_BUSY_M2, - WRK_FSM_STATE_LATENCY_POST1_M2, - WRK_FSM_STATE_LATENCY_POST2_M2: begin - // - {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly4}; - {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly4}; + UOP_OPCODE_MODULAR_SUBTRACT: // - {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly4}; - {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly4}; + if (!wrk_fsm_two_pass_pass) begin + update_rd_wide_bank_addr (BANK_WIDE_N, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); + update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); + end else begin + update_rd_narrow_bank_addr(sel_narrow_out, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); + end + // + endcase + // + WRK_FSM_STATE_LATENCY_PRE2_TP, + WRK_FSM_STATE_LATENCY_PRE3_TP, + WRK_FSM_STATE_LATENCY_PRE4_TP, + WRK_FSM_STATE_BUSY_TP: + // + case (opcode) + // + UOP_OPCODE_MODULAR_SUBTRACT: // - end + if (!wrk_fsm_two_pass_pass) begin + update_rd_wide_bank_addr (BANK_WIDE_N, rd_narrow_xy_addr_xy_next); advance_rd_wide_addr_next ; + update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; + end else begin + update_rd_narrow_bank_addr(sel_narrow_out, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; + end // endcase // - // endcase // end // - // Read Address Logic + // Destination Write Address Logic // - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_next; - reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_next; - - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_next; - reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_next; + + wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half; + wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half; - wire rd_wide_xy_addr_x_next_is_last = rd_wide_xy_addr_x_next == word_index_last_half; - wire rd_wide_xy_addr_y_next_is_last = rd_wide_xy_addr_y_next == word_index_last_half; + wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_x = uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H; + wire [BANK_ADDR_W -1:0] uop_modular_reduce_init_bank_y = uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H; - wire rd_narrow_xy_addr_x_next_is_last = rd_narrow_xy_addr_x_next == word_index_last; - wire rd_narrow_xy_addr_y_next_is_last = rd_narrow_xy_addr_y_next == word_index_last; + task update_wr_wide_bank_addr; + input [BANK_ADDR_W -1:0] x_bank; + input [BANK_ADDR_W -1:0] y_bank; + input [ OP_ADDR_W -1:0] x_addr; + input [ OP_ADDR_W -1:0] y_addr; + begin + {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {x_bank, x_addr}; + {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {y_bank, y_addr}; + end + endtask - always @(posedge clk) begin // TODO: Maybe split into two blocks (read address / next address)?? + task update_wr_narrow_bank_addr; + input [BANK_ADDR_W -1:0] x_bank; + input [BANK_ADDR_W -1:0] y_bank; + input [ OP_ADDR_W -1:0] x_addr; + input [ OP_ADDR_W -1:0] y_addr; + begin + {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {x_bank, x_addr}; + {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {y_bank, y_addr}; + end + endtask + + always @(posedge clk) begin // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x } <= {BANK_DNC, OP_ADDR_DNC}; // TODO: Add same default path for io_manager ?? - {rd_wide_xy_bank_y, rd_wide_xy_addr_y } <= {BANK_DNC, OP_ADDR_DNC}; - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC}; + update_wr_wide_bank_addr (BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC); + update_wr_narrow_bank_addr(BANK_DNC, BANK_DNC, OP_ADDR_DNC, OP_ADDR_DNC); // - case (opcode) + // one_pass + // + case (wrk_fsm_state) // - UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_COPY_CRT_Y2X: + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST2: // - case (wrk_fsm_state_next_one_pass) + case (opcode) // - WRK_FSM_STATE_LATENCY_PRE1: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO}; - // - rd_wide_xy_addr_x_next <= OP_ADDR_ONE; - rd_wide_xy_addr_y_next <= OP_ADDR_ONE; - // - rd_narrow_xy_addr_x_next <= OP_ADDR_ONE; - rd_narrow_xy_addr_y_next <= OP_ADDR_ONE; - // + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_COPY_CRT_Y2X: begin + update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); + update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); end // - WRK_FSM_STATE_LATENCY_PRE2, - WRK_FSM_STATE_BUSY: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_narrow_xy_addr_x_next}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_narrow_xy_addr_y_next}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next}; - // - rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; - rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO; - // - rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1; - rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1; - // - end + UOP_OPCODE_MODULAR_REDUCE_INIT: + update_wr_wide_bank_addr(uop_modular_reduce_init_bank_x, uop_modular_reduce_init_bank_y, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_y_dly2); // endcase // - UOP_OPCODE_MODULAR_REDUCE_INIT: - // - case (wrk_fsm_state_next_one_pass) - // - WRK_FSM_STATE_LATENCY_PRE1: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO}; - // - rd_wide_xy_addr_x_next <= OP_ADDR_ONE; - rd_wide_xy_addr_y_next <= OP_ADDR_ONE; - // - rd_narrow_xy_addr_x_next <= OP_ADDR_ONE; - rd_narrow_xy_addr_y_next <= OP_ADDR_ONE; - // - end - // - WRK_FSM_STATE_LATENCY_PRE2, - WRK_FSM_STATE_BUSY: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x_next}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y_next}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next}; - // - rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; - rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO; - // - rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1; - rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1; - // + endcase + // + // one_pass_meander + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_M2, + WRK_FSM_STATE_LATENCY_POST1_M2, + WRK_FSM_STATE_LATENCY_POST2_M2: + // + case (opcode) + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: begin + update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); + update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); end - // endcase // - UOP_OPCODE_COPY_LADDERS_X2Y: + endcase + // + // two_pass + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_TP, + WRK_FSM_STATE_LATENCY_POST1_TP, + WRK_FSM_STATE_LATENCY_POST2_TP, + WRK_FSM_STATE_LATENCY_POST3_TP, + WRK_FSM_STATE_LATENCY_POST4_TP: // - case (wrk_fsm_state_next_one_pass_meander) - // - WRK_FSM_STATE_LATENCY_PRE1_M1: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, OP_ADDR_ZERO}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, OP_ADDR_ZERO}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, OP_ADDR_ZERO}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, OP_ADDR_ZERO}; - // - rd_wide_xy_addr_x_next <= OP_ADDR_ONE; - rd_wide_xy_addr_y_next <= OP_ADDR_ONE; - // - rd_narrow_xy_addr_x_next <= OP_ADDR_ONE; - rd_narrow_xy_addr_y_next <= OP_ADDR_ONE; - // - end + case (opcode) // - WRK_FSM_STATE_LATENCY_PRE1_M2: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y}; - // - end - // - WRK_FSM_STATE_LATENCY_PRE2_M1, - WRK_FSM_STATE_BUSY_M1: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_next}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_next}; + UOP_OPCODE_MODULAR_SUBTRACT: // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_next}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_next}; + if (!wrk_fsm_two_pass_pass) begin + update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); + end else begin + update_wr_wide_bank_addr (sel_wide_out, sel_wide_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); + update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_y_dly4); + end // - rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; - rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO; - // - rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1; - rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1; - // - end - // - WRK_FSM_STATE_LATENCY_PRE2_M2, - WRK_FSM_STATE_BUSY_M2: begin - // - {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x}; - {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y}; - // - {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x}; - {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y}; - // - end - // - endcase + endcase // - // - endcase + endcase // end - + // // FSM Process // + always @(posedge clk) // if (rst) wrk_fsm_state <= WRK_FSM_STATE_IDLE; @@ -893,7 +1122,9 @@ module modexpng_general_worker UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_OPCODE_COPY_CRT_Y2X, UOP_OPCODE_MODULAR_REDUCE_INIT: wrk_fsm_state <= wrk_fsm_state_next_one_pass; - UOP_OPCODE_COPY_LADDERS_X2Y: wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander; + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander; + UOP_OPCODE_MODULAR_SUBTRACT: wrk_fsm_state <= wrk_fsm_state_next_two_pass; default: wrk_fsm_state <= WRK_FSM_STATE_IDLE; endcase @@ -901,49 +1132,64 @@ module modexpng_general_worker // // Busy Exit Logic // - reg wrk_fsm_done_one_pass = 1'b0; + + reg wrk_fsm_done_one_pass = 1'b0; reg wrk_fsm_done_one_pass_meander = 1'b0; + reg wrk_fsm_done_two_pass = 1'b0; always @(posedge clk) begin // wrk_fsm_done_one_pass <= 1'b0; wrk_fsm_done_one_pass_meander <= 1'b0; + wrk_fsm_done_two_pass <= 1'b0; // case (opcode) // UOP_OPCODE_PROPAGATE_CARRIES, UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_MODULAR_REDUCE_INIT: begin + UOP_OPCODE_MODULAR_REDUCE_INIT: // - if (wrk_fsm_state == WRK_FSM_STATE_BUSY) begin - // - if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass <= 1'b1; // TODO: Check, whether both are necessary... - if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass <= 1'b1; - // - end + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY: + if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass <= 1'b1; + endcase // - end - // - UOP_OPCODE_COPY_LADDERS_X2Y: begin + UOP_OPCODE_COPY_LADDERS_X2Y, + UOP_OPCODE_CROSS_LADDERS_X2Y: // - if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M2) begin - // - if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; // TODO: Check, whether both are necessary... - if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; - // - end + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY_M2: + if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; + WRK_FSM_STATE_BUSY_M1: + wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander; + endcase + // + UOP_OPCODE_MODULAR_SUBTRACT: // - if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M1) - wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander; + case (wrk_fsm_state) + WRK_FSM_STATE_BUSY_TP: + if (rd_narrow_xy_addr_xy_next_is_last) wrk_fsm_done_two_pass <= 1'b1; + endcase // - end // endcase // end - + + // + // FSM Helper Logic + // + always @(posedge clk) + // + case (wrk_fsm_state) + WRK_FSM_STATE_IDLE: if (ena) {wrk_fsm_two_pass_pass, wrk_fsm_two_pass_pass_dly} <= {1'b0, 1'b0}; + WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_two_pass_pass <= 1'b1; + WRK_FSM_STATE_HOLDOFF_TP: wrk_fsm_two_pass_pass_dly <= 1'b1; + endcase + + // // FSM Transition Logic // @@ -985,7 +1231,27 @@ module modexpng_general_worker // end - + always @* begin + // + case (wrk_fsm_state) + WRK_FSM_STATE_IDLE: wrk_fsm_state_next_two_pass = ena ? WRK_FSM_STATE_LATENCY_PRE1_TP : WRK_FSM_STATE_IDLE; + WRK_FSM_STATE_LATENCY_PRE1_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE2_TP ; + WRK_FSM_STATE_LATENCY_PRE2_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE3_TP ; + WRK_FSM_STATE_LATENCY_PRE3_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_PRE4_TP ; + WRK_FSM_STATE_LATENCY_PRE4_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_BUSY_TP ; + WRK_FSM_STATE_BUSY_TP: wrk_fsm_state_next_two_pass = wrk_fsm_done_two_pass ? WRK_FSM_STATE_LATENCY_POST1_TP : WRK_FSM_STATE_BUSY_TP; + WRK_FSM_STATE_LATENCY_POST1_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST2_TP ; + WRK_FSM_STATE_LATENCY_POST2_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST3_TP ; + WRK_FSM_STATE_LATENCY_POST3_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_LATENCY_POST4_TP ; + WRK_FSM_STATE_LATENCY_POST4_TP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_HOLDOFF_TP ; + WRK_FSM_STATE_HOLDOFF_TP: wrk_fsm_state_next_two_pass = wrk_fsm_two_pass_pass_dly ? WRK_FSM_STATE_STOP : WRK_FSM_STATE_LATENCY_PRE1_TP; + WRK_FSM_STATE_STOP: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_IDLE ; + default: wrk_fsm_state_next_two_pass = WRK_FSM_STATE_IDLE ; + endcase + // + end + + // // Ready Logic // diff --git a/rtl/modexpng_microcode.vh b/rtl/modexpng_microcode.vh index f68c559..3493e26 100644 --- a/rtl/modexpng_microcode.vh +++ b/rtl/modexpng_microcode.vh @@ -39,8 +39,9 @@ localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_OUTPUT_FROM_NARROW = 5'd3; * source and destination WIDE are don't care */ -localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_CRT_Y2X = 5'd4; -localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_LADDERS_X2Y = 5'd5; +localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_CRT_Y2X = 5'd4; +localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_COPY_LADDERS_X2Y = 5'd5; +localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_CROSS_LADDERS_X2Y = 5'd7; /* CRT is don't care * NPQ specifies the width of the operand * AUX is don't care @@ -53,6 +54,13 @@ localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_MULTIPLY = 5'd8; * AUX = AUX_2 forces B input to 1 (AUX_1 reads from source NARROW as usual) * LADDER specifies Montgomery ladder mode */ +localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_SUBTRACT = 5'd9; +/* CRT is don't care + * NPQ specifies the width of the operand + * AUX is don't care + * LADDER is don't care + */ + localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_REDUCE_INIT = 5'd10; localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MODULAR_REDUCE_PROC = 5'd11; /* CRT diff --git a/rtl/modexpng_uop_rom.v b/rtl/modexpng_uop_rom.v index 04f0c83..adc657a 100644 --- a/rtl/modexpng_uop_rom.v +++ b/rtl/modexpng_uop_rom.v @@ -21,35 +21,35 @@ module modexpng_uop_rom 6'd03: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_Y, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_Y, BANK_WIDE_A, BANK_DNC }; // 6'd04: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_X, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_M, BANK_WIDE_E, BANK_DNC }; // 6'd05: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_Y, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_M, BANK_WIDE_E, BANK_DNC }; // - // + // 6'd06: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_X, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_N_COEFF, BANK_DNC, BANK_NARROW_COEFF}; // 6'd07: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_Y, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_N_COEFF, BANK_DNC, BANK_NARROW_COEFF}; // 6'd08: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_X, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_N_FACTOR, BANK_DNC, BANK_NARROW_A }; // 6'd09: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_Y, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_N_FACTOR, BANK_DNC, BANK_NARROW_A }; // 6'd10: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_X, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_M, BANK_DNC, BANK_NARROW_E }; // 6'd11: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_Y, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_DNC, BANK_DNC, BANK_IN_1_M, BANK_DNC, BANK_NARROW_E }; // - // + // 6'd12: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_11, BANK_WIDE_A, BANK_NARROW_A, BANK_WIDE_B, BANK_NARROW_B }; // 6'd13: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_11, BANK_WIDE_B, BANK_NARROW_B, BANK_WIDE_C, BANK_NARROW_C }; // 6'd14: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_2, UOP_LADDER_11, BANK_WIDE_C, BANK_DNC, BANK_WIDE_D, BANK_NARROW_D }; // - // + // 6'd15: data <= {UOP_OPCODE_PROPAGATE_CARRIES, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_D, BANK_DNC, BANK_NARROW_D }; // - // + // 6'd16: data <= {UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_CRT_X, UOP_NPQ_N, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_D, BANK_DNC, BANK_OUT_XM }; // 6'd17: data <= {UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_CRT_Y, UOP_NPQ_N, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_D, BANK_DNC, BANK_OUT_YM }; // - // + // 6'd18: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_1, UOP_LADDER_11, BANK_WIDE_E, BANK_NARROW_B, BANK_WIDE_C, BANK_NARROW_C }; // - // + // 6'd19: data <= {UOP_OPCODE_PROPAGATE_CARRIES, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_C, BANK_DNC, BANK_NARROW_C }; // - // + // 6'd20: data <= {UOP_OPCODE_COPY_CRT_Y2X, UOP_CRT_DNC, UOP_NPQ_N, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_WIDE_C, BANK_NARROW_C, BANK_WIDE_C, BANK_NARROW_C }; // - // + // 6'd21: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_P, BANK_WIDE_N, BANK_DNC }; // 6'd22: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_Y, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_Q, BANK_WIDE_N, BANK_DNC }; // 6'd23: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_P_FACTOR, BANK_WIDE_A, BANK_DNC }; // 6'd24: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_Y, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_Q_FACTOR, BANK_WIDE_A, BANK_DNC }; // 6'd25: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_QINV, BANK_WIDE_E, BANK_DNC }; // - // + // 6'd26: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_P_COEFF, BANK_DNC, BANK_NARROW_COEFF}; // 6'd27: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_Y, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_Q_COEFF, BANK_DNC, BANK_NARROW_COEFF}; // 6'd28: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_P_FACTOR, BANK_DNC, BANK_NARROW_A }; // @@ -70,6 +70,14 @@ module modexpng_uop_rom 6'd38: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_1, UOP_LADDER_PQ, BANK_WIDE_C, BANK_NARROW_C, BANK_WIDE_C, BANK_NARROW_C }; // 6'd39: data <= {UOP_OPCODE_LADDER_STEP, UOP_CRT_DNC, UOP_NPQ_DNC, UOP_AUX_DNC, UOP_LADDER_DNC, UOP_SEL_DNC_ALL }; // // + 6'd40: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_11, BANK_WIDE_C, BANK_DNC, BANK_WIDE_D, BANK_NARROW_D }; // + // + 6'd41: data <= {UOP_OPCODE_PROPAGATE_CARRIES, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_D, BANK_DNC, BANK_NARROW_D }; // + // + 6'd42: data <= {UOP_OPCODE_CROSS_LADDERS_X2Y, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_WIDE_D, BANK_NARROW_D, BANK_WIDE_D, BANK_NARROW_D }; // + // + 6'd43: data <= {UOP_OPCODE_MODULAR_SUBTRACT, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_D, BANK_WIDE_C, BANK_NARROW_C }; // + // default: data <= {UOP_OPCODE_STOP, UOP_CRT_DNC, UOP_NPQ_DNC, UOP_AUX_DNC, UOP_LADDER_DNC, UOP_SEL_DNC_ALL }; // endcase -- cgit v1.2.3