From f4771a7b6774a53cbada5b86701d65e08a36c10d Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Sat, 16 Nov 2019 01:17:02 +0300 Subject: The uOP engine didn't meet timing at 180 MHz. The pipeline had two stages: FETCH and DECODE. Apparently one clock cycle is not enough to entirely decode an instruction, so decoding now takes two clock cycles (DECODE_1 and DECODE_2). This seems to solve the problem. If we run into more timing violations here, we can add an extra DECODE_3 cycle and register the currently combinational uop_opcode_* flags at DECODE_2. This fix increases the core's latency by 59/32 clock cycles (CRT/non-CRT mode) plus two extra clock cycles per bit of the exponent. --- rtl/modexpng_uop_engine.v | 168 +++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 77 deletions(-) diff --git a/rtl/modexpng_uop_engine.v b/rtl/modexpng_uop_engine.v index ba2d4c8..1119fbd 100644 --- a/rtl/modexpng_uop_engine.v +++ b/rtl/modexpng_uop_engine.v @@ -280,66 +280,79 @@ module modexpng_uop_engine // // UOP_FSM // - localparam [1:0] UOP_FSM_STATE_IDLE = 2'b00; - localparam [1:0] UOP_FSM_STATE_FETCH = 2'b01; - localparam [1:0] UOP_FSM_STATE_DECODE = 2'b10; - localparam [1:0] UOP_FSM_STATE_BUSY = 2'b11; + localparam [2:0] UOP_FSM_STATE_IDLE = 3'b000; + localparam [2:0] UOP_FSM_STATE_FETCH = 3'b001; + localparam [2:0] UOP_FSM_STATE_DECODE_1 = 3'b010; + localparam [2:0] UOP_FSM_STATE_DECODE_2 = 3'b011; + localparam [2:0] UOP_FSM_STATE_BUSY = 3'b100; - reg [1:0] uop_fsm_state = UOP_FSM_STATE_IDLE; - reg [1:0] uop_fsm_state_next; + reg [2:0] uop_fsm_state = UOP_FSM_STATE_IDLE; + reg [2:0] uop_fsm_state_next; // // UOP ROM // reg [UOP_ADDR_W -1:0] uop_addr; - wire [UOP_W -1:0] uop_data; + wire [UOP_W -1:0] uop_data_int; modexpng_uop_rom uop_rom ( .clk (clk), .addr (uop_addr), - .data (uop_data) - ); + .data (uop_data_int) + ); // // UOP ROM Data Decoder // - wire [UOP_OPCODE_W -1:0] uop_data_opcode = uop_data[UOP_W -1-: UOP_OPCODE_W]; - 
wire [UOP_CRT_W -1:0] uop_data_crt = uop_data[UOP_W -UOP_OPCODE_W -1-: UOP_CRT_W ]; - wire [UOP_NPQ_W -1:0] uop_data_npq = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -1-: UOP_NPQ_W ]; - wire [UOP_AUX_W -1:0] uop_data_aux = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -1-: UOP_AUX_W ]; - wire [UOP_LADDER_W -1:0] uop_data_ladder = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -1-: UOP_LADDER_W]; - wire [BANK_ADDR_W -1:0] uop_data_sel_wide_in = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -1-: BANK_ADDR_W ]; - wire [BANK_ADDR_W -1:0] uop_data_sel_narrow_in = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -1*BANK_ADDR_W -1-: BANK_ADDR_W ]; - wire [BANK_ADDR_W -1:0] uop_data_sel_wide_out = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -2*BANK_ADDR_W -1-: BANK_ADDR_W ]; - wire [BANK_ADDR_W -1:0] uop_data_sel_narrow_out = uop_data[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -3*BANK_ADDR_W -1-: BANK_ADDR_W ]; - - wire uop_opcode_is_stop = uop_data_opcode == UOP_OPCODE_STOP ; - wire uop_opcode_is_in = (uop_data_opcode == UOP_OPCODE_INPUT_TO_WIDE ) || - (uop_data_opcode == UOP_OPCODE_INPUT_TO_NARROW ) ; - wire uop_opcode_is_out = uop_data_opcode == UOP_OPCODE_OUTPUT_FROM_NARROW ; - wire uop_opcode_is_wrk = (uop_data_opcode == UOP_OPCODE_COPY_CRT_Y2X ) || - (uop_data_opcode == UOP_OPCODE_COPY_LADDERS_X2Y ) || - (uop_data_opcode == UOP_OPCODE_CROSS_LADDERS_X2Y ) || - (uop_data_opcode == UOP_OPCODE_MODULAR_SUBTRACT ) || - (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_INIT ) || - (uop_data_opcode == UOP_OPCODE_PROPAGATE_CARRIES ) || - (uop_data_opcode == UOP_OPCODE_MERGE_LH ) || - (uop_data_opcode == UOP_OPCODE_REGULAR_ADD_UNEVEN ) ; - wire uop_opcode_is_mmm = (uop_data_opcode == UOP_OPCODE_MODULAR_MULTIPLY ) || - (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_PROC ) || - (uop_data_opcode == UOP_OPCODE_REGULAR_MULTIPLY ) ; - wire uop_opcode_is_ladder 
= (uop_data_opcode == UOP_OPCODE_LADDER_INIT ) || - (uop_data_opcode == UOP_OPCODE_LADDER_STEP ) ; + reg [UOP_OPCODE_W -1:0] uop_data_opcode_dec; + reg [UOP_CRT_W -1:0] uop_data_crt_dec; + reg [UOP_NPQ_W -1:0] uop_data_npq_dec; + reg [UOP_AUX_W -1:0] uop_data_aux_dec; + reg [UOP_LADDER_W -1:0] uop_data_ladder_dec; + reg [BANK_ADDR_W -1:0] uop_data_sel_wide_in_dec; + reg [BANK_ADDR_W -1:0] uop_data_sel_narrow_in_dec; + reg [BANK_ADDR_W -1:0] uop_data_sel_wide_out_dec; + reg [BANK_ADDR_W -1:0] uop_data_sel_narrow_out_dec; + + always @(posedge clk) begin + uop_data_opcode_dec <= uop_data_int[UOP_W -1-: UOP_OPCODE_W]; + uop_data_crt_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -1-: UOP_CRT_W ]; + uop_data_npq_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -1-: UOP_NPQ_W ]; + uop_data_aux_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -1-: UOP_AUX_W ]; + uop_data_ladder_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -1-: UOP_LADDER_W]; + uop_data_sel_wide_in_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -1-: BANK_ADDR_W ]; + uop_data_sel_narrow_in_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -1*BANK_ADDR_W -1-: BANK_ADDR_W ]; + uop_data_sel_wide_out_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -2*BANK_ADDR_W -1-: BANK_ADDR_W ]; + uop_data_sel_narrow_out_dec <= uop_data_int[UOP_W -UOP_OPCODE_W -UOP_CRT_W -UOP_NPQ_W -UOP_AUX_W -UOP_LADDER_W -3*BANK_ADDR_W -1-: BANK_ADDR_W ]; + end + + wire uop_opcode_is_stop = uop_data_opcode_dec == UOP_OPCODE_STOP ; + wire uop_opcode_is_in = (uop_data_opcode_dec == UOP_OPCODE_INPUT_TO_WIDE ) || + (uop_data_opcode_dec == UOP_OPCODE_INPUT_TO_NARROW ) ; + wire uop_opcode_is_out = uop_data_opcode_dec == UOP_OPCODE_OUTPUT_FROM_NARROW ; + wire uop_opcode_is_wrk = (uop_data_opcode_dec == UOP_OPCODE_COPY_CRT_Y2X ) || + (uop_data_opcode_dec == UOP_OPCODE_COPY_LADDERS_X2Y ) || + 
(uop_data_opcode_dec == UOP_OPCODE_CROSS_LADDERS_X2Y ) || + (uop_data_opcode_dec == UOP_OPCODE_MODULAR_SUBTRACT ) || + (uop_data_opcode_dec == UOP_OPCODE_MODULAR_REDUCE_INIT ) || + (uop_data_opcode_dec == UOP_OPCODE_PROPAGATE_CARRIES ) || + (uop_data_opcode_dec == UOP_OPCODE_MERGE_LH ) || + (uop_data_opcode_dec == UOP_OPCODE_REGULAR_ADD_UNEVEN ) ; + wire uop_opcode_is_mmm = (uop_data_opcode_dec == UOP_OPCODE_MODULAR_MULTIPLY ) || + (uop_data_opcode_dec == UOP_OPCODE_MODULAR_REDUCE_PROC ) || + (uop_data_opcode_dec == UOP_OPCODE_REGULAR_MULTIPLY ) ; + wire uop_opcode_is_ladder = (uop_data_opcode_dec == UOP_OPCODE_LADDER_INIT ) || + (uop_data_opcode_dec == UOP_OPCODE_LADDER_STEP ) ; // // Debug Signal // `ifdef MODEXPNG_ENABLE_DEBUG - assign uop_decoded_stop = (uop_fsm_state == UOP_FSM_STATE_DECODE) && uop_opcode_is_stop; + assign uop_decoded_stop = (uop_fsm_state == UOP_FSM_STATE_DECODE_2) && uop_opcode_is_stop; `else assign uop_decoded_stop = 1'b0; `endif @@ -369,13 +382,13 @@ module modexpng_uop_engine wire mmm_ena = mmm_ena_x & mmm_ena_y; wire mmm_rdy = mmm_rdy_x & mmm_rdy_y; - assign uop_loop_now = (uop_data_opcode == UOP_OPCODE_LADDER_STEP) && !io_mgr_ladder_done; + assign uop_loop_now = (uop_data_opcode_dec == UOP_OPCODE_LADDER_STEP) && !io_mgr_ladder_done; reg [1:0] uop_data_ladder_mux; - always @(uop_data_ladder, io_mgr_ladder_p, io_mgr_ladder_q, io_mgr_ladder_d) + always @(uop_data_ladder_dec, io_mgr_ladder_p, io_mgr_ladder_q, io_mgr_ladder_d) // - case (uop_data_ladder) + case (uop_data_ladder_dec) UOP_LADDER_00: uop_data_ladder_mux = 2'b00; UOP_LADDER_11: uop_data_ladder_mux = 2'b11; UOP_LADDER_D: uop_data_ladder_mux = {~io_mgr_ladder_d, io_mgr_ladder_d}; @@ -384,10 +397,10 @@ module modexpng_uop_engine reg [OP_ADDR_W-1:0] word_index_last_mux; - always @(uop_data_npq, word_index_last_n, word_index_last_pq) + always @(uop_data_npq_dec, word_index_last_n, word_index_last_pq) // - if (uop_data_npq == UOP_NPQ_N) word_index_last_mux = word_index_last_n; - else 
word_index_last_mux = word_index_last_pq; + if (uop_data_npq_dec == UOP_NPQ_N) word_index_last_mux = word_index_last_n; + else word_index_last_mux = word_index_last_pq; reg [BIT_INDEX_W-1:0] bit_index_last_mux; @@ -407,13 +420,13 @@ module modexpng_uop_engine mmm_ena_y_r <= 1'b0; wrk_ena_r <= 1'b0; end else begin - io_mgr_ena_r <= uop_fsm_state == UOP_FSM_STATE_DECODE ? (uop_opcode_is_in || - uop_opcode_is_out || - uop_opcode_is_ladder): 1'b0; - mmm_ena_x_r <= uop_fsm_state == UOP_FSM_STATE_DECODE ? uop_opcode_is_mmm : 1'b0; - mmm_ena_y_r <= uop_fsm_state == UOP_FSM_STATE_DECODE ? uop_opcode_is_mmm : 1'b0; - wrk_ena_r <= uop_fsm_state == UOP_FSM_STATE_DECODE ? (uop_opcode_is_wrk || - uop_opcode_is_out ): 1'b0; + io_mgr_ena_r <= uop_fsm_state == UOP_FSM_STATE_DECODE_2 ? (uop_opcode_is_in || + uop_opcode_is_out || + uop_opcode_is_ladder): 1'b0; + mmm_ena_x_r <= uop_fsm_state == UOP_FSM_STATE_DECODE_2 ? uop_opcode_is_mmm : 1'b0; + mmm_ena_y_r <= uop_fsm_state == UOP_FSM_STATE_DECODE_2 ? uop_opcode_is_mmm : 1'b0; + wrk_ena_r <= uop_fsm_state == UOP_FSM_STATE_DECODE_2 ? 
(uop_opcode_is_wrk || + uop_opcode_is_out ): 1'b0; end @@ -497,59 +510,59 @@ module modexpng_uop_engine always @(posedge clk) // - if (uop_fsm_state == UOP_FSM_STATE_DECODE) + if (uop_fsm_state == UOP_FSM_STATE_DECODE_2) // - case (uop_data_opcode) + case (uop_data_opcode_dec) // UOP_OPCODE_INPUT_TO_WIDE: - update_io_mgr_params(uop_data_crt, uop_data_aux, uop_data_sel_narrow_in, uop_data_sel_wide_out, uop_data_opcode); + update_io_mgr_params(uop_data_crt_dec, uop_data_aux_dec, uop_data_sel_narrow_in_dec, uop_data_sel_wide_out_dec, uop_data_opcode_dec); // UOP_OPCODE_INPUT_TO_NARROW: - update_io_mgr_params(uop_data_crt, uop_data_aux, uop_data_sel_narrow_in, uop_data_sel_narrow_out, uop_data_opcode); + update_io_mgr_params(uop_data_crt_dec, uop_data_aux_dec, uop_data_sel_narrow_in_dec, uop_data_sel_narrow_out_dec, uop_data_opcode_dec); // UOP_OPCODE_OUTPUT_FROM_NARROW: begin - update_io_mgr_params(uop_data_crt, uop_data_aux, BANK_DNC, uop_data_sel_narrow_out, uop_data_opcode); - update_wrk_params(BANK_DNC, uop_data_sel_narrow_in, BANK_DNC, BANK_DNC, uop_data_opcode); + update_io_mgr_params(uop_data_crt_dec, uop_data_aux_dec, BANK_DNC, uop_data_sel_narrow_out_dec, uop_data_opcode_dec); + update_wrk_params(BANK_DNC, uop_data_sel_narrow_in_dec, BANK_DNC, BANK_DNC, uop_data_opcode_dec); end // UOP_OPCODE_COPY_CRT_Y2X, UOP_OPCODE_COPY_LADDERS_X2Y, UOP_OPCODE_CROSS_LADDERS_X2Y: - update_wrk_params(uop_data_sel_wide_in, uop_data_sel_narrow_in, uop_data_sel_wide_out, uop_data_sel_narrow_out, uop_data_opcode); + update_wrk_params(uop_data_sel_wide_in_dec, uop_data_sel_narrow_in_dec, uop_data_sel_wide_out_dec, uop_data_sel_narrow_out_dec, uop_data_opcode_dec); // UOP_OPCODE_MODULAR_MULTIPLY: begin - update_mmm_params(uop_data_ladder_mux, uop_data_sel_wide_in, uop_data_sel_narrow_in, uop_data_aux, 1'b0, 1'b0); - update_rdct_params(uop_data_sel_wide_out, uop_data_sel_narrow_out); + update_mmm_params(uop_data_ladder_mux, uop_data_sel_wide_in_dec, uop_data_sel_narrow_in_dec, 
uop_data_aux_dec, 1'b0, 1'b0); + update_rdct_params(uop_data_sel_wide_out_dec, uop_data_sel_narrow_out_dec); end // UOP_OPCODE_MODULAR_SUBTRACT: - update_wrk_params(BANK_DNC, uop_data_sel_narrow_in, uop_data_sel_wide_out, uop_data_sel_narrow_out, uop_data_opcode); + update_wrk_params(BANK_DNC, uop_data_sel_narrow_in_dec, uop_data_sel_wide_out_dec, uop_data_sel_narrow_out_dec, uop_data_opcode_dec); // UOP_OPCODE_MODULAR_REDUCE_INIT: - update_wrk_params(BANK_DNC, uop_data_sel_narrow_in, BANK_DNC, BANK_DNC, uop_data_opcode); + update_wrk_params(BANK_DNC, uop_data_sel_narrow_in_dec, BANK_DNC, BANK_DNC, uop_data_opcode_dec); // UOP_OPCODE_MODULAR_REDUCE_PROC: begin update_mmm_params(2'bXX, BANK_DNC, BANK_DNC, 1'b0, 1'b1, 1'b0); - update_rdct_params(uop_data_sel_wide_out, uop_data_sel_narrow_out); + update_rdct_params(uop_data_sel_wide_out_dec, uop_data_sel_narrow_out_dec); end // UOP_OPCODE_PROPAGATE_CARRIES: - update_wrk_params(BANK_DNC, uop_data_sel_narrow_in, BANK_DNC, uop_data_sel_narrow_out, uop_data_opcode); + update_wrk_params(BANK_DNC, uop_data_sel_narrow_in_dec, BANK_DNC, uop_data_sel_narrow_out_dec, uop_data_opcode_dec); // UOP_OPCODE_MERGE_LH: - update_wrk_params(BANK_DNC, BANK_DNC, BANK_DNC, uop_data_sel_narrow_out, uop_data_opcode); + update_wrk_params(BANK_DNC, BANK_DNC, BANK_DNC, uop_data_sel_narrow_out_dec, uop_data_opcode_dec); // UOP_OPCODE_REGULAR_MULTIPLY: begin - update_mmm_params(2'b11, uop_data_sel_wide_in, uop_data_sel_narrow_in, 1'b0, 1'b0, 1'b1); - update_rdct_params(uop_data_sel_wide_out, uop_data_sel_narrow_out); + update_mmm_params(2'b11, uop_data_sel_wide_in_dec, uop_data_sel_narrow_in_dec, 1'b0, 1'b0, 1'b1); + update_rdct_params(uop_data_sel_wide_out_dec, uop_data_sel_narrow_out_dec); end // UOP_OPCODE_REGULAR_ADD_UNEVEN: - update_wrk_params(uop_data_sel_wide_in, uop_data_sel_narrow_in, BANK_DNC, uop_data_sel_narrow_out, uop_data_opcode); + update_wrk_params(uop_data_sel_wide_in_dec, uop_data_sel_narrow_in_dec, BANK_DNC, 
uop_data_sel_narrow_out_dec, uop_data_opcode_dec); // UOP_OPCODE_LADDER_INIT, UOP_OPCODE_LADDER_STEP: - update_io_mgr_params(UOP_CRT_DNC, UOP_AUX_DNC, BANK_DNC, BANK_DNC, uop_data_opcode); + update_io_mgr_params(UOP_CRT_DNC, UOP_AUX_DNC, BANK_DNC, BANK_DNC, uop_data_opcode_dec); // endcase @@ -595,9 +608,9 @@ module modexpng_uop_engine always @(posedge clk) // - if (uop_fsm_state == UOP_FSM_STATE_DECODE) + if (uop_fsm_state == UOP_FSM_STATE_DECODE_2) // - case (uop_data_opcode) + case (uop_data_opcode_dec) // UOP_OPCODE_INPUT_TO_WIDE, UOP_OPCODE_INPUT_TO_NARROW, @@ -660,10 +673,11 @@ module modexpng_uop_engine always @* begin // case (uop_fsm_state) - UOP_FSM_STATE_IDLE: uop_fsm_state_next = ena ? UOP_FSM_STATE_FETCH : UOP_FSM_STATE_IDLE; - UOP_FSM_STATE_FETCH: uop_fsm_state_next = UOP_FSM_STATE_DECODE ; - UOP_FSM_STATE_DECODE: uop_fsm_state_next = uop_opcode_is_stop ? UOP_FSM_STATE_IDLE : UOP_FSM_STATE_BUSY; - UOP_FSM_STATE_BUSY: uop_fsm_state_next = uop_exit_from_busy ? UOP_FSM_STATE_FETCH : UOP_FSM_STATE_BUSY; + UOP_FSM_STATE_IDLE: uop_fsm_state_next = ena ? UOP_FSM_STATE_FETCH : UOP_FSM_STATE_IDLE; + UOP_FSM_STATE_FETCH: uop_fsm_state_next = UOP_FSM_STATE_DECODE_1 ; + UOP_FSM_STATE_DECODE_1: uop_fsm_state_next = UOP_FSM_STATE_DECODE_2 ; + UOP_FSM_STATE_DECODE_2: uop_fsm_state_next = uop_opcode_is_stop ? UOP_FSM_STATE_IDLE : UOP_FSM_STATE_BUSY; + UOP_FSM_STATE_BUSY: uop_fsm_state_next = uop_exit_from_busy ? UOP_FSM_STATE_FETCH : UOP_FSM_STATE_BUSY; endcase // end @@ -677,10 +691,10 @@ module modexpng_uop_engine always @(posedge clk or negedge rst_n) // - if (!rst_n) rdy_r <= 1'b1; + if (!rst_n) rdy_r <= 1'b1; else case (uop_fsm_state) - UOP_FSM_STATE_IDLE: rdy_r <= ~ena; - UOP_FSM_STATE_DECODE: rdy_r <= uop_opcode_is_stop; + UOP_FSM_STATE_IDLE: rdy_r <= ~ena; + UOP_FSM_STATE_DECODE_2: rdy_r <= uop_opcode_is_stop; endcase -- cgit v1.2.3