From 3213b3ef3c1d40dfa416b6be409cfa3d15af0930 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Mon, 21 Oct 2019 12:46:22 +0300 Subject: Added "MERGE_LH" micro-operation. To be able to do Garner's formula we need regular (not modular) multiplication. We're doing this by telling the modular multiplier to stop after the "square" step, which computes A*B. The problem is that the multiplier stores the lower part of the product in the internal bank L and the upper part in the internal bank H, but we need to be able to do operations on the product as a whole. The new MERGE_LH micro-operation combines the two halves of the product into one bank. --- rtl/modexpng_core_top.v | 41 ++++++++++++++++++++++++++++++-- rtl/modexpng_general_worker.v | 55 ++++++++++++++++++++++++++++++++++++++++--- rtl/modexpng_microcode.vh | 4 ++++ rtl/modexpng_mmm_dual.v | 13 ++++++---- rtl/modexpng_uop_rom.v | 11 +++++++++ 5 files changed, 115 insertions(+), 9 deletions(-) diff --git a/rtl/modexpng_core_top.v b/rtl/modexpng_core_top.v index dea7f0a..4c1f065 100644 --- a/rtl/modexpng_core_top.v +++ b/rtl/modexpng_core_top.v @@ -83,13 +83,15 @@ module modexpng_core_top wire uop_opcode_is_ladder = (uop_data_opcode == UOP_OPCODE_LADDER_INIT ) || (uop_data_opcode == UOP_OPCODE_LADDER_STEP ) ; wire uop_opcode_is_mmm = (uop_data_opcode == UOP_OPCODE_MODULAR_MULTIPLY ) || - (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_PROC ) ; + (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_PROC ) || + (uop_data_opcode == UOP_OPCODE_REGULAR_MULTIPLY ) ; wire uop_opcode_is_wrk = (uop_data_opcode == UOP_OPCODE_PROPAGATE_CARRIES ) || (uop_data_opcode == UOP_OPCODE_COPY_CRT_Y2X ) || (uop_data_opcode == UOP_OPCODE_MODULAR_REDUCE_INIT ) || (uop_data_opcode == UOP_OPCODE_COPY_LADDERS_X2Y ) || (uop_data_opcode == UOP_OPCODE_CROSS_LADDERS_X2Y ) || - (uop_data_opcode == UOP_OPCODE_MODULAR_SUBTRACT ) ; + (uop_data_opcode == UOP_OPCODE_MODULAR_SUBTRACT ) || + (uop_data_opcode == UOP_OPCODE_MERGE_LH ) ; wire uop_loop_now; @@ -716,6
+718,9 @@ module modexpng_core_top reg mmm_only_reduce_x; reg mmm_only_reduce_y; + reg mmm_just_multiply_x; + reg mmm_just_multiply_y; + wire rdct_ena_x; wire rdct_ena_y; wire rdct_rdy_x; @@ -734,6 +739,7 @@ module modexpng_core_top .word_index_last_minus1 (mmm_word_index_last_minus1_x), .force_unity_b (mmm_force_unity_b_x), .only_reduce (mmm_only_reduce_x), + .just_multiply (mmm_just_multiply_x), .sel_wide_in (mmm_sel_wide_in_x), .sel_narrow_in (mmm_sel_narrow_in_x), @@ -790,6 +796,7 @@ module modexpng_core_top .word_index_last_minus1 (mmm_word_index_last_minus1_y), .force_unity_b (mmm_force_unity_b_y), .only_reduce (mmm_only_reduce_y), + .just_multiply (mmm_just_multiply_y), .sel_wide_in (mmm_sel_wide_in_y), .sel_narrow_in (mmm_sel_narrow_in_y), @@ -1088,6 +1095,7 @@ module modexpng_core_top UOP_LADDER_PQ: {mmm_ladder_mode_x, mmm_ladder_mode_y} <= {io_mgr_ladder_p, io_mgr_ladder_q}; endcase // + {mmm_just_multiply_x, mmm_just_multiply_y } <= {2{1'b0}}; {mmm_only_reduce_x, mmm_only_reduce_y } <= {2{1'b0}}; {mmm_force_unity_b_x, mmm_force_unity_b_y } <= {2{uop_aux_is_1 ? 1'b0 : 1'b1}}; {mmm_sel_wide_in_x, mmm_sel_wide_in_y } <= {2{uop_data_sel_wide_in }}; @@ -1110,6 +1118,20 @@ module modexpng_core_top // end // + UOP_OPCODE_REGULAR_MULTIPLY: begin + // + {mmm_ladder_mode_x, mmm_ladder_mode_y } <= {2{1'b1}}; + // + {mmm_just_multiply_x, mmm_just_multiply_y } <= {2{1'b1}}; + {mmm_only_reduce_x, mmm_only_reduce_y } <= {2{1'b0}}; + {mmm_force_unity_b_x, mmm_force_unity_b_y } <= {2{uop_aux_is_1 ? 
1'b0 : 1'b1}}; + {mmm_sel_wide_in_x, mmm_sel_wide_in_y } <= {2{uop_data_sel_wide_in }}; + {mmm_sel_narrow_in_x, mmm_sel_narrow_in_y } <= {2{uop_data_sel_narrow_in }}; + {rdct_sel_wide_out_x, rdct_sel_wide_out_y } <= {2{uop_data_sel_wide_out }}; + {rdct_sel_narrow_out_x, rdct_sel_narrow_out_y} <= {2{uop_data_sel_narrow_out }}; + // + end + // UOP_OPCODE_PROPAGATE_CARRIES: begin wrk_sel_narrow_in <= uop_data_sel_narrow_in; wrk_sel_narrow_out <= uop_data_sel_narrow_out; @@ -1121,6 +1143,10 @@ module modexpng_core_top wrk_sel_narrow_out <= uop_data_sel_narrow_out; end // + UOP_OPCODE_MERGE_LH: begin + wrk_sel_narrow_out <= uop_data_sel_narrow_out; + end + // UOP_OPCODE_COPY_CRT_Y2X, UOP_OPCODE_COPY_LADDERS_X2Y, UOP_OPCODE_CROSS_LADDERS_X2Y: begin @@ -1181,10 +1207,21 @@ module modexpng_core_top {rdct_word_index_last_x, rdct_word_index_last_y } <= {2{word_index_last_pq }}; end // + UOP_OPCODE_REGULAR_MULTIPLY: begin + {mmm_word_index_last_x, mmm_word_index_last_y } <= {2{word_index_last_pq }}; + {mmm_word_index_last_minus1_x, mmm_word_index_last_minus1_y} <= {2{word_index_last_pq_minus1}}; + {rdct_word_index_last_x, rdct_word_index_last_y } <= {2{word_index_last_pq }}; + end + // UOP_OPCODE_MODULAR_SUBTRACT: begin wrk_word_index_last <= uop_npq_is_n ? word_index_last_n : word_index_last_pq; end // + UOP_OPCODE_MERGE_LH: begin + wrk_word_index_last <= word_index_last_n; + wrk_word_index_last_half <= word_index_last_pq; + end + // UOP_OPCODE_LADDER_INIT: begin io_mgr_word_index_last <= OP_ADDR_LADDER_LAST; io_mgr_ladder_steps <= crt_mode ? 
bit_index_last_pq : bit_index_last_n; diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v index 74c939b..d82a120 100644 --- a/rtl/modexpng_general_worker.v +++ b/rtl/modexpng_general_worker.v @@ -334,6 +334,10 @@ module modexpng_general_worker // end // + UOP_OPCODE_MERGE_LH: + // + enable_wide_xy_rd_en; + // endcase // endcase @@ -424,7 +428,8 @@ module modexpng_general_worker // case (opcode) // - UOP_OPCODE_PROPAGATE_CARRIES: + UOP_OPCODE_PROPAGATE_CARRIES, + UOP_OPCODE_MERGE_LH: // enable_narrow_xy_wr_en; // @@ -738,6 +743,13 @@ module modexpng_general_worker wrk_rd_narrow_x_din_y, wrk_rd_narrow_y_din_y); // + UOP_OPCODE_MERGE_LH: + // + update_narrow_dout(wrk_rd_wide_x_din_x, + wrk_rd_wide_y_din_x, + wrk_rd_wide_x_din_y, + wrk_rd_wide_y_din_y); + // endcase // endcase @@ -819,6 +831,8 @@ module modexpng_general_worker reg [OP_ADDR_W -1:0] rd_wide_xy_addr_xy_next; reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_xy_next; + reg rd_wide_xy_addr_xy_next_last_seen; + wire rd_wide_xy_addr_xy_next_is_last = rd_wide_xy_addr_xy_next == word_index_last_half; wire rd_narrow_xy_addr_xy_next_is_last = rd_narrow_xy_addr_xy_next == word_index_last; @@ -874,6 +888,22 @@ module modexpng_general_worker rd_narrow_xy_addr_xy_next <= !rd_narrow_xy_addr_xy_next_is_last ? 
rd_narrow_xy_addr_xy_next + 1'b1 : OP_ADDR_ZERO; endtask + always @(posedge clk) + // + case (wrk_fsm_state_next_one_pass) + // + WRK_FSM_STATE_LATENCY_PRE1: + // + rd_wide_xy_addr_xy_next_last_seen <= 1'b0; + // + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_BUSY: + // + if (!rd_wide_xy_addr_xy_next_last_seen) + rd_wide_xy_addr_xy_next_last_seen <= rd_wide_xy_addr_xy_next_is_last; + // + endcase + always @(posedge clk) begin // update_rd_wide_bank_addr (BANK_DNC, OP_ADDR_DNC); @@ -897,6 +927,11 @@ module modexpng_general_worker // end // + UOP_OPCODE_MERGE_LH: begin + update_rd_wide_bank_addr (BANK_WIDE_L, OP_ADDR_ZERO); update_rd_wide_addr_next (OP_ADDR_ONE); + update_rd_narrow_bank_addr(sel_narrow_in, OP_ADDR_ZERO); update_rd_narrow_addr_next(OP_ADDR_ONE); + end + // endcase // WRK_FSM_STATE_LATENCY_PRE2, @@ -920,6 +955,15 @@ module modexpng_general_worker // end // + UOP_OPCODE_MERGE_LH: begin + // + if (!rd_wide_xy_addr_xy_next_last_seen) update_rd_wide_bank_addr (BANK_WIDE_L, rd_wide_xy_addr_xy_next ); + else update_rd_wide_bank_addr (BANK_WIDE_H, rd_wide_xy_addr_xy_next ); + advance_rd_wide_addr_next ; + update_rd_narrow_bank_addr(sel_narrow_in, rd_narrow_xy_addr_xy_next); advance_rd_narrow_addr_next; + // + end + // endcase // endcase @@ -1060,6 +1104,9 @@ module modexpng_general_worker UOP_OPCODE_MODULAR_REDUCE_INIT: update_wr_wide_bank_addr(uop_modular_reduce_init_bank_x, uop_modular_reduce_init_bank_y, rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_y_dly2); // + UOP_OPCODE_MERGE_LH: + update_wr_narrow_bank_addr(sel_narrow_out, sel_narrow_out, rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_y_dly2); + // endcase // endcase @@ -1121,7 +1168,8 @@ module modexpng_general_worker UOP_OPCODE_PROPAGATE_CARRIES, UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_MODULAR_REDUCE_INIT: wrk_fsm_state <= wrk_fsm_state_next_one_pass; + UOP_OPCODE_MODULAR_REDUCE_INIT, + UOP_OPCODE_MERGE_LH: wrk_fsm_state <= wrk_fsm_state_next_one_pass; UOP_OPCODE_COPY_LADDERS_X2Y, 
UOP_OPCODE_CROSS_LADDERS_X2Y: wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander; UOP_OPCODE_MODULAR_SUBTRACT: wrk_fsm_state <= wrk_fsm_state_next_two_pass; @@ -1148,7 +1196,8 @@ module modexpng_general_worker UOP_OPCODE_PROPAGATE_CARRIES, UOP_OPCODE_OUTPUT_FROM_NARROW, UOP_OPCODE_COPY_CRT_Y2X, - UOP_OPCODE_MODULAR_REDUCE_INIT: + UOP_OPCODE_MODULAR_REDUCE_INIT, + UOP_OPCODE_MERGE_LH: // case (wrk_fsm_state) WRK_FSM_STATE_BUSY: diff --git a/rtl/modexpng_microcode.vh b/rtl/modexpng_microcode.vh index 3493e26..47cdeb2 100644 --- a/rtl/modexpng_microcode.vh +++ b/rtl/modexpng_microcode.vh @@ -78,6 +78,10 @@ localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_PROPAGATE_CARRIES = 5'd12; * source and destination WIDE are don't care */ +localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_MERGE_LH = 5'd13; + +localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_REGULAR_MULTIPLY = 5'd14; + localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_LADDER_INIT = 5'd16; localparam [UOP_OPCODE_W -1:0] UOP_OPCODE_LADDER_STEP = 5'd17; /* CRT is don't care diff --git a/rtl/modexpng_mmm_dual.v b/rtl/modexpng_mmm_dual.v index 6e52a97..13a8773 100644 --- a/rtl/modexpng_mmm_dual.v +++ b/rtl/modexpng_mmm_dual.v @@ -9,6 +9,7 @@ module modexpng_mmm_dual word_index_last_minus1, force_unity_b, only_reduce, + just_multiply, sel_wide_in, sel_narrow_in, @@ -74,6 +75,7 @@ module modexpng_mmm_dual input [7:0] word_index_last_minus1; input force_unity_b; input only_reduce; + input just_multiply; input [BANK_ADDR_W-1:0] sel_wide_in; input [BANK_ADDR_W-1:0] sel_narrow_in; @@ -127,6 +129,7 @@ module modexpng_mmm_dual wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_square; wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_triangle; wire [FSM_STATE_WIDTH-1:0] fsm_state_after_mult_rectangle; + wire [FSM_STATE_WIDTH-1:0] fsm_state_after_square_holdoff; // @@ -911,7 +914,9 @@ module modexpng_mmm_dual assign fsm_state_after_idle = !only_reduce ? 
FSM_STATE_MULT_SQUARE_COL_0_INIT : FSM_STATE_MULT_TRIANGLE_COL_0_INIT; assign fsm_state_after_mult_square = col_is_last ? FSM_STATE_MULT_SQUARE_HOLDOFF : FSM_STATE_MULT_SQUARE_COL_N_INIT; assign fsm_state_after_mult_triangle = col_is_last ? FSM_STATE_MULT_TRIANGLE_HOLDOFF : FSM_STATE_MULT_TRIANGLE_COL_N_INIT; - assign fsm_state_after_mult_rectangle = col_is_last ? FSM_STATE_MULT_RECTANGLE_HOLDOFF : FSM_STATE_MULT_RECTANGLE_COL_N_INIT; + assign fsm_state_after_mult_rectangle = col_is_last ? FSM_STATE_MULT_RECTANGLE_HOLDOFF : FSM_STATE_MULT_RECTANGLE_COL_N_INIT; + assign fsm_state_after_square_holdoff = just_multiply ? FSM_STATE_STOP : FSM_STATE_MULT_TRIANGLE_COL_0_INIT; + always @* begin // @@ -928,7 +933,7 @@ module modexpng_mmm_dual FSM_STATE_MULT_SQUARE_COL_N_TRIG: fsm_state_next = FSM_STATE_MULT_SQUARE_COL_N_BUSY ; FSM_STATE_MULT_SQUARE_COL_N_BUSY: fsm_state_next = square_done ? fsm_state_after_mult_square : FSM_STATE_MULT_SQUARE_COL_N_BUSY; - FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = rcmb_rdy ? FSM_STATE_MULT_TRIANGLE_COL_0_INIT : FSM_STATE_MULT_SQUARE_HOLDOFF; + FSM_STATE_MULT_SQUARE_HOLDOFF: fsm_state_next = rcmb_rdy ? fsm_state_after_square_holdoff : FSM_STATE_MULT_SQUARE_HOLDOFF; FSM_STATE_MULT_TRIANGLE_COL_0_INIT: fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_0_TRIG ; FSM_STATE_MULT_TRIANGLE_COL_0_TRIG: fsm_state_next = FSM_STATE_MULT_TRIANGLE_COL_0_BUSY ; @@ -952,9 +957,9 @@ module modexpng_mmm_dual FSM_STATE_WAIT_REDUCTOR: fsm_state_next = rdct_rdy ? 
FSM_STATE_STOP : FSM_STATE_WAIT_REDUCTOR; - FSM_STATE_STOP: fsm_state_next = FSM_STATE_IDLE ; + FSM_STATE_STOP: fsm_state_next = FSM_STATE_IDLE ; - default: fsm_state_next = FSM_STATE_IDLE ; + default: fsm_state_next = FSM_STATE_IDLE ; endcase // diff --git a/rtl/modexpng_uop_rom.v b/rtl/modexpng_uop_rom.v index adc657a..61501f9 100644 --- a/rtl/modexpng_uop_rom.v +++ b/rtl/modexpng_uop_rom.v @@ -78,6 +78,17 @@ module modexpng_uop_rom // 6'd43: data <= {UOP_OPCODE_MODULAR_SUBTRACT, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_NARROW_D, BANK_WIDE_C, BANK_NARROW_C }; // // + 6'd44: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_1, UOP_LADDER_11, BANK_WIDE_C, BANK_NARROW_E, BANK_WIDE_C, BANK_NARROW_C }; // + 6'd45: data <= {UOP_OPCODE_MODULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_1, UOP_LADDER_11, BANK_WIDE_C, BANK_NARROW_A, BANK_WIDE_C, BANK_NARROW_C }; // + // + 6'd46: data <= {UOP_OPCODE_INPUT_TO_WIDE, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_Q, BANK_WIDE_E, BANK_DNC }; // + // + 6'd47: data <= {UOP_OPCODE_INPUT_TO_NARROW, UOP_CRT_X, UOP_NPQ_PQ, UOP_AUX_2, UOP_LADDER_DNC, BANK_DNC, BANK_IN_2_Q, BANK_DNC, BANK_NARROW_E }; // + // + 6'd48: data <= {UOP_OPCODE_REGULAR_MULTIPLY, UOP_CRT_DNC, UOP_NPQ_PQ, UOP_AUX_1, UOP_LADDER_11, BANK_WIDE_E, BANK_NARROW_C, BANK_DNC, BANK_DNC }; // + // + 6'd49: data <= {UOP_OPCODE_MERGE_LH, UOP_CRT_DNC, UOP_NPQ_DNC, UOP_AUX_DNC, UOP_LADDER_DNC, BANK_DNC, BANK_DNC, BANK_DNC, BANK_NARROW_A }; // + // default: data <= {UOP_OPCODE_STOP, UOP_CRT_DNC, UOP_NPQ_DNC, UOP_AUX_DNC, UOP_LADDER_DNC, UOP_SEL_DNC_ALL }; // endcase -- cgit v1.2.3