From 02247784f18dc683d5873a52c1650e72f02273b5 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Thu, 3 Oct 2019 16:50:25 +0300 Subject: Added more micro-operations, entire Montgomery exponentiation ladder works now. --- rtl/modexpng_general_worker.v | 402 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 364 insertions(+), 38 deletions(-) (limited to 'rtl/modexpng_general_worker.v') diff --git a/rtl/modexpng_general_worker.v b/rtl/modexpng_general_worker.v index c35f0b3..269ef98 100644 --- a/rtl/modexpng_general_worker.v +++ b/rtl/modexpng_general_worker.v @@ -14,6 +14,7 @@ module modexpng_general_worker opcode, word_index_last, + word_index_last_half, wrk_rd_wide_xy_ena_x, wrk_rd_wide_xy_bank_x, @@ -88,6 +89,7 @@ module modexpng_general_worker input [ UOP_OPCODE_W -1:0] opcode; input [ OP_ADDR_W -1:0] word_index_last; + input [ OP_ADDR_W -1:0] word_index_last_half; output wrk_rd_wide_xy_ena_x; output [ BANK_ADDR_W -1:0] wrk_rd_wide_xy_bank_x; @@ -141,18 +143,35 @@ module modexpng_general_worker // // FSM Declaration // - localparam [3:0] WRK_FSM_STATE_IDLE = 4'h0; - localparam [3:0] WRK_FSM_STATE_LATENCY_PRE1 = 4'h1; - localparam [3:0] WRK_FSM_STATE_LATENCY_PRE2 = 4'h2; - localparam [3:0] WRK_FSM_STATE_BUSY = 4'h3; - localparam [3:0] WRK_FSM_STATE_LATENCY_POST1 = 4'h5; // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug! - localparam [3:0] WRK_FSM_STATE_LATENCY_POST2 = 4'h6; - localparam [3:0] WRK_FSM_STATE_STOP = 4'h7; + localparam [4:0] WRK_FSM_STATE_IDLE = 5'h00; - reg [3:0] wrk_fsm_state = WRK_FSM_STATE_IDLE; - reg [3:0] wrk_fsm_state_next_one_pass; // single address space sweep + localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1 = 5'h01; + localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2 = 5'h02; + localparam [4:0] WRK_FSM_STATE_BUSY = 5'h03; + localparam [4:0] WRK_FSM_STATE_LATENCY_POST1 = 5'h05; // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug! + localparam [4:0] WRK_FSM_STATE_LATENCY_POST2 = 5'h06; + + localparam [4:0] WRK_FSM_STATE_STOP = 5'h07; + + localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M1 = 5'h10; + localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M2 = 5'h11; + localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M1 = 5'h12; + localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M2 = 5'h13; + localparam [4:0] WRK_FSM_STATE_BUSY_M1 = 5'h14; + localparam [4:0] WRK_FSM_STATE_BUSY_M2 = 5'h15; + localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 5'h16; + localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 5'h17; + localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 5'h18; + localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 5'h19; + + reg [4:0] wrk_fsm_state = WRK_FSM_STATE_IDLE; + reg [4:0] wrk_fsm_state_next_one_pass; // single address space sweep + reg [4:0] wrk_fsm_state_next_one_pass_meander; // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y) + // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps) + + // // Control Signals // @@ -244,32 +263,62 @@ module modexpng_general_worker // // Delays // - //reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly1; - //reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly2; - //reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly1; - //reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly2; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly1; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly2; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly3; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_dly4; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly1; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly2; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly3; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_dly4; reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly1; reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly2; + reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly3; + reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_dly4; reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly1; reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly2; + reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly3; + reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_dly4; + + reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly1; + reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly2; + reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly3; + + reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly1; + reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly2; + reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly3; + + reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3; + + reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2; + reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3; + always @(posedge clk) begin // - //{rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x}; - //{rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y}; + {rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x}; + {rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y}; + // + {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2}; + {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2}; // {rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x}; {rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y}; // + {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2}; + {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2}; + // + {wrk_rd_wide_x_din_x_dly3, wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1, wrk_rd_wide_x_din_x}; + {wrk_rd_wide_x_din_y_dly3, wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1} <= {wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1, wrk_rd_wide_x_din_y}; + // + {wrk_rd_narrow_x_din_x_dly3, wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1, wrk_rd_narrow_x_din_x}; + {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y}; + // end - - - // - // Handy Wires - // - wire rd_narrow_xy_addr_x_next_is_last; - wire rd_narrow_xy_addr_y_next_is_last; // @@ -310,7 +359,8 @@ module modexpng_general_worker case (opcode) // UOP_OPCODE_PROPAGATE_CARRIES, - UOP_OPCODE_OUTPUT_FROM_NARROW: + UOP_OPCODE_OUTPUT_FROM_NARROW, + UOP_OPCODE_MODULAR_REDUCE_INIT: // case (wrk_fsm_state_next_one_pass) // @@ -333,12 +383,30 @@ module modexpng_general_worker WRK_FSM_STATE_LATENCY_PRE2, WRK_FSM_STATE_BUSY: begin // - enable_narrow_xy_rd_en; enable_wide_xy_rd_en; + enable_narrow_xy_rd_en; // end // endcase + // + UOP_OPCODE_COPY_LADDERS_X2Y: + // + case (wrk_fsm_state_next_one_pass_meander) + // + WRK_FSM_STATE_LATENCY_PRE1_M1, + WRK_FSM_STATE_LATENCY_PRE1_M2, + WRK_FSM_STATE_LATENCY_PRE2_M1, + WRK_FSM_STATE_LATENCY_PRE2_M2, + WRK_FSM_STATE_BUSY_M1, + WRK_FSM_STATE_BUSY_M2: begin + // + enable_wide_xy_rd_en; + enable_narrow_xy_rd_en; + // + end + // + endcase // endcase // @@ -389,8 +457,7 @@ module modexpng_general_worker WRK_FSM_STATE_LATENCY_POST1, WRK_FSM_STATE_LATENCY_POST2: // - enable_narrow_xy_wr_en; - //{wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{1'b1}}; + enable_narrow_xy_wr_en; // // endcase @@ -405,7 +472,34 @@ module modexpng_general_worker // enable_wide_xy_wr_en; enable_narrow_xy_wr_en; - //{wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{1'b1}}; + // + end + // + endcase + // + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST2: + // + enable_wide_xy_wr_en; + // + // + endcase + // + UOP_OPCODE_COPY_LADDERS_X2Y: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_M2, + WRK_FSM_STATE_LATENCY_POST1_M2, + WRK_FSM_STATE_LATENCY_POST2_M2: begin + // + enable_wide_xy_wr_en; + enable_narrow_xy_wr_en; // end // @@ -424,7 +518,7 @@ module modexpng_general_worker reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r; reg [CARRY_W -1:0] rd_narrow_y_din_y_cry_r; - wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r}; + wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r}; wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry = wrk_rd_narrow_y_din_x + {{WORD_W{1'b0}}, rd_narrow_y_din_x_cry_r}; wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r}; wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r}; @@ -497,6 +591,45 @@ module modexpng_general_worker end // endcase + // + UOP_OPCODE_COPY_LADDERS_X2Y: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_M2, + WRK_FSM_STATE_LATENCY_POST1_M2, + WRK_FSM_STATE_LATENCY_POST2_M2: begin + // + wr_wide_x_dout_x <= wrk_rd_wide_x_din_x_dly3; + wr_wide_y_dout_x <= wrk_rd_wide_x_din_x_dly2; + wr_wide_x_dout_y <= wrk_rd_wide_x_din_y_dly3; + wr_wide_y_dout_y <= wrk_rd_wide_x_din_y_dly2; + // + wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_x_dly3; + wr_narrow_y_dout_x <= wrk_rd_narrow_x_din_x_dly2; + wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y_dly3; + wr_narrow_y_dout_y <= wrk_rd_narrow_x_din_y_dly2; + // + end + // + endcase + // + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST2: begin + // + wr_wide_x_dout_x <= wrk_rd_narrow_x_din_x; + wr_wide_y_dout_x <= wrk_rd_narrow_y_din_x; + wr_wide_x_dout_y <= wrk_rd_narrow_x_din_y; + wr_wide_y_dout_y <= wrk_rd_narrow_y_din_y; + // + end + // + endcase // endcase // @@ -506,6 +639,9 @@ module modexpng_general_worker // // Write Address Logic // + wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half; + wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half; + always @(posedge clk) begin // {wr_wide_xy_bank_x, wr_wide_xy_addr_x } <= {BANK_DNC, OP_ADDR_DNC}; @@ -534,22 +670,64 @@ module modexpng_general_worker // endcase // + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY, + WRK_FSM_STATE_LATENCY_POST1, + WRK_FSM_STATE_LATENCY_POST2: begin + // + wr_wide_xy_bank_x <= uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H; + wr_wide_xy_bank_y <= uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H; + // + wr_wide_xy_addr_x <= rd_wide_xy_addr_x_dly2; + wr_wide_xy_addr_y <= rd_wide_xy_addr_y_dly2; + // + end + // + endcase + // + UOP_OPCODE_COPY_LADDERS_X2Y: + // + case (wrk_fsm_state) + // + WRK_FSM_STATE_BUSY_M2, + WRK_FSM_STATE_LATENCY_POST1_M2, + WRK_FSM_STATE_LATENCY_POST2_M2: begin + // + {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly4}; + {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly4}; + // + {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly4}; + {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly4}; + // + end + // + endcase + // // endcase // end - - + + // // Read Address Logic // + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_next; + reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_next; + reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_next; reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_next; - assign rd_narrow_xy_addr_x_next_is_last = rd_narrow_xy_addr_x_next == word_index_last; - assign rd_narrow_xy_addr_y_next_is_last = rd_narrow_xy_addr_y_next == word_index_last; + wire rd_wide_xy_addr_x_next_is_last = rd_wide_xy_addr_x_next == word_index_last_half; + wire rd_wide_xy_addr_y_next_is_last = rd_wide_xy_addr_y_next == word_index_last_half; + + wire rd_narrow_xy_addr_x_next_is_last = rd_narrow_xy_addr_x_next == word_index_last; + wire rd_narrow_xy_addr_y_next_is_last = rd_narrow_xy_addr_y_next == word_index_last; - always @(posedge clk) begin + always @(posedge clk) begin // TODO: Maybe split into two blocks (read address / next address)?? // {rd_wide_xy_bank_x, rd_wide_xy_addr_x } <= {BANK_DNC, OP_ADDR_DNC}; // TODO: Add same default path for io_manager ?? {rd_wide_xy_bank_y, rd_wide_xy_addr_y } <= {BANK_DNC, OP_ADDR_DNC}; @@ -572,6 +750,9 @@ module modexpng_general_worker {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO}; {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO}; // + rd_wide_xy_addr_x_next <= OP_ADDR_ONE; + rd_wide_xy_addr_y_next <= OP_ADDR_ONE; + // rd_narrow_xy_addr_x_next <= OP_ADDR_ONE; rd_narrow_xy_addr_y_next <= OP_ADDR_ONE; // @@ -586,11 +767,113 @@ module modexpng_general_worker {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next}; {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next}; // + rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; + rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO; + // + rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1; + rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1; + // + end + // + endcase + // + UOP_OPCODE_MODULAR_REDUCE_INIT: + // + case (wrk_fsm_state_next_one_pass) + // + WRK_FSM_STATE_LATENCY_PRE1: begin + // + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO}; + // + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO}; + // + rd_wide_xy_addr_x_next <= OP_ADDR_ONE; + rd_wide_xy_addr_y_next <= OP_ADDR_ONE; + // + rd_narrow_xy_addr_x_next <= OP_ADDR_ONE; + rd_narrow_xy_addr_y_next <= OP_ADDR_ONE; + // + end + // + WRK_FSM_STATE_LATENCY_PRE2, + WRK_FSM_STATE_BUSY: begin + // + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x_next}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y_next}; + // + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next}; + // + rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; + rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO; + // + rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1; + rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1; + // + end + // + endcase + // + UOP_OPCODE_COPY_LADDERS_X2Y: + // + case (wrk_fsm_state_next_one_pass_meander) + // + WRK_FSM_STATE_LATENCY_PRE1_M1: begin + // + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, OP_ADDR_ZERO}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, OP_ADDR_ZERO}; + // + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, OP_ADDR_ZERO}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, OP_ADDR_ZERO}; + // + rd_wide_xy_addr_x_next <= OP_ADDR_ONE; + rd_wide_xy_addr_y_next <= OP_ADDR_ONE; + // + rd_narrow_xy_addr_x_next <= OP_ADDR_ONE; + rd_narrow_xy_addr_y_next <= OP_ADDR_ONE; + // + end + // + WRK_FSM_STATE_LATENCY_PRE1_M2: begin + // + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y}; + // + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y}; + // + end + // + WRK_FSM_STATE_LATENCY_PRE2_M1, + WRK_FSM_STATE_BUSY_M1: begin + // + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_next}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_next}; + // + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_next}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_next}; + // + rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO; + rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO; + // rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1; rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1; // end // + WRK_FSM_STATE_LATENCY_PRE2_M2, + WRK_FSM_STATE_BUSY_M2: begin + // + {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x}; + {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y}; + // + {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x}; + {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y}; + // + end + // endcase // // @@ -608,7 +891,9 @@ module modexpng_general_worker else case (opcode) UOP_OPCODE_PROPAGATE_CARRIES, UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_COPY_CRT_Y2X: wrk_fsm_state <= wrk_fsm_state_next_one_pass; + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_MODULAR_REDUCE_INIT: wrk_fsm_state <= wrk_fsm_state_next_one_pass; + UOP_OPCODE_COPY_LADDERS_X2Y: wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander; default: wrk_fsm_state <= WRK_FSM_STATE_IDLE; endcase @@ -616,17 +901,20 @@ module modexpng_general_worker // // Busy Exit Logic // - reg wrk_fsm_done_one_pass = 1'b0; + reg wrk_fsm_done_one_pass = 1'b0; + reg wrk_fsm_done_one_pass_meander = 1'b0; always @(posedge clk) begin // - wrk_fsm_done_one_pass <= 1'b0; + wrk_fsm_done_one_pass <= 1'b0; + wrk_fsm_done_one_pass_meander <= 1'b0; // case (opcode) // UOP_OPCODE_PROPAGATE_CARRIES, UOP_OPCODE_OUTPUT_FROM_NARROW, - UOP_OPCODE_COPY_CRT_Y2X: begin + UOP_OPCODE_COPY_CRT_Y2X, + UOP_OPCODE_MODULAR_REDUCE_INIT: begin // if (wrk_fsm_state == WRK_FSM_STATE_BUSY) begin // @@ -637,6 +925,20 @@ module modexpng_general_worker // end // + UOP_OPCODE_COPY_LADDERS_X2Y: begin + // + if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M2) begin + // + if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; // TODO: Check, whether both are necessary... + if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; + // + end + // + if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M1) + wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander; + // + end + // endcase // end @@ -654,7 +956,31 @@ module modexpng_general_worker WRK_FSM_STATE_BUSY: wrk_fsm_state_next_one_pass = wrk_fsm_done_one_pass ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY ; WRK_FSM_STATE_LATENCY_POST1: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_LATENCY_POST2 ; WRK_FSM_STATE_LATENCY_POST2: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_STOP ; - WRK_FSM_STATE_STOP: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE ; + WRK_FSM_STATE_STOP: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE ; + default: wrk_fsm_state_next_one_pass = WRK_FSM_STATE_IDLE ; + endcase + // + end + + always @* begin + // + case (wrk_fsm_state) + WRK_FSM_STATE_IDLE: wrk_fsm_state_next_one_pass_meander = ena ? WRK_FSM_STATE_LATENCY_PRE1_M1 : WRK_FSM_STATE_IDLE ; + // + WRK_FSM_STATE_LATENCY_PRE1_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE1_M2 ; + WRK_FSM_STATE_LATENCY_PRE1_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M1 ; + WRK_FSM_STATE_LATENCY_PRE2_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_PRE2_M2 ; + WRK_FSM_STATE_LATENCY_PRE2_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M1 ; + WRK_FSM_STATE_BUSY_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_BUSY_M2 ; + WRK_FSM_STATE_BUSY_M2: wrk_fsm_state_next_one_pass_meander = wrk_fsm_done_one_pass_meander ? WRK_FSM_STATE_LATENCY_POST1_M1 : WRK_FSM_STATE_BUSY_M1 ; + WRK_FSM_STATE_LATENCY_POST1_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST1_M2 ; + WRK_FSM_STATE_LATENCY_POST1_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M1 ; + WRK_FSM_STATE_LATENCY_POST2_M1: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_LATENCY_POST2_M2 ; + WRK_FSM_STATE_LATENCY_POST2_M2: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_STOP ; + // + WRK_FSM_STATE_STOP: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE ; + // + default: wrk_fsm_state_next_one_pass_meander = WRK_FSM_STATE_IDLE ; endcase // end -- cgit v1.2.3