//
// modexpng_general_worker
//
// FSM-driven helper that sweeps word addresses over the "wide" and "narrow"
// operand banks (each with an X and a Y copy, every copy delivering an x and
// a y data word) and, depending on `opcode`, generates read/write enables,
// bank selects, addresses and write data. The opcodes handled here are:
//
//   UOP_OPCODE_PROPAGATE_CARRIES    narrow -> narrow, adding the carry taken
//                                   from the previous word to each word
//   UOP_OPCODE_OUTPUT_FROM_NARROW   narrow read sweep only; this module
//                                   generates no write enables for it
//   UOP_OPCODE_COPY_CRT_Y2X         data read through the *_y ports is written
//                                   to both the *_x and *_y write ports
//   UOP_OPCODE_MODULAR_REDUCE_INIT  narrow read data is written to the wide
//                                   banks BANK_WIDE_L / BANK_WIDE_H
//   UOP_OPCODE_COPY_LADDERS_X2Y     "meander" sweep: every address is read
//                                   twice (sel_*_out bank, then sel_*_in bank)
//                                   and both results are written together
//
// Handshake: `ena` is sampled while the FSM is idle; `rdy` goes low on the
// clock edge that sees `ena` and returns high from the STOP state.
//
// NOTE(review): `opcode` and the sel_* inputs are not latched anywhere in this
// module, so they presumably must be held stable for the whole operation --
// confirm against the instantiating module.
//
module modexpng_general_worker
(
    clk, rst,
    ena, rdy,
    sel_narrow_in, sel_narrow_out,
    sel_wide_in, sel_wide_out,
    opcode,
    word_index_last, word_index_last_half,

    wrk_rd_wide_xy_ena_x, wrk_rd_wide_xy_bank_x, wrk_rd_wide_xy_addr_x, wrk_rd_wide_x_din_x, wrk_rd_wide_y_din_x,
    wrk_rd_narrow_xy_ena_x, wrk_rd_narrow_xy_bank_x, wrk_rd_narrow_xy_addr_x, wrk_rd_narrow_x_din_x, wrk_rd_narrow_y_din_x,
    wrk_rd_wide_xy_ena_y, wrk_rd_wide_xy_bank_y, wrk_rd_wide_xy_addr_y, wrk_rd_wide_x_din_y, wrk_rd_wide_y_din_y,
    wrk_rd_narrow_xy_ena_y, wrk_rd_narrow_xy_bank_y, wrk_rd_narrow_xy_addr_y, wrk_rd_narrow_x_din_y, wrk_rd_narrow_y_din_y,

    wrk_wr_wide_xy_ena_x, wrk_wr_wide_xy_bank_x, wrk_wr_wide_xy_addr_x, wrk_wr_wide_x_dout_x, wrk_wr_wide_y_dout_x,
    wrk_wr_narrow_xy_ena_x, wrk_wr_narrow_xy_bank_x, wrk_wr_narrow_xy_addr_x, wrk_wr_narrow_x_dout_x, wrk_wr_narrow_y_dout_x,
    wrk_wr_wide_xy_ena_y, wrk_wr_wide_xy_bank_y, wrk_wr_wide_xy_addr_y, wrk_wr_wide_x_dout_y, wrk_wr_wide_y_dout_y,
    wrk_wr_narrow_xy_ena_y, wrk_wr_narrow_xy_bank_y, wrk_wr_narrow_xy_addr_y, wrk_wr_narrow_x_dout_y, wrk_wr_narrow_y_dout_y
);

    //
    // Headers
    //
    // All widths (BANK_ADDR_W, OP_ADDR_W, WORD_W, WORD_EXT_W, CARRY_W, ...),
    // the *_DNC / *_ZERO / *_ONE constants, BANK_WIDE_L/H and the UOP_OPCODE_*
    // values used below are expected to come from these two headers.
    //
    `include "modexpng_parameters.vh"
    `include "modexpng_microcode.vh"

    //
    // Ports
    //
    input                         clk;
    input                         rst;      // synchronous, active-high

    input                         ena;      // start request, sampled in IDLE
    output                        rdy;      // high while no operation is in progress

    // bank selects: *_in / *_out name the two banks used by the operation
    input  [ BANK_ADDR_W -1:0]    sel_narrow_in;
    input  [ BANK_ADDR_W -1:0]    sel_narrow_out;
    input  [ BANK_ADDR_W -1:0]    sel_wide_in;
    input  [ BANK_ADDR_W -1:0]    sel_wide_out;

    input  [UOP_OPCODE_W -1:0]    opcode;

    // word_index_last terminates the narrow sweep,
    // word_index_last_half is where the wide "next" pointer wraps to zero
    input  [   OP_ADDR_W -1:0]    word_index_last;
    input  [   OP_ADDR_W -1:0]    word_index_last_half;

    // read side, X copy
    output                        wrk_rd_wide_xy_ena_x;
    output [ BANK_ADDR_W -1:0]    wrk_rd_wide_xy_bank_x;
    output [   OP_ADDR_W -1:0]    wrk_rd_wide_xy_addr_x;
    input  [  WORD_EXT_W -1:0]    wrk_rd_wide_x_din_x;
    input  [  WORD_EXT_W -1:0]    wrk_rd_wide_y_din_x;

    output                        wrk_rd_narrow_xy_ena_x;
    output [ BANK_ADDR_W -1:0]    wrk_rd_narrow_xy_bank_x;
    output [   OP_ADDR_W -1:0]    wrk_rd_narrow_xy_addr_x;
    input  [  WORD_EXT_W -1:0]    wrk_rd_narrow_x_din_x;
    input  [  WORD_EXT_W -1:0]    wrk_rd_narrow_y_din_x;

    // read side, Y copy
    output                        wrk_rd_wide_xy_ena_y;
    output [ BANK_ADDR_W -1:0]    wrk_rd_wide_xy_bank_y;
    output [   OP_ADDR_W -1:0]    wrk_rd_wide_xy_addr_y;
    input  [  WORD_EXT_W -1:0]    wrk_rd_wide_x_din_y;
    input  [  WORD_EXT_W -1:0]    wrk_rd_wide_y_din_y;

    output                        wrk_rd_narrow_xy_ena_y;
    output [ BANK_ADDR_W -1:0]    wrk_rd_narrow_xy_bank_y;
    output [   OP_ADDR_W -1:0]    wrk_rd_narrow_xy_addr_y;
    input  [  WORD_EXT_W -1:0]    wrk_rd_narrow_x_din_y;
    input  [  WORD_EXT_W -1:0]    wrk_rd_narrow_y_din_y;

    // write side, X copy
    output                        wrk_wr_wide_xy_ena_x;
    output [ BANK_ADDR_W -1:0]    wrk_wr_wide_xy_bank_x;
    output [   OP_ADDR_W -1:0]    wrk_wr_wide_xy_addr_x;
    output [  WORD_EXT_W -1:0]    wrk_wr_wide_x_dout_x;
    output [  WORD_EXT_W -1:0]    wrk_wr_wide_y_dout_x;

    output                        wrk_wr_narrow_xy_ena_x;
    output [ BANK_ADDR_W -1:0]    wrk_wr_narrow_xy_bank_x;
    output [   OP_ADDR_W -1:0]    wrk_wr_narrow_xy_addr_x;
    output [  WORD_EXT_W -1:0]    wrk_wr_narrow_x_dout_x;
    output [  WORD_EXT_W -1:0]    wrk_wr_narrow_y_dout_x;

    // write side, Y copy
    output                        wrk_wr_wide_xy_ena_y;
    output [ BANK_ADDR_W -1:0]    wrk_wr_wide_xy_bank_y;
    output [   OP_ADDR_W -1:0]    wrk_wr_wide_xy_addr_y;
    output [  WORD_EXT_W -1:0]    wrk_wr_wide_x_dout_y;
    output [  WORD_EXT_W -1:0]    wrk_wr_wide_y_dout_y;

    output                        wrk_wr_narrow_xy_ena_y;
    output [ BANK_ADDR_W -1:0]    wrk_wr_narrow_xy_bank_y;
    output [   OP_ADDR_W -1:0]    wrk_wr_narrow_xy_addr_y;
    output [  WORD_EXT_W -1:0]    wrk_wr_narrow_x_dout_y;
    output [  WORD_EXT_W -1:0]    wrk_wr_narrow_y_dout_y;

    //
    // FSM Declaration
    //
    // States 5'h00..5'h07 form the "one pass" sequence. States 5'h10..5'h19
    // are the "meander" sequence, in which every stage of the one-pass
    // sequence is split into two phases, *_M1 followed by *_M2. IDLE and STOP
    // are shared by both sequences.
    //
    localparam [4:0] WRK_FSM_STATE_IDLE             = 5'h00;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1     = 5'h01;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2     = 5'h02;
    localparam [4:0] WRK_FSM_STATE_BUSY             = 5'h03;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1    = 5'h05; // NOTE: 4 is skipped to match the numbering in IO_MANAGER to ease debug!
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2    = 5'h06;
    localparam [4:0] WRK_FSM_STATE_STOP             = 5'h07;

    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M1  = 5'h10;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE1_M2  = 5'h11;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M1  = 5'h12;
    localparam [4:0] WRK_FSM_STATE_LATENCY_PRE2_M2  = 5'h13;
    localparam [4:0] WRK_FSM_STATE_BUSY_M1          = 5'h14;
    localparam [4:0] WRK_FSM_STATE_BUSY_M2          = 5'h15;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M1 = 5'h16;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST1_M2 = 5'h17;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M1 = 5'h18;
    localparam [4:0] WRK_FSM_STATE_LATENCY_POST2_M2 = 5'h19;

    // Two candidate next states are computed combinationally from the same
    // current state; the FSM process picks one of them according to `opcode`.
    reg [4:0] wrk_fsm_state = WRK_FSM_STATE_IDLE;
    reg [4:0] wrk_fsm_state_next_one_pass;          // single address space sweep
    reg [4:0] wrk_fsm_state_next_one_pass_meander;  // single address space sweep with interleaving source/destination banks (needed by copy_ladders_x2y)

    // TODO: Comment on how narrow/wide address increment works (narrow is one long sweep, wide is two twice shorter sweeps)

    //
    // Control Signals
    //
    // Registered versions of every output; they are mapped 1:1 onto the
    // wrk_* ports in the "Mapping" section. Only the enables have an initial
    // value; banks, addresses and data get a don't-care default every clock.
    //
    reg                      rd_wide_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0]   rd_wide_xy_bank_x;
    reg [  OP_ADDR_W -1:0]   rd_wide_xy_addr_x;

    reg                      rd_narrow_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0]   rd_narrow_xy_bank_x;
    reg [  OP_ADDR_W -1:0]   rd_narrow_xy_addr_x;

    reg                      rd_wide_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0]   rd_wide_xy_bank_y;
    reg [  OP_ADDR_W -1:0]   rd_wide_xy_addr_y;

    reg                      rd_narrow_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0]   rd_narrow_xy_bank_y;
    reg [  OP_ADDR_W -1:0]   rd_narrow_xy_addr_y;

    reg                      wr_wide_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0]   wr_wide_xy_bank_x;
    reg [  OP_ADDR_W -1:0]   wr_wide_xy_addr_x;
    reg [ WORD_EXT_W -1:0]   wr_wide_x_dout_x;
    reg [ WORD_EXT_W -1:0]   wr_wide_y_dout_x;

    reg                      wr_narrow_xy_ena_x = 1'b0;
    reg [BANK_ADDR_W -1:0]   wr_narrow_xy_bank_x;
    reg [  OP_ADDR_W -1:0]   wr_narrow_xy_addr_x;
    reg [ WORD_EXT_W -1:0]   wr_narrow_x_dout_x;
    reg [ WORD_EXT_W -1:0]   wr_narrow_y_dout_x;

    reg                      wr_wide_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0]   wr_wide_xy_bank_y;
    reg [  OP_ADDR_W -1:0]   wr_wide_xy_addr_y;
    reg [ WORD_EXT_W -1:0]   wr_wide_x_dout_y;
    reg [ WORD_EXT_W -1:0]   wr_wide_y_dout_y;

    reg                      wr_narrow_xy_ena_y = 1'b0;
    reg [BANK_ADDR_W -1:0]   wr_narrow_xy_bank_y;
    reg [  OP_ADDR_W -1:0]   wr_narrow_xy_addr_y;
    reg [ WORD_EXT_W -1:0]   wr_narrow_x_dout_y;
    reg [ WORD_EXT_W -1:0]   wr_narrow_y_dout_y;

    //
    // Mapping
    //
    assign wrk_rd_wide_xy_ena_x    = rd_wide_xy_ena_x;
    assign wrk_rd_wide_xy_bank_x   = rd_wide_xy_bank_x;
    assign wrk_rd_wide_xy_addr_x   = rd_wide_xy_addr_x;

    assign wrk_rd_narrow_xy_ena_x  = rd_narrow_xy_ena_x;
    assign wrk_rd_narrow_xy_bank_x = rd_narrow_xy_bank_x;
    assign wrk_rd_narrow_xy_addr_x = rd_narrow_xy_addr_x;

    assign wrk_rd_wide_xy_ena_y    = rd_wide_xy_ena_y;
    assign wrk_rd_wide_xy_bank_y   = rd_wide_xy_bank_y;
    assign wrk_rd_wide_xy_addr_y   = rd_wide_xy_addr_y;

    assign wrk_rd_narrow_xy_ena_y  = rd_narrow_xy_ena_y;
    assign wrk_rd_narrow_xy_bank_y = rd_narrow_xy_bank_y;
    assign wrk_rd_narrow_xy_addr_y = rd_narrow_xy_addr_y;

    assign wrk_wr_wide_xy_ena_x    = wr_wide_xy_ena_x;
    assign wrk_wr_wide_xy_bank_x   = wr_wide_xy_bank_x;
    assign wrk_wr_wide_xy_addr_x   = wr_wide_xy_addr_x;
    assign wrk_wr_wide_x_dout_x    = wr_wide_x_dout_x;
    assign wrk_wr_wide_y_dout_x    = wr_wide_y_dout_x;

    assign wrk_wr_narrow_xy_ena_x  = wr_narrow_xy_ena_x;
    assign wrk_wr_narrow_xy_bank_x = wr_narrow_xy_bank_x;
    assign wrk_wr_narrow_xy_addr_x = wr_narrow_xy_addr_x;
    assign wrk_wr_narrow_x_dout_x  = wr_narrow_x_dout_x;
    assign wrk_wr_narrow_y_dout_x  = wr_narrow_y_dout_x;

    assign wrk_wr_wide_xy_ena_y    = wr_wide_xy_ena_y;
    assign wrk_wr_wide_xy_bank_y   = wr_wide_xy_bank_y;
    assign wrk_wr_wide_xy_addr_y   = wr_wide_xy_addr_y;
    assign wrk_wr_wide_x_dout_y    = wr_wide_x_dout_y;
    assign wrk_wr_wide_y_dout_y    = wr_wide_y_dout_y;

    assign wrk_wr_narrow_xy_ena_y  = wr_narrow_xy_ena_y;
    assign wrk_wr_narrow_xy_bank_y = wr_narrow_xy_bank_y;
    assign wrk_wr_narrow_xy_addr_y = wr_narrow_xy_addr_y;
    assign wrk_wr_narrow_x_dout_y  = wr_narrow_x_dout_y;
    assign wrk_wr_narrow_y_dout_y  = wr_narrow_y_dout_y;

    //
    // Delays
    //
    // Free-running shift registers (no reset, no enable):
    //   * every read address is delayed by 1..4 clocks; the write address
    //     logic taps _dly2 (one-pass opcodes) or _dly4 (meander opcode);
    //   * the x data word of every read port is delayed by 1..3 clocks; the
    //     meander opcode uses _dly2 / _dly3 to pair up two consecutive reads.
    //
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_x_dly1;
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_x_dly2;
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_x_dly3;
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_x_dly4;

    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_y_dly1;
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_y_dly2;
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_y_dly3;
    reg [OP_ADDR_W  -1:0] rd_wide_xy_addr_y_dly4;

    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_x_dly1;
    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_x_dly2;
    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_x_dly3;
    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_x_dly4;

    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_y_dly1;
    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_y_dly2;
    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_y_dly3;
    reg [OP_ADDR_W  -1:0] rd_narrow_xy_addr_y_dly4;

    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_x_dly3;

    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_wide_x_din_y_dly3;

    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_x_dly3;

    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly1;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly2;
    reg [WORD_EXT_W -1:0] wrk_rd_narrow_x_din_y_dly3;

    always @(posedge clk) begin
        // address pipelines, stages 1..2
        {rd_wide_xy_addr_x_dly2, rd_wide_xy_addr_x_dly1} <= {rd_wide_xy_addr_x_dly1, rd_wide_xy_addr_x};
        {rd_wide_xy_addr_y_dly2, rd_wide_xy_addr_y_dly1} <= {rd_wide_xy_addr_y_dly1, rd_wide_xy_addr_y};
        // address pipelines, stages 3..4
        {rd_wide_xy_addr_x_dly4, rd_wide_xy_addr_x_dly3} <= {rd_wide_xy_addr_x_dly3, rd_wide_xy_addr_x_dly2};
        {rd_wide_xy_addr_y_dly4, rd_wide_xy_addr_y_dly3} <= {rd_wide_xy_addr_y_dly3, rd_wide_xy_addr_y_dly2};
        //
        {rd_narrow_xy_addr_x_dly2, rd_narrow_xy_addr_x_dly1} <= {rd_narrow_xy_addr_x_dly1, rd_narrow_xy_addr_x};
        {rd_narrow_xy_addr_y_dly2, rd_narrow_xy_addr_y_dly1} <= {rd_narrow_xy_addr_y_dly1, rd_narrow_xy_addr_y};
        //
        {rd_narrow_xy_addr_x_dly4, rd_narrow_xy_addr_x_dly3} <= {rd_narrow_xy_addr_x_dly3, rd_narrow_xy_addr_x_dly2};
        {rd_narrow_xy_addr_y_dly4, rd_narrow_xy_addr_y_dly3} <= {rd_narrow_xy_addr_y_dly3, rd_narrow_xy_addr_y_dly2};
        // data pipelines (x word only), stages 1..3
        {wrk_rd_wide_x_din_x_dly3, wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1} <= {wrk_rd_wide_x_din_x_dly2, wrk_rd_wide_x_din_x_dly1, wrk_rd_wide_x_din_x};
        {wrk_rd_wide_x_din_y_dly3, wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1} <= {wrk_rd_wide_x_din_y_dly2, wrk_rd_wide_x_din_y_dly1, wrk_rd_wide_x_din_y};
        //
        {wrk_rd_narrow_x_din_x_dly3, wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1} <= {wrk_rd_narrow_x_din_x_dly2, wrk_rd_narrow_x_din_x_dly1, wrk_rd_narrow_x_din_x};
        {wrk_rd_narrow_x_din_y_dly3, wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1} <= {wrk_rd_narrow_x_din_y_dly2, wrk_rd_narrow_x_din_y_dly1, wrk_rd_narrow_x_din_y};
        //
    end

    //
    // Read Enable Logic
    //
    // The helper tasks always drive the X and Y enables as a pair, using
    // non-blocking assignments, so a later enable_* call in the same clocked
    // block overrides an earlier disable_* call.
    //
    task _update_wide_xy_rd_en;
        input _en;
        {rd_wide_xy_ena_x, rd_wide_xy_ena_y    } <= {2{_en}};
    endtask

    task _update_narrow_xy_rd_en;
        input _en;
        {rd_narrow_xy_ena_x, rd_narrow_xy_ena_y} <= {2{_en}};
    endtask

    task enable_wide_xy_rd_en;
        _update_wide_xy_rd_en(1'b1);
    endtask

    task disable_wide_xy_rd_en;
        _update_wide_xy_rd_en(1'b0);
    endtask

    task enable_narrow_xy_rd_en;
        _update_narrow_xy_rd_en(1'b1);
    endtask

    task disable_narrow_xy_rd_en;
        _update_narrow_xy_rd_en(1'b0);
    endtask

    // Read enables are decoded from the *next* FSM state, so they are
    // registered high during the PRE1, PRE2 and BUSY stages themselves.
    always @(posedge clk)
        //
        if (rst) begin
            //
            disable_wide_xy_rd_en;
            disable_narrow_xy_rd_en;
            //
        end else begin
            // default: everything off, overridden below when a read is due
            disable_wide_xy_rd_en;
            disable_narrow_xy_rd_en;
            //
            case (opcode)
                // these opcodes only read the narrow banks
                UOP_OPCODE_PROPAGATE_CARRIES,
                UOP_OPCODE_OUTPUT_FROM_NARROW,
                UOP_OPCODE_MODULAR_REDUCE_INIT:
                    //
                    case (wrk_fsm_state_next_one_pass)
                        //
                        WRK_FSM_STATE_LATENCY_PRE1,
                        WRK_FSM_STATE_LATENCY_PRE2,
                        WRK_FSM_STATE_BUSY:
                            //
                            enable_narrow_xy_rd_en;
                        //
                    endcase
                // both wide and narrow banks are read
                UOP_OPCODE_COPY_CRT_Y2X:
                    //
                    case (wrk_fsm_state_next_one_pass)
                        //
                        WRK_FSM_STATE_LATENCY_PRE1,
                        WRK_FSM_STATE_LATENCY_PRE2,
                        WRK_FSM_STATE_BUSY: begin
                            //
                            enable_wide_xy_rd_en;
                            enable_narrow_xy_rd_en;
                            //
                        end
                        //
                    endcase
                // meander sweep: reads happen in both the M1 and M2 phases
                UOP_OPCODE_COPY_LADDERS_X2Y:
                    //
                    case (wrk_fsm_state_next_one_pass_meander)
                        //
                        WRK_FSM_STATE_LATENCY_PRE1_M1,
                        WRK_FSM_STATE_LATENCY_PRE1_M2,
                        WRK_FSM_STATE_LATENCY_PRE2_M1,
                        WRK_FSM_STATE_LATENCY_PRE2_M2,
                        WRK_FSM_STATE_BUSY_M1,
                        WRK_FSM_STATE_BUSY_M2: begin
                            //
                            enable_wide_xy_rd_en;
                            enable_narrow_xy_rd_en;
                            //
                        end
                        //
                    endcase
                //
            endcase
            //
        end

    //
    // Write Enable Logic
    //
    // Same task scheme as for the read enables: X and Y are driven as a pair.
    //
    task _update_wide_xy_wr_en;
        input _en;
        {wr_wide_xy_ena_x, wr_wide_xy_ena_y    } <= {2{_en}};
    endtask

    task _update_narrow_xy_wr_en;
        input _en;
        {wr_narrow_xy_ena_x, wr_narrow_xy_ena_y} <= {2{_en}};
    endtask

    task enable_wide_xy_wr_en;
        _update_wide_xy_wr_en(1'b1);
    endtask

    task disable_wide_xy_wr_en;
        _update_wide_xy_wr_en(1'b0);
    endtask

    task enable_narrow_xy_wr_en;
        _update_narrow_xy_wr_en(1'b1);
    endtask

    task disable_narrow_xy_wr_en;
        _update_narrow_xy_wr_en(1'b0);
    endtask

    // Write enables are decoded from the *current* FSM state and are active
    // for BUSY, POST1 and POST2 (only the *_M2 phases in the meander case).
    // UOP_OPCODE_OUTPUT_FROM_NARROW has no entry here, i.e. it never writes.
    always @(posedge clk)
        //
        if (rst) begin
            //
            disable_wide_xy_wr_en;
            disable_narrow_xy_wr_en;
            //
        end else begin
            // default: everything off, overridden below when a write is due
            disable_wide_xy_wr_en;
            disable_narrow_xy_wr_en;
            //
            case (opcode)
                // result goes back to the narrow banks only
                UOP_OPCODE_PROPAGATE_CARRIES:
                    //
                    case (wrk_fsm_state)
                        //
                        WRK_FSM_STATE_BUSY,
                        WRK_FSM_STATE_LATENCY_POST1,
                        WRK_FSM_STATE_LATENCY_POST2:
                            //
                            enable_narrow_xy_wr_en;
                        //
                    endcase
                //
                UOP_OPCODE_COPY_CRT_Y2X:
                    //
                    case (wrk_fsm_state)
                        //
                        WRK_FSM_STATE_BUSY,
                        WRK_FSM_STATE_LATENCY_POST1,
                        WRK_FSM_STATE_LATENCY_POST2: begin
                            //
                            enable_wide_xy_wr_en;
                            enable_narrow_xy_wr_en;
                            //
                        end
                        //
                    endcase
                // result goes to the wide banks only
                UOP_OPCODE_MODULAR_REDUCE_INIT:
                    //
                    case (wrk_fsm_state)
                        //
                        WRK_FSM_STATE_BUSY,
                        WRK_FSM_STATE_LATENCY_POST1,
                        WRK_FSM_STATE_LATENCY_POST2:
                            //
                            enable_wide_xy_wr_en;
                        //
                    endcase
                // one write per M1/M2 pair, issued in the M2 phase
                UOP_OPCODE_COPY_LADDERS_X2Y:
                    //
                    case (wrk_fsm_state)
                        //
                        WRK_FSM_STATE_BUSY_M2,
                        WRK_FSM_STATE_LATENCY_POST1_M2,
                        WRK_FSM_STATE_LATENCY_POST2_M2: begin
                            //
                            enable_wide_xy_wr_en;
                            enable_narrow_xy_wr_en;
                            //
                        end
                        //
                    endcase
                //
            endcase
            //
        end

    //
    // Data Logic
    //
    // Carry propagation helpers: *_w_cry is the incoming narrow word plus the
    // carry register (zero-extended by WORD_W bits); *_w_cry_reduced keeps the
    // low WORD_W bits and zeroes the upper CARRY_W bits.
    // NOTE(review): the concatenations imply WORD_EXT_W == WORD_W + CARRY_W --
    // confirm in modexpng_parameters.vh.
    //
    reg [CARRY_W -1:0] rd_narrow_x_din_x_cry_r;
    reg [CARRY_W -1:0] rd_narrow_y_din_x_cry_r;
    reg [CARRY_W -1:0] rd_narrow_x_din_y_cry_r;
    reg [CARRY_W -1:0] rd_narrow_y_din_y_cry_r;

    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry = wrk_rd_narrow_x_din_x + {{WORD_W{1'b0}}, rd_narrow_x_din_x_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry = wrk_rd_narrow_y_din_x + {{WORD_W{1'b0}}, rd_narrow_y_din_x_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry = wrk_rd_narrow_x_din_y + {{WORD_W{1'b0}}, rd_narrow_x_din_y_cry_r};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry = wrk_rd_narrow_y_din_y + {{WORD_W{1'b0}}, rd_narrow_y_din_y_cry_r};

    wire [WORD_EXT_W -1:0] rd_narrow_x_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_x_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_x_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_x_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_x_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_x_din_y_w_cry[WORD_W -1:0]};
    wire [WORD_EXT_W -1:0] rd_narrow_y_din_y_w_cry_reduced = {{CARRY_W{1'b0}}, rd_narrow_y_din_y_w_cry[WORD_W -1:0]};

    always @(posedge clk) begin
        // default: don't-care on every write data bus unless overridden below
        wr_wide_x_dout_x   <= WORD_EXT_DNC;
        wr_wide_y_dout_x   <= WORD_EXT_DNC;
        wr_wide_x_dout_y   <= WORD_EXT_DNC;
        wr_wide_y_dout_y   <= WORD_EXT_DNC;
        wr_narrow_x_dout_x <= WORD_EXT_DNC;
        wr_narrow_y_dout_x <= WORD_EXT_DNC;
        wr_narrow_x_dout_y <= WORD_EXT_DNC;
        wr_narrow_y_dout_y <= WORD_EXT_DNC;
        //
        case (opcode)
            //
            UOP_OPCODE_PROPAGATE_CARRIES:
                //
                case (wrk_fsm_state)
                    // clear all four carries right before the first word is processed
                    WRK_FSM_STATE_LATENCY_PRE2: begin
                        rd_narrow_x_din_x_cry_r <= CARRY_ZERO;
                        rd_narrow_y_din_x_cry_r <= CARRY_ZERO;
                        rd_narrow_x_din_y_cry_r <= CARRY_ZERO;
                        rd_narrow_y_din_y_cry_r <= CARRY_ZERO;
                    end
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin // TODO: post2 doesn't need update of carry, since that's the last word
                        // bits above WORD_W become the carry into the next word
                        rd_narrow_x_din_x_cry_r <= rd_narrow_x_din_x_w_cry[WORD_EXT_W -1:WORD_W];
                        rd_narrow_y_din_x_cry_r <= rd_narrow_y_din_x_w_cry[WORD_EXT_W -1:WORD_W];
                        rd_narrow_x_din_y_cry_r <= rd_narrow_x_din_y_w_cry[WORD_EXT_W -1:WORD_W];
                        rd_narrow_y_din_y_cry_r <= rd_narrow_y_din_y_w_cry[WORD_EXT_W -1:WORD_W];
                        // the low WORD_W bits are written back
                        wr_narrow_x_dout_x <= rd_narrow_x_din_x_w_cry_reduced;
                        wr_narrow_y_dout_x <= rd_narrow_y_din_x_w_cry_reduced;
                        wr_narrow_x_dout_y <= rd_narrow_x_din_y_w_cry_reduced;
                        wr_narrow_y_dout_y <= rd_narrow_y_din_y_w_cry_reduced;
                        //
                    end
                    //
                endcase
            //
            UOP_OPCODE_COPY_CRT_Y2X:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        // every destination (X and Y copy alike) takes the *_din_y read data
                        wr_wide_x_dout_x <= wrk_rd_wide_x_din_y;
                        wr_wide_y_dout_x <= wrk_rd_wide_y_din_y;
                        wr_wide_x_dout_y <= wrk_rd_wide_x_din_y;
                        wr_wide_y_dout_y <= wrk_rd_wide_y_din_y;
                        //
                        wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_y;
                        wr_narrow_y_dout_x <= wrk_rd_narrow_y_din_y;
                        wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y;
                        wr_narrow_y_dout_y <= wrk_rd_narrow_y_din_y;
                        //
                    end
                    //
                endcase
            //
            UOP_OPCODE_COPY_LADDERS_X2Y:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY_M2,
                    WRK_FSM_STATE_LATENCY_POST1_M2,
                    WRK_FSM_STATE_LATENCY_POST2_M2: begin
                        // x outputs take the older x read word (_dly3), y outputs the newer one (_dly2)
                        wr_wide_x_dout_x <= wrk_rd_wide_x_din_x_dly3;
                        wr_wide_y_dout_x <= wrk_rd_wide_x_din_x_dly2;
                        wr_wide_x_dout_y <= wrk_rd_wide_x_din_y_dly3;
                        wr_wide_y_dout_y <= wrk_rd_wide_x_din_y_dly2;
                        //
                        wr_narrow_x_dout_x <= wrk_rd_narrow_x_din_x_dly3;
                        wr_narrow_y_dout_x <= wrk_rd_narrow_x_din_x_dly2;
                        wr_narrow_x_dout_y <= wrk_rd_narrow_x_din_y_dly3;
                        wr_narrow_y_dout_y <= wrk_rd_narrow_x_din_y_dly2;
                        //
                    end
                    //
                endcase
            //
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        // narrow read data is forwarded unchanged to the wide write ports
                        wr_wide_x_dout_x <= wrk_rd_narrow_x_din_x;
                        wr_wide_y_dout_x <= wrk_rd_narrow_y_din_x;
                        wr_wide_x_dout_y <= wrk_rd_narrow_x_din_y;
                        wr_wide_y_dout_y <= wrk_rd_narrow_y_din_y;
                        //
                    end
                    //
                endcase
            //
        endcase
        //
    end

    //
    // Write Address Logic
    //
    // For MODULAR_REDUCE_INIT the destination wide bank depends on how far
    // the (2-clock delayed) narrow read address has advanced: addresses up to
    // and including word_index_last_half select BANK_WIDE_L, higher addresses
    // select BANK_WIDE_H.
    //
    wire uop_modular_reduce_init_feed_lsb_x = rd_narrow_xy_addr_x_dly2 <= word_index_last_half;
    wire uop_modular_reduce_init_feed_lsb_y = rd_narrow_xy_addr_y_dly2 <= word_index_last_half;

    always @(posedge clk) begin
        // default: don't-care bank/address unless overridden below
        {wr_wide_xy_bank_x,   wr_wide_xy_addr_x  } <= {BANK_DNC, OP_ADDR_DNC};
        {wr_wide_xy_bank_y,   wr_wide_xy_addr_y  } <= {BANK_DNC, OP_ADDR_DNC};
        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC};
        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC};
        //
        case (opcode)
            // write address = narrow read address delayed by two clocks
            // (used for the wide ports as well)
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_COPY_CRT_Y2X:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        //
                        {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly2};
                        {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly2};
                        //
                        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly2};
                        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly2};
                        //
                    end
                    //
                endcase
            // wide write address = wide read address delayed by two clocks,
            // bank chosen by the feed_lsb comparison above
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY,
                    WRK_FSM_STATE_LATENCY_POST1,
                    WRK_FSM_STATE_LATENCY_POST2: begin
                        //
                        wr_wide_xy_bank_x <= uop_modular_reduce_init_feed_lsb_x ? BANK_WIDE_L : BANK_WIDE_H;
                        wr_wide_xy_bank_y <= uop_modular_reduce_init_feed_lsb_y ? BANK_WIDE_L : BANK_WIDE_H;
                        //
                        wr_wide_xy_addr_x <= rd_wide_xy_addr_x_dly2;
                        wr_wide_xy_addr_y <= rd_wide_xy_addr_y_dly2;
                        //
                    end
                    //
                endcase
            // meander: write address = narrow read address delayed by four clocks
            UOP_OPCODE_COPY_LADDERS_X2Y:
                //
                case (wrk_fsm_state)
                    //
                    WRK_FSM_STATE_BUSY_M2,
                    WRK_FSM_STATE_LATENCY_POST1_M2,
                    WRK_FSM_STATE_LATENCY_POST2_M2: begin
                        //
                        {wr_wide_xy_bank_x, wr_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_dly4};
                        {wr_wide_xy_bank_y, wr_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_dly4};
                        //
                        {wr_narrow_xy_bank_x, wr_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_dly4};
                        {wr_narrow_xy_bank_y, wr_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_dly4};
                        //
                    end
                    //
                endcase
            //
        endcase
        //
    end

    //
    // Read Address Logic
    //
    // *_next hold the address to be issued on the following read. The wide
    // pointers wrap to zero after word_index_last_half; the narrow pointers
    // simply increment and are compared against word_index_last to detect
    // the end of the sweep (see "Busy Exit Logic").
    //
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_x_next;
    reg [OP_ADDR_W -1:0] rd_wide_xy_addr_y_next;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_x_next;
    reg [OP_ADDR_W -1:0] rd_narrow_xy_addr_y_next;

    wire rd_wide_xy_addr_x_next_is_last   = rd_wide_xy_addr_x_next   == word_index_last_half;
    wire rd_wide_xy_addr_y_next_is_last   = rd_wide_xy_addr_y_next   == word_index_last_half;
    wire rd_narrow_xy_addr_x_next_is_last = rd_narrow_xy_addr_x_next == word_index_last;
    wire rd_narrow_xy_addr_y_next_is_last = rd_narrow_xy_addr_y_next == word_index_last;

    always @(posedge clk) begin // TODO: Maybe split into two blocks (read address / next address)??
        // default: don't-care bank/address unless overridden below
        {rd_wide_xy_bank_x,   rd_wide_xy_addr_x  } <= {BANK_DNC, OP_ADDR_DNC}; // TODO: Add same default path for io_manager ??
        {rd_wide_xy_bank_y,   rd_wide_xy_addr_y  } <= {BANK_DNC, OP_ADDR_DNC};
        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {BANK_DNC, OP_ADDR_DNC};
        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {BANK_DNC, OP_ADDR_DNC};
        //
        case (opcode)
            // one-pass sweep in which the wide read address follows the
            // narrow pointer (rd_narrow_xy_addr_*_next)
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_OUTPUT_FROM_NARROW,
            UOP_OPCODE_COPY_CRT_Y2X:
                //
                case (wrk_fsm_state_next_one_pass)
                    // first read: address zero, pointers primed with one
                    WRK_FSM_STATE_LATENCY_PRE1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO};
                        //
                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                    end
                    //
                    WRK_FSM_STATE_LATENCY_PRE2,
                    WRK_FSM_STATE_BUSY: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_narrow_xy_addr_x_next};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_narrow_xy_addr_y_next};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next};
                        //
                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO;
                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
                        //
                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
                        //
                    end
                    //
                endcase
            // one-pass sweep in which the wide read address uses its own
            // wrapping pointer (rd_wide_xy_addr_*_next)
            UOP_OPCODE_MODULAR_REDUCE_INIT:
                //
                case (wrk_fsm_state_next_one_pass)
                    //
                    WRK_FSM_STATE_LATENCY_PRE1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, OP_ADDR_ZERO};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, OP_ADDR_ZERO};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, OP_ADDR_ZERO};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, OP_ADDR_ZERO};
                        //
                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                    end
                    //
                    WRK_FSM_STATE_LATENCY_PRE2,
                    WRK_FSM_STATE_BUSY: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x_next};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y_next};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x_next};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y_next};
                        //
                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO;
                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
                        //
                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
                        //
                    end
                    //
                endcase
            // meander sweep: the M1 phase issues a new address to the
            // sel_*_out banks, the M2 phase repeats that same address on the
            // sel_*_in banks; pointers advance in the M1 phase only
            UOP_OPCODE_COPY_LADDERS_X2Y:
                //
                case (wrk_fsm_state_next_one_pass_meander)
                    //
                    WRK_FSM_STATE_LATENCY_PRE1_M1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, OP_ADDR_ZERO};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, OP_ADDR_ZERO};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, OP_ADDR_ZERO};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, OP_ADDR_ZERO};
                        //
                        rd_wide_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_wide_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                        rd_narrow_xy_addr_x_next <= OP_ADDR_ONE;
                        rd_narrow_xy_addr_y_next <= OP_ADDR_ONE;
                        //
                    end
                    // hold the address, switch bank
                    WRK_FSM_STATE_LATENCY_PRE1_M2: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y};
                        //
                    end
                    //
                    WRK_FSM_STATE_LATENCY_PRE2_M1,
                    WRK_FSM_STATE_BUSY_M1: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_out, rd_narrow_xy_addr_x_next};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_out, rd_narrow_xy_addr_y_next};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_out, rd_narrow_xy_addr_x_next};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_out, rd_narrow_xy_addr_y_next};
                        //
                        rd_wide_xy_addr_x_next <= !rd_wide_xy_addr_x_next_is_last ? rd_wide_xy_addr_x_next + 1'b1: OP_ADDR_ZERO;
                        rd_wide_xy_addr_y_next <= !rd_wide_xy_addr_y_next_is_last ? rd_wide_xy_addr_y_next + 1'b1: OP_ADDR_ZERO;
                        //
                        rd_narrow_xy_addr_x_next <= rd_narrow_xy_addr_x_next + 1'b1;
                        rd_narrow_xy_addr_y_next <= rd_narrow_xy_addr_y_next + 1'b1;
                        //
                    end
                    // hold the address, switch bank
                    WRK_FSM_STATE_LATENCY_PRE2_M2,
                    WRK_FSM_STATE_BUSY_M2: begin
                        //
                        {rd_wide_xy_bank_x, rd_wide_xy_addr_x} <= {sel_wide_in, rd_wide_xy_addr_x};
                        {rd_wide_xy_bank_y, rd_wide_xy_addr_y} <= {sel_wide_in, rd_wide_xy_addr_y};
                        //
                        {rd_narrow_xy_bank_x, rd_narrow_xy_addr_x} <= {sel_narrow_in, rd_narrow_xy_addr_x};
                        {rd_narrow_xy_bank_y, rd_narrow_xy_addr_y} <= {sel_narrow_in, rd_narrow_xy_addr_y};
                        //
                    end
                    //
                endcase
            //
        endcase
        //
    end

    //
    // FSM Process
    //
    // The opcode selects which of the two next-state candidates is taken;
    // any opcode not listed here forces the FSM back to IDLE.
    //
    always @(posedge clk)
        //
        if (rst) wrk_fsm_state <= WRK_FSM_STATE_IDLE;
        else case (opcode)
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_OUTPUT_FROM_NARROW,
            UOP_OPCODE_COPY_CRT_Y2X,
            UOP_OPCODE_MODULAR_REDUCE_INIT: wrk_fsm_state <= wrk_fsm_state_next_one_pass;
            UOP_OPCODE_COPY_LADDERS_X2Y:    wrk_fsm_state <= wrk_fsm_state_next_one_pass_meander;
            default:                        wrk_fsm_state <= WRK_FSM_STATE_IDLE;
        endcase

    //
    // Busy Exit Logic
    //
    // The done flags are registered, so they become visible to the transition
    // logic one clock after the narrow "next" pointer has reached
    // word_index_last. Neither flag is touched by `rst`; both rely on their
    // initial value and on the per-clock default of zero.
    //
    reg wrk_fsm_done_one_pass = 1'b0;
    reg wrk_fsm_done_one_pass_meander = 1'b0;

    always @(posedge clk) begin
        //
        wrk_fsm_done_one_pass <= 1'b0;
        wrk_fsm_done_one_pass_meander <= 1'b0;
        //
        case (opcode)
            //
            UOP_OPCODE_PROPAGATE_CARRIES,
            UOP_OPCODE_OUTPUT_FROM_NARROW,
            UOP_OPCODE_COPY_CRT_Y2X,
            UOP_OPCODE_MODULAR_REDUCE_INIT: begin
                //
                if (wrk_fsm_state == WRK_FSM_STATE_BUSY) begin
                    //
                    if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass <= 1'b1; // TODO: Check, whether both are necessary...
                    if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass <= 1'b1;
                    //
                end
                //
            end
            //
            UOP_OPCODE_COPY_LADDERS_X2Y: begin
                //
                if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M2) begin
                    //
                    if (rd_narrow_xy_addr_x_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1; // TODO: Check, whether both are necessary...
                    if (rd_narrow_xy_addr_y_next_is_last) wrk_fsm_done_one_pass_meander <= 1'b1;
                    //
                end
                // The flag is set in BUSY_M2 but only examined in the *next*
                // BUSY_M2, so it has to be held across the BUSY_M1 phase in
                // between (this overrides the default clear above).
                if (wrk_fsm_state == WRK_FSM_STATE_BUSY_M1)
                    wrk_fsm_done_one_pass_meander <= wrk_fsm_done_one_pass_meander;
                //
            end
            //
        endcase
        //
    end

    //
    // FSM Transition Logic
    //
    // One-pass sequence:
    //   IDLE -> PRE1 -> PRE2 -> BUSY (until done) -> POST1 -> POST2 -> STOP -> IDLE
    //
    always @* begin
        //
        case (wrk_fsm_state)
            WRK_FSM_STATE_IDLE:          wrk_fsm_state_next_one_pass = ena                   ? WRK_FSM_STATE_LATENCY_PRE1  : WRK_FSM_STATE_IDLE ;
            WRK_FSM_STATE_LATENCY_PRE1:  wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_LATENCY_PRE2                       ;
            WRK_FSM_STATE_LATENCY_PRE2:  wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_BUSY                               ;
            WRK_FSM_STATE_BUSY:          wrk_fsm_state_next_one_pass = wrk_fsm_done_one_pass ? WRK_FSM_STATE_LATENCY_POST1 : WRK_FSM_STATE_BUSY ;
            WRK_FSM_STATE_LATENCY_POST1: wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_LATENCY_POST2                      ;
            WRK_FSM_STATE_LATENCY_POST2: wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_STOP                               ;
            WRK_FSM_STATE_STOP:          wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_IDLE                               ;
            default:                     wrk_fsm_state_next_one_pass =                         WRK_FSM_STATE_IDLE                               ;
        endcase
        //
    end

    //
    // Meander sequence: same stages as above, each split into M1 then M2.
    // The BUSY_M1/BUSY_M2 pair repeats until the done flag is seen in BUSY_M2.
    //
    always @* begin
        //
        case (wrk_fsm_state)
            WRK_FSM_STATE_IDLE:             wrk_fsm_state_next_one_pass_meander = ena                           ? WRK_FSM_STATE_LATENCY_PRE1_M1  : WRK_FSM_STATE_IDLE    ;
            //
            WRK_FSM_STATE_LATENCY_PRE1_M1:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_PRE1_M2                          ;
            WRK_FSM_STATE_LATENCY_PRE1_M2:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_PRE2_M1                          ;
            WRK_FSM_STATE_LATENCY_PRE2_M1:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_PRE2_M2                          ;
            WRK_FSM_STATE_LATENCY_PRE2_M2:  wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_BUSY_M1                                  ;
            WRK_FSM_STATE_BUSY_M1:          wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_BUSY_M2                                  ;
            WRK_FSM_STATE_BUSY_M2:          wrk_fsm_state_next_one_pass_meander = wrk_fsm_done_one_pass_meander ? WRK_FSM_STATE_LATENCY_POST1_M1 : WRK_FSM_STATE_BUSY_M1 ;
            WRK_FSM_STATE_LATENCY_POST1_M1: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_POST1_M2                         ;
            WRK_FSM_STATE_LATENCY_POST1_M2: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_POST2_M1                         ;
            WRK_FSM_STATE_LATENCY_POST2_M1: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_LATENCY_POST2_M2                         ;
            WRK_FSM_STATE_LATENCY_POST2_M2: wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_STOP                                     ;
            //
            WRK_FSM_STATE_STOP:             wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_IDLE                                     ;
            //
            default:                        wrk_fsm_state_next_one_pass_meander =                                 WRK_FSM_STATE_IDLE                                     ;
        endcase
        //
    end

    //
    // Ready Logic
    //
    // rdy drops on the clock edge that samples `ena` in IDLE and is raised
    // again from STOP; in every other state it keeps its value.
    //
    reg rdy_reg = 1'b1;

    assign rdy = rdy_reg;

    always @(posedge clk)
        //
        if (rst) rdy_reg <= 1'b1;
        else case (wrk_fsm_state)
            WRK_FSM_STATE_IDLE: rdy_reg <= ~ena;
            WRK_FSM_STATE_STOP: rdy_reg <= 1'b1;
        endcase

endmodule