From 71b75290bf2ade9a4022bad93dc80bfb77f87f40 Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" <meisterpaul1@yandex.ru> Date: Mon, 10 Jul 2017 15:31:25 +0300 Subject: * made separate file for low-level settings * turned crazy triple multiplier array into one array with input mux --- src/rtl/modexpa7_settings.v | 6 + src/rtl/modexpa7_systolic_multiplier.v | 1298 ++++++++++++++++--------------- src/rtl/pe/modexpa7_adder32.v | 2 +- src/rtl/pe/modexpa7_lowlevel_settings.v | 15 - src/rtl/pe/modexpa7_primitive_switch.v | 16 + src/rtl/pe/modexpa7_subtractor32.v | 2 +- src/rtl/pe/modexpa7_systolic_pe.v | 2 +- 7 files changed, 683 insertions(+), 658 deletions(-) create mode 100644 src/rtl/modexpa7_settings.v delete mode 100644 src/rtl/pe/modexpa7_lowlevel_settings.v create mode 100644 src/rtl/pe/modexpa7_primitive_switch.v (limited to 'src/rtl') diff --git a/src/rtl/modexpa7_settings.v b/src/rtl/modexpa7_settings.v new file mode 100644 index 0000000..0ec6978 --- /dev/null +++ b/src/rtl/modexpa7_settings.v @@ -0,0 +1,6 @@ +localparam SYSTOLIC_PE_LATENCY = 4; + +localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER; +localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER; +localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH; + diff --git a/src/rtl/modexpa7_systolic_multiplier.v b/src/rtl/modexpa7_systolic_multiplier.v index cb1c716..56e7be3 100644 --- a/src/rtl/modexpa7_systolic_multiplier.v +++ b/src/rtl/modexpa7_systolic_multiplier.v @@ -40,16 +40,16 @@ module modexpa7_systolic_multiplier # ( // // This sets the address widths of memory buffers. Internal data - // width is 32 bits, so for e.g. 1024-bit operands buffers must store - // 1024 / 32 = 32 words, and these need 5-bit address bus, because - // 2 ** 5 = 32. + // width is 32 bits, so for e.g. 2048-bit operands buffers must store + // 2048 / 32 = 64 words, and these need 6-bit address bus, because + // 2 ** 6 = 64. 
// - parameter OPERAND_ADDR_WIDTH = 5, + parameter OPERAND_ADDR_WIDTH = 4, // - // This sets the width of the systolic cycle counter. TODO: Explain. + // Explain. // - parameter SYSTOLIC_ARRAY_POWER = 3 + parameter SYSTOLIC_ARRAY_POWER = 2 ) ( input clk, @@ -72,801 +72,819 @@ module modexpa7_systolic_multiplier # output [ 32-1:0] r_bram_in, output r_bram_wr, - input [OPERAND_ADDR_WIDTH-1:0] n_num_words + input [OPERAND_ADDR_WIDTH-1:0] ab_num_words ); - + // - // Constants + // Include Settings // - localparam SYSTOLIC_CNTR_WIDTH = OPERAND_ADDR_WIDTH - SYSTOLIC_ARRAY_POWER; - localparam SYSTOLIC_ARRAY_LENGTH = 2 ** SYSTOLIC_ARRAY_POWER; - localparam SYSTOLIC_NUM_CYCLES = 2 ** SYSTOLIC_CNTR_WIDTH; - - localparam SYSTOLIC_PE_LATENCY = 4; - + `include "pe/modexpa7_primitive_switch.v" + `include "modexpa7_settings.v" + // // FSM Declaration // - localparam [ 3: 0] FSM_STATE_IDLE = 4'd0; - localparam [ 3: 0] FSM_STATE_INIT_ZERO_ADDR = 4'd1; - localparam [ 3: 0] FSM_STATE_INIT_NEXT_ADDR = 4'd2; - localparam [ 3: 0] FSM_STATE_INIT_LAST_ADDR = 4'd3; - localparam [ 3: 0] FSM_STATE_PIPE_CRUNCH = 4'd4; - localparam [ 3: 0] FSM_STATE_PIPE_RELOAD = 4'd5; - localparam [ 3: 0] FSM_STATE_SAVE_ZERO_ADDR = 4'd6; - localparam [ 3: 0] FSM_STATE_SAVE_NEXT_ADDR = 4'd7; - localparam [ 3: 0] FSM_STATE_SAVE_LAST_ADDR = 4'd8; - localparam [ 3: 0] FSM_STATE_STOP = 4'd9; - - reg [ 3: 0] fsm_state = FSM_STATE_IDLE; - reg [ 3: 0] fsm_next_state; + localparam [ 7: 0] FSM_STATE_IDLE = 8'h00; - - // - // Enable Delay (Trigger) - // - reg ena_dly = 1'b0; - wire ena_trig = ena && !ena_dly; - always @(posedge clk) ena_dly <= ena; + localparam [ 7: 0] FSM_STATE_LOAD_B_START = 8'h11; + localparam [ 7: 0] FSM_STATE_LOAD_B_SHIFT = 8'h12; + localparam [ 7: 0] FSM_STATE_LOAD_B_WRITE = 8'h13; + localparam [ 7: 0] FSM_STATE_LOAD_B_FINAL = 8'h14; - - // - // Parameters Latch - // - reg [OPERAND_ADDR_WIDTH-1:0] n_num_words_latch; + localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_START = 8'h21; + localparam [ 7: 0] 
FSM_STATE_LOAD_N_COEFF_SHIFT = 8'h22; + localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_WRITE = 8'h23; + localparam [ 7: 0] FSM_STATE_LOAD_N_COEFF_FINAL = 8'h24; - always @(posedge clk) - // - if (fsm_next_state == FSM_STATE_INIT_ZERO_ADDR) - n_num_words_latch <= n_num_words; + localparam [ 7: 0] FSM_STATE_LOAD_N_START = 8'h31; + localparam [ 7: 0] FSM_STATE_LOAD_N_SHIFT = 8'h32; + localparam [ 7: 0] FSM_STATE_LOAD_N_WRITE = 8'h33; + localparam [ 7: 0] FSM_STATE_LOAD_N_FINAL = 8'h34; + localparam [ 7: 0] FSM_STATE_MULT_A_B_START = 8'h41; + localparam [ 7: 0] FSM_STATE_MULT_A_B_CRUNCH = 8'h42; + localparam [ 7: 0] FSM_STATE_MULT_A_B_RELOAD = 8'h43; + localparam [ 7: 0] FSM_STATE_MULT_A_B_FINAL = 8'h44; - // - // Addresses - // - localparam [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = {OPERAND_ADDR_WIDTH{1'b0}}; - wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = n_num_words_latch; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_START = 8'h51; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_CRUNCH = 8'h52; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_RELOAD = 8'h53; + localparam [ 7: 0] FSM_STATE_MULT_AB_N_COEFF_FINAL = 8'h54; + + localparam [ 7: 0] FSM_STATE_MULT_Q_N_START = 8'h61; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_CRUNCH = 8'h62; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_RELOAD = 8'h63; + localparam [ 7: 0] FSM_STATE_MULT_Q_N_FINAL = 8'h64; + localparam [ 7: 0] FSM_STATE_STOP = 8'hFF; // - // BRAM Addresses + // FSM State / Next State // - reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_reg; - reg [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_reg; - reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_reg; - reg [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_reg; - reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_reg; - reg [OPERAND_ADDR_WIDTH-1:0] r_bram_addr_reg; + reg [ 7: 0] fsm_state = FSM_STATE_IDLE; + reg [ 7: 0] fsm_next_state; - wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr = s_bram_addr_reg; - - reg [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_dly; - reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_dly; - reg 
[OPERAND_ADDR_WIDTH-1:0] n_bram_addr_dly; - reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_dly; - - wire [OPERAND_ADDR_WIDTH-1:0] b_bram_addr_next = b_bram_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] a_bram_addr_next = a_bram_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_bram_addr_next = n_coeff_bram_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] n_bram_addr_next = n_bram_addr + 1'b1; - wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_next = s_bram_addr + 1'b1; + + // + // Enable Delay and Trigger + // + reg ena_dly = 1'b0; - wire b_bram_addr_done = - (b_bram_addr == bram_addr_last) ? 1'b1 : 1'b0; + /* delay enable by one clock cycle */ + always @(posedge clk) ena_dly <= ena; - wire s_bram_addr_done = - (s_bram_addr == bram_addr_last) ? 1'b1 : 1'b0; + /* trigger new operation when enable goes high */ + wire ena_trig = ena && !ena_dly; + - assign b_bram_addr = b_bram_addr_reg; - assign a_bram_addr = a_bram_addr_reg; - assign n_coeff_bram_addr = n_coeff_bram_addr_reg; - assign n_bram_addr = n_bram_addr_reg; - assign r_bram_addr = r_bram_addr_reg; + // + // Ready Flag Logic + // + reg rdy_reg = 1'b1; + assign rdy = rdy_reg; - always @(posedge clk) b_bram_addr_dly <= b_bram_addr; - always @(posedge clk) n_coeff_bram_addr_dly <= n_coeff_bram_addr; - always @(posedge clk) n_bram_addr_dly <= n_bram_addr; - always @(posedge clk) s_bram_addr_dly <= s_bram_addr; + always @(posedge clk or negedge rst_n) + + /* reset flag */ + if (rst_n == 1'b0) rdy_reg <= 1'b1; + else begin + + /* clear flag when operation is started */ + if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; + + /* set flag after operation is finished */ + if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; + + end + - always @(posedge clk) // - case (fsm_next_state) - FSM_STATE_INIT_ZERO_ADDR: b_bram_addr_reg <= bram_addr_zero; - FSM_STATE_INIT_NEXT_ADDR: b_bram_addr_reg <= b_bram_addr_next; - endcase - - always @(posedge clk) - case (fsm_next_state) - FSM_STATE_SAVE_ZERO_ADDR: s_bram_addr_reg <= bram_addr_zero; - 
FSM_STATE_SAVE_NEXT_ADDR: s_bram_addr_reg <= s_bram_addr_next; - endcase - - always @(posedge clk) + // Parameters Latch // - case (fsm_next_state) - FSM_STATE_INIT_LAST_ADDR: a_bram_addr_reg <= bram_addr_zero; - FSM_STATE_PIPE_RELOAD: a_bram_addr_reg <= (a_bram_addr < bram_addr_last) ? a_bram_addr_next : a_bram_addr; - endcase + reg [OPERAND_ADDR_WIDTH-1:0] ab_num_words_latch; + /* save number of words in a and b when new operation starts */ always @(posedge clk) // - case (fsm_next_state) - FSM_STATE_INIT_ZERO_ADDR: n_coeff_bram_addr_reg <= bram_addr_zero; - FSM_STATE_INIT_NEXT_ADDR: n_coeff_bram_addr_reg <= n_coeff_bram_addr_next; - endcase - - - - + if (fsm_next_state == FSM_STATE_LOAD_B_START) + ab_num_words_latch <= ab_num_words; + + // - // Latency Compensation TODO: Remove ab maybe? Looks like latency should be consistent for all cycles... + // Systolic Cycle Counters // - wire [SYSTOLIC_PE_LATENCY:0] pe_latency_start = {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1}; - - reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb; - reg [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb; + + /* handy values */ + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = ab_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; - wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_lsb_next = - {pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY-1:0], pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]}; + /* counters */ + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init; + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load; + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload; + + /* handy increment values */ + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_init_next = syst_cnt_init + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_next = syst_cnt_load + 1'b1; + wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_unload_next = syst_cnt_unload + 1'b1; - wire [SYSTOLIC_PE_LATENCY:0] pe_latency_ab_msb_next = - {pe_latency_ab_msb[SYSTOLIC_PE_LATENCY-1:0], 
pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]}; + /* handy stop flags */ + wire syst_cnt_init_done = (syst_cnt_init == syst_cnt_last) ? 1'b1 : 1'b0; + wire syst_cnt_load_done = (syst_cnt_load == syst_cnt_last) ? 1'b1 : 1'b0; + wire syst_cnt_unload_done = (syst_cnt_unload == syst_cnt_last) ? 1'b1 : 1'b0; - wire pe_latency_ab_lsb_done = pe_latency_ab_lsb[SYSTOLIC_PE_LATENCY]; - wire pe_latency_ab_msb_done = pe_latency_ab_msb[SYSTOLIC_PE_LATENCY]; + /* delayed load counter */ + reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_load_dly; + always @(posedge clk) syst_cnt_load_dly <= syst_cnt_load; - always @(posedge clk) - // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) - // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR, - FSM_STATE_PIPE_RELOAD: pe_latency_ab_lsb <= pe_latency_start; - FSM_STATE_PIPE_CRUNCH: pe_latency_ab_lsb <= pe_latency_ab_lsb_done ? - pe_latency_ab_lsb : pe_latency_ab_lsb_next; - endcase // - // Buffers + // Multiplier Iteration Counter // - integer i, j; - - reg [31: 0] b_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; - reg [31: 0] n_coeff_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; - reg [31: 0] n_buf[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + + /* handy values */ + wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_zero = {SYSTOLIC_ARRAY_POWER{1'b0}}; + wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_last = {SYSTOLIC_ARRAY_POWER{1'b1}}; - always @(posedge clk) + /* counter */ + reg [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt; + + /* handy increment value and stop flag */ + wire [SYSTOLIC_ARRAY_POWER-1:0] mult_cnt_next = mult_cnt + 1'b1; + wire mult_cnt_done = (mult_cnt == mult_cnt_last) ? 
1'b1 : 1'b0; + + // - case (fsm_state) - FSM_STATE_INIT_ZERO_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - b_buf[i][j] <= 32'd0; - - FSM_STATE_INIT_NEXT_ADDR, - FSM_STATE_INIT_LAST_ADDR: - b_buf[b_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][b_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= b_bram_out; - endcase - - always @(posedge clk) + // Initialization Counter Control Logic + // + always @(posedge clk) begin // case (fsm_state) - FSM_STATE_INIT_ZERO_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - n_coeff_buf[i][j] <= 32'd0; - - FSM_STATE_INIT_NEXT_ADDR, - FSM_STATE_INIT_LAST_ADDR: - n_coeff_buf[n_coeff_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_coeff_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_coeff_bram_out; + FSM_STATE_LOAD_B_START, + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_LOAD_N_START: mult_cnt <= mult_cnt_zero; + + FSM_STATE_LOAD_B_SHIFT, + FSM_STATE_LOAD_N_COEFF_SHIFT, + FSM_STATE_LOAD_N_SHIFT: mult_cnt <= mult_cnt_next; endcase - - always @(posedge clk) // case (fsm_state) - FSM_STATE_INIT_ZERO_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - n_buf[i][j] <= 32'd0; - - FSM_STATE_INIT_NEXT_ADDR, - FSM_STATE_INIT_LAST_ADDR: - n_buf[n_bram_addr_dly[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]][n_bram_addr_dly[SYSTOLIC_ARRAY_POWER-1:0]] <= n_bram_out; + FSM_STATE_LOAD_B_START, + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_LOAD_N_START: syst_cnt_init <= syst_cnt_zero; + + FSM_STATE_LOAD_B_WRITE, + FSM_STATE_LOAD_N_COEFF_WRITE, + FSM_STATE_LOAD_N_WRITE: syst_cnt_init <= !syst_cnt_init_done ? 
syst_cnt_init_next : syst_cnt_init; endcase - - - - + // + end // - // Cycle Counters + // Operand Loader // - reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab; - reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_q; - reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn; - reg [ OPERAND_ADDR_WIDTH :0] mult_cnt_s; - - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt; - reg [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_dly[SYSTOLIC_PE_LATENCY-1:0]; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_latency = syst_cnt_dly[SYSTOLIC_PE_LATENCY-1]; - - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}}; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_zero = {SYSTOLIC_CNTR_WIDTH{1'b0}}; - - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_half = {1'b0, n_num_words}; - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_last = {n_num_words, 1'b1}; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_last = n_num_words_latch[OPERAND_ADDR_WIDTH-1:SYSTOLIC_ARRAY_POWER]; - - wire mult_cnt_ab_done = (mult_cnt_ab == mult_cnt_last) ? 1'b1 : 1'b0; - wire mult_cnt_q_done = (mult_cnt_q == mult_cnt_last) ? 1'b1 : 1'b0; - wire mult_cnt_qn_done = (mult_cnt_qn == mult_cnt_last) ? 1'b1 : 1'b0; - wire mult_cnt_s_done = (mult_cnt_s == mult_cnt_last) ? 1'b1 : 1'b0; + /* + * Explain how parallelized loader works here... + * + */ - wire syst_cnt_done = (syst_cnt == syst_cnt_last) ? 1'b1 : 1'b0; - - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_ab_next = mult_cnt_ab + 1'b1; - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_q_next = mult_cnt_q + 1'b1; - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_qn_next = mult_cnt_qn + 1'b1; - wire [ OPERAND_ADDR_WIDTH :0] mult_cnt_s_next = mult_cnt_s + 1'b1; + /* loader banks */ + localparam [ 1: 0] LOADER_ADDR_MSB_B = 2'd0; + localparam [ 1: 0] LOADER_ADDR_MSB_N_COEFF = 2'd1; + localparam [ 1: 0] LOADER_ADDR_MSB_N = 2'd2; - wire [SYSTOLIC_CNTR_WIDTH-1:0] syst_cnt_next = syst_cnt_done ? 
syst_cnt_zero : syst_cnt + 1'b1; - + /* loader input */ + reg [ 2-1:0] loader_addr_msb[0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [SYSTOLIC_CNTR_WIDTH-1:0] loader_addr_lsb[0:SYSTOLIC_ARRAY_LENGTH-1]; + reg loader_wren [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [ 32-1:0] loader_din [0:SYSTOLIC_ARRAY_LENGTH-1]; - always @(posedge clk) + /* loader output */ + wire [ 32-1:0] loader_dout [0:SYSTOLIC_ARRAY_LENGTH-1]; + + /* generate parallelized loader */ + // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) - // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR, - FSM_STATE_PIPE_RELOAD: syst_cnt <= syst_cnt_zero; - FSM_STATE_PIPE_CRUNCH: syst_cnt <= syst_cnt_done ? syst_cnt : syst_cnt_next; - endcase - - always @(posedge clk) + // Loader currently stores B, N_COEFF and N, it can be coded another way + // to initially store B, then AB, then Q. Some memory can be saved that way. + // Maybe later... // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) - // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR: mult_cnt_ab <= mult_cnt_zero; - FSM_STATE_PIPE_RELOAD: mult_cnt_ab <= mult_cnt_ab_done ? mult_cnt_ab : mult_cnt_ab_next; - endcase - - always @(posedge clk) + + genvar i; + generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1) // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) + begin : gen_bram_1rw_readfirst_loader // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR: mult_cnt_q <= mult_cnt_zero; - FSM_STATE_PIPE_RELOAD: if (mult_cnt_ab > mult_cnt_zero) mult_cnt_q <= mult_cnt_q_done ? 
mult_cnt_q : mult_cnt_q_next; - endcase + bram_1rw_readfirst # + ( + .MEM_WIDTH (32), + .MEM_ADDR_BITS (SYSTOLIC_CNTR_WIDTH + 2) + ) + bram_loader + ( + .clk (clk), + .a_addr ({loader_addr_msb[i], loader_addr_lsb[i]}), + .a_wr (loader_wren[i]), + .a_in (loader_din[i]), + .a_out (loader_dout[i]) + ); + // + end + // + endgenerate + - always @(posedge clk) // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) - // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR: mult_cnt_qn <= mult_cnt_zero; - FSM_STATE_PIPE_RELOAD: if (mult_cnt_q > mult_cnt_zero) mult_cnt_qn <= mult_cnt_qn_done ? mult_cnt_qn : mult_cnt_qn_next; - endcase - - always @(posedge clk) + // Block Memory Addresses // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) - // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR: mult_cnt_s <= mult_cnt_zero; - FSM_STATE_PIPE_RELOAD: if (mult_cnt_qn > mult_cnt_zero) mult_cnt_s <= mult_cnt_s_done ? mult_cnt_qn : mult_cnt_s_next; - endcase + /* + * Explain why there are two memory sizes. + * + */ + + /* the very first addresses */ + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_zero = { {OPERAND_ADDR_WIDTH{1'b0}}}; + wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_zero = {1'b0, {OPERAND_ADDR_WIDTH{1'b0}}}; + + /* the very last addresses */ + wire [OPERAND_ADDR_WIDTH-1:0] bram_addr_last = {ab_num_words_latch}; + wire [OPERAND_ADDR_WIDTH :0] bram_addr_ext_last = {ab_num_words_latch, 1'b1}; + + /* address registers */ + reg [OPERAND_ADDR_WIDTH-1:0] a_addr; + reg [OPERAND_ADDR_WIDTH-1:0] b_addr; + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr; + reg [OPERAND_ADDR_WIDTH-1:0] n_addr; + reg [OPERAND_ADDR_WIDTH :0] ab_addr_ext; + reg [OPERAND_ADDR_WIDTH-1:0] q_addr; + reg [OPERAND_ADDR_WIDTH :0] qn_addr_ext; + /* handy increment values */ + wire [OPERAND_ADDR_WIDTH-1:0] a_addr_next = a_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] b_addr_next = b_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_next = n_coeff_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] n_addr_next = n_addr + 1'b1; + wire 
[OPERAND_ADDR_WIDTH :0] ab_addr_ext_next = ab_addr_ext + 1'b1; + wire [OPERAND_ADDR_WIDTH-1:0] q_addr_next = q_addr + 1'b1; + wire [OPERAND_ADDR_WIDTH :0] qn_addr_ext_next = qn_addr_ext + 1'b1; + + /* handy stop flags */ + wire a_addr_done = (a_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire b_addr_done = (b_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire n_coeff_addr_done = (n_coeff_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire n_addr_done = (n_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire ab_addr_ext_done = (ab_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0; + wire q_addr_done = (q_addr == bram_addr_last) ? 1'b1 : 1'b0; + wire qn_addr_ext_done = (qn_addr_ext == bram_addr_ext_last) ? 1'b1 : 1'b0; + + /* delayed B address */ + reg [OPERAND_ADDR_WIDTH-1:0] b_addr_dly; + always @(posedge clk) b_addr_dly <= b_addr; + + reg [OPERAND_ADDR_WIDTH-1:0] n_coeff_addr_dly; + always @(posedge clk) n_coeff_addr_dly <= n_coeff_addr; + + reg [OPERAND_ADDR_WIDTH-1:0] n_addr_dly; + always @(posedge clk) n_addr_dly <= n_addr; + + /* map registers to top-level ports */ + assign a_bram_addr = a_addr; + assign b_bram_addr = b_addr; + assign n_coeff_bram_addr = n_coeff_addr; + assign n_bram_addr = n_addr; + + + // + // Memory Address Control Logic + // always @(posedge clk) begin - syst_cnt_dly[0] <= syst_cnt; - for (i=1; i<SYSTOLIC_PE_LATENCY; i=i+1) - syst_cnt_dly[i] <= syst_cnt_dly[i-1]; + // + case (fsm_next_state) + FSM_STATE_LOAD_B_START: b_addr <= bram_addr_zero; + FSM_STATE_LOAD_N_COEFF_START: n_coeff_addr <= bram_addr_zero; + FSM_STATE_LOAD_N_START: n_addr <= bram_addr_zero; + + FSM_STATE_LOAD_B_SHIFT: b_addr <= b_addr_next; + FSM_STATE_LOAD_N_COEFF_SHIFT: n_coeff_addr <= n_coeff_addr_next; + FSM_STATE_LOAD_N_SHIFT: n_addr <= n_addr_next; + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START: a_addr <= bram_addr_zero; + FSM_STATE_MULT_A_B_RELOAD: a_addr <= !a_addr_done ? 
a_addr_next : a_addr; + endcase + // end + // - // Systolic Array + // Internal Memories // - wire [31: 0] mul_ab_p[SYSTOLIC_ARRAY_LENGTH-1:0]; - wire [31: 0] mul_ab_c_out[SYSTOLIC_ARRAY_LENGTH-1:0]; - wire [31: 0] mul_q_p[SYSTOLIC_ARRAY_LENGTH-1:0]; - wire [31: 0] mul_q_c_out[SYSTOLIC_ARRAY_LENGTH-1:0]; + /* memory inputs */ + reg [31: 0] ab_data_in; + reg [31: 0] q_data_in; + reg [31: 0] qn_data_in; - wire [31: 0] mul_qn_p[SYSTOLIC_ARRAY_LENGTH-1:0]; - wire [31: 0] mul_qn_c_out[SYSTOLIC_ARRAY_LENGTH-1:0]; - - wire [31: 0] mul_ab_a = (mult_cnt_ab <= mult_cnt_half) ? a_bram_out : 32'd0; - reg [31: 0] mul_q_a_int; - reg [31: 0] mul_q_a; - reg [31: 0] mul_qn_a_int; - reg [31: 0] mul_qn_a; - - reg [31: 0] t_ab[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; - reg [31: 0] c_ab_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + /* memory outputs */ + wire [31: 0] ab_data_out; + wire [31: 0] q_data_out; + wire [31: 0] qn_data_out; - reg [31: 0] t_q[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; - reg [31: 0] c_q_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + /* write enables */ + reg ab_wren; + reg q_wren; + reg qn_wren; - reg [31: 0] t_qn[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; - reg [31: 0] c_qn_in[SYSTOLIC_NUM_CYCLES-1:0][SYSTOLIC_ARRAY_LENGTH-1:0]; + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_ab (.clk(clk), .a_addr(ab_addr_ext), .a_wr(ab_wren), .a_in(ab_data_in), .a_out(ab_data_out)); - genvar syst; - generate for (syst=0; syst<SYSTOLIC_ARRAY_LENGTH; syst=syst+1) - begin : gen_mul + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_q (.clk(clk), .a_addr(q_addr), .a_wr(q_wren), .a_in(q_data_in), .a_out(q_data_out)); - modexpa7_systolic_pe mul_ab_inst - ( - .clk (clk), - .a (mul_ab_a), - .b (b_buf[syst_cnt][syst]), - .t (t_ab[syst_cnt][syst]), - .c_in (c_ab_in[syst_cnt][syst]), - - .p (mul_ab_p[syst]), - .c_out (mul_ab_c_out[syst]) - ); - - 
modexpa7_systolic_pe mul_q_inst - ( - .clk (clk), - .a (mul_q_a), - .b (n_coeff_buf[syst_cnt][syst]), - .t (t_q[syst_cnt][syst]), - .c_in (c_q_in[syst_cnt][syst]), - - .p (mul_q_p[syst]), - .c_out (mul_q_c_out[syst]) - ); - + bram_1rw_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH+1)) + bram_qn (.clk(clk), .a_addr(qn_addr_ext), .a_wr(qn_wren), .a_in(qn_data_in), .a_out(qn_data_out)); - modexpa7_systolic_pe mul_qn_inst - ( - .clk (clk), - .a (mul_qn_a), - .b (n_buf[syst_cnt][syst]), - .t (t_qn[syst_cnt][syst]), - .c_in (c_qn_in[syst_cnt][syst]), - - .p (mul_qn_p[syst]), - .c_out (mul_qn_c_out[syst]) - ); - - end - endgenerate // - // c_ab - // - always @(posedge clk) + // Wide Operand Loader // - case (fsm_state) - - FSM_STATE_INIT_LAST_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - c_ab_in[i][j] <= 32'd0; - - FSM_STATE_PIPE_CRUNCH: - if (pe_latency_ab_lsb_done) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - c_ab_in[syst_cnt_latency][j] <= mul_ab_c_out[j]; - endcase + integer j; - // - // c_q - // - always @(posedge clk) - // - case (fsm_state) - - FSM_STATE_INIT_LAST_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - c_q_in[i][j] <= 32'd0; - - FSM_STATE_PIPE_CRUNCH: - if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - c_q_in[syst_cnt_latency][j] <= mul_q_c_out[j]; - endcase - - // - // c_qn - // + /* shift logic */ always @(posedge clk) // case (fsm_state) - - FSM_STATE_INIT_LAST_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - c_qn_in[i][j] <= 32'd0; - - FSM_STATE_PIPE_CRUNCH: - if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - c_qn_in[syst_cnt_latency][j] <= mul_qn_c_out[j]; - endcase + // + FSM_STATE_LOAD_B_SHIFT: begin - // - // t_ab - // - always @(posedge clk) - // - case (fsm_state) + /* update the 
rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (b_addr_dly <= bram_addr_last) ? b_bram_out : {32{1'b0}}; + + /* shift the loader buffer to the left */ + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + + end + // + FSM_STATE_LOAD_N_COEFF_SHIFT: begin - FSM_STATE_INIT_LAST_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - t_ab[i][j] <= 32'd0; - - FSM_STATE_PIPE_CRUNCH: - if (pe_latency_ab_lsb_done) begin - if (syst_cnt_latency > syst_cnt_zero) - t_ab[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_ab_p[0]; - for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - t_ab[syst_cnt_latency][j-1] <= mul_ab_p[j]; - end + /* update the rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_coeff_addr_dly <= bram_addr_last) ? n_coeff_bram_out : {32{1'b0}}; + + /* shift the loader buffer to the left */ + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + + end + // + FSM_STATE_LOAD_N_SHIFT: begin + + /* update the rightmost part of loader buffer */ + loader_din[SYSTOLIC_ARRAY_LENGTH-1] <= (n_addr_dly <= bram_addr_last) ? 
n_bram_out : {32{1'b0}}; + /* shift the loader buffer to the left */ + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_din[j-1] <= loader_din[j]; + + end + // endcase + - - // - // t_q - // + /* write enable logic */ always @(posedge clk) // - case (fsm_state) + case (fsm_next_state) - FSM_STATE_INIT_LAST_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - t_q[i][j] <= 32'd0; - - FSM_STATE_PIPE_CRUNCH: - if (pe_latency_ab_lsb_done && (mult_cnt_ab > mult_cnt_zero)) begin - if (syst_cnt_latency > syst_cnt_zero) - t_q[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_q_p[0]; - for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - t_q[syst_cnt_latency][j-1] <= mul_q_p[j]; - end - + FSM_STATE_LOAD_B_WRITE, + FSM_STATE_LOAD_N_COEFF_WRITE, + FSM_STATE_LOAD_N_WRITE: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_wren[j] <= 1'b1; + + default: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_wren[j] <= 1'b0; + endcase - - // - // t_qn - // - always @(posedge clk) + /* loader address update logic */ + always @(posedge clk) begin // case (fsm_state) - FSM_STATE_INIT_LAST_ADDR: - for (i=0; i<SYSTOLIC_NUM_CYCLES; i=i+1) - for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - t_qn[i][j] <= 32'd0; - - FSM_STATE_PIPE_CRUNCH: - if (pe_latency_ab_lsb_done && (mult_cnt_q > mult_cnt_zero)) begin - if (syst_cnt_latency > syst_cnt_zero) - t_qn[syst_cnt_latency-1'b1][SYSTOLIC_ARRAY_LENGTH-1'b1] <= mul_qn_p[0]; - for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) - t_qn[syst_cnt_latency][j-1] <= mul_qn_p[j]; - end - + FSM_STATE_LOAD_B_START, + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_LOAD_N_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= syst_cnt_zero; + + FSM_STATE_LOAD_B_WRITE, + FSM_STATE_LOAD_N_COEFF_WRITE, + FSM_STATE_LOAD_N_WRITE: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= !syst_cnt_init_done ? 
syst_cnt_init_next : syst_cnt_init; + + endcase + // + case (fsm_next_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= syst_cnt_zero; + + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_lsb[j] <= !syst_cnt_load_done ? syst_cnt_load_next : syst_cnt_init; endcase + // + case (fsm_next_state) + + FSM_STATE_LOAD_B_START, + FSM_STATE_MULT_A_B_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_msb[j] <= LOADER_ADDR_MSB_B; + + FSM_STATE_LOAD_N_COEFF_START, + FSM_STATE_MULT_AB_N_COEFF_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_msb[j] <= LOADER_ADDR_MSB_N_COEFF; + + FSM_STATE_LOAD_N_START, + FSM_STATE_MULT_Q_N_START: + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + loader_addr_msb[j] <= LOADER_ADDR_MSB_N; + endcase // - // Latency 2 + end + + // - always @(posedge clk) + // Systolic Array of Processing Elements // - if (fsm_next_state == FSM_STATE_PIPE_CRUNCH) - // - case (fsm_state) - FSM_STATE_INIT_LAST_ADDR, - FSM_STATE_PIPE_RELOAD: pe_latency_ab_msb <= pe_latency_start; - FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) - pe_latency_ab_msb <= pe_latency_ab_msb_done ? - pe_latency_ab_msb : pe_latency_ab_msb_next; - endcase - + reg [31: 0] pe_a [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [31: 0] pe_b [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [31: 0] pe_t [0:SYSTOLIC_ARRAY_LENGTH-1]; + reg [31: 0] pe_c_in [0:SYSTOLIC_ARRAY_LENGTH-1]; + wire [31: 0] pe_p [0:SYSTOLIC_ARRAY_LENGTH-1]; + wire [31: 0] pe_c_out[0:SYSTOLIC_ARRAY_LENGTH-1]; + // - // Adder + // These can be turned into a FIFO (maybe later?)... 
// - reg pe_add_ce; - reg [31: 0] pe_add_a0; - reg [31: 0] pe_add_a1; - reg [31: 0] pe_add_a2; - reg [31: 0] pe_add_b0; + reg [31: 0] pe_c_out_mem[0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1]; + reg [31: 0] pe_t_mem [0:SYSTOLIC_ARRAY_LENGTH-1][0:SYSTOLIC_NUM_CYCLES-1]; - reg pe_add_c_in; - wire [31: 0] pe_add_s; - wire pe_add_c_out; + generate for (i=0; i<SYSTOLIC_ARRAY_LENGTH; i=i+1) + begin : modexpa7_systolic_pe_multiplier + modexpa7_systolic_pe systolic_pe_inst + ( + .clk (clk), + .a (pe_a[i]), + .b (pe_b[i]), + .t (pe_t[i]), + .c_in (pe_c_in[i]), + .p (pe_p[i]), + .c_out (pe_c_out[i]) + ); + end + endgenerate - reg pe_sub_ce; - reg [31: 0] pe_sub_a0; - reg [31: 0] pe_sub_b0; - reg pe_sub_b_in; - wire [31: 0] pe_sub_d; - wire pe_sub_b_out; - - always @(posedge clk) - pe_add_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done; - - always @(posedge clk) - pe_sub_ce <= pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero); + + + // + // Shift Registers + // + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_load; + reg [SYSTOLIC_PE_LATENCY :0] shreg_latency; + reg [SYSTOLIC_NUM_CYCLES-1:0] shreg_unload; - always @(posedge clk) - // - if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_q > mult_cnt_zero) && !mult_cnt_s_done) - pe_add_c_in <= (mult_cnt_qn == mult_cnt_zero) ? 1'b0 : pe_add_c_out; + wire shreg_done_load = shreg_load[syst_cnt_last]; + wire shreg_done_latency = shreg_latency[SYSTOLIC_PE_LATENCY]; + wire shreg_done_unload = shreg_unload[syst_cnt_last]; - always @(posedge clk) - // - if (pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero) && (mult_cnt_qn > mult_cnt_zero)) - pe_sub_b_in <= (mult_cnt_s == mult_cnt_zero) ? 
1'b0 : pe_sub_b_out; - + reg shreg_now_loading; + reg shreg_now_latency; + reg shreg_now_unloading; - modexpa7_adder32 pe_add_inst - ( - .clk (clk), - .ce (pe_add_ce), - .a (pe_add_a2), - .b (pe_add_b0), - .c_in (pe_add_c_in), - .s (pe_add_s), - .c_out (pe_add_c_out) - ); - - modexpa7_subtractor32 pe_sub_inst - ( - .clk (clk), - .ce (pe_sub_ce), - .a (pe_sub_a0), - .b (pe_sub_b0), - .b_in (pe_sub_b_in), - .d (pe_sub_d), - .b_out (pe_sub_b_out) - ); + reg shreg_done_latency_dly; always @(posedge clk) - // - if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) begin - pe_add_a0 <= mul_ab_p[0]; - pe_add_a1 <= pe_add_a0; - pe_add_a2 <= pe_add_a1; - end + shreg_done_latency_dly <= shreg_done_latency; always @(posedge clk) // - if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) - pe_sub_a0 <= pe_add_s; - - always @(posedge clk) - // - if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) - pe_add_b0 <= mul_qn_p[0]; - - always @(posedge clk) - // - if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) - pe_sub_b0 <= (mult_cnt_s <= mult_cnt_half) ? 
32'd0 : n_bram_out; - - - always @(posedge clk) - // - case (fsm_next_state) - FSM_STATE_INIT_ZERO_ADDR: n_bram_addr_reg <= bram_addr_zero; - FSM_STATE_INIT_NEXT_ADDR: n_bram_addr_reg <= n_bram_addr_next; - FSM_STATE_PIPE_RELOAD: begin - if (mult_cnt_s == mult_cnt_half) n_bram_addr_reg <= bram_addr_zero; - if (mult_cnt_s > mult_cnt_half) n_bram_addr_reg <= n_bram_addr_next; + case (fsm_state) + // + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: begin + shreg_now_loading <= 1'b1; + shreg_now_latency <= 1'b1; + shreg_now_unloading <= 1'b0; + shreg_load <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b1}; + shreg_latency <= {{SYSTOLIC_PE_LATENCY{1'b0}}, 1'b1}; + shreg_unload <= {{SYSTOLIC_NUM_CYCLES-1{1'b0}}, 1'b0}; + end + // + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: begin + shreg_load <= {shreg_load[SYSTOLIC_NUM_CYCLES-2:0], 1'b0}; + shreg_latency <= {shreg_latency[SYSTOLIC_PE_LATENCY-1:0], 1'b0}; + shreg_unload <= {shreg_unload[SYSTOLIC_NUM_CYCLES-2:0], shreg_latency[SYSTOLIC_PE_LATENCY]}; + + if (shreg_done_load) shreg_now_loading <= 1'b0; + if (shreg_done_latency) shreg_now_latency <= 1'b0; + if (shreg_done_latency) shreg_now_unloading <= 1'b1; + else if (shreg_done_unload) shreg_now_unloading <= 1'b0; + + end + // + default: begin + shreg_now_loading <= 1'b0; + shreg_now_latency <= 1'b0; + shreg_now_unloading <= 1'b0; end + // endcase + + + + always @(posedge clk) begin // - // Ready Flag Logic - // - reg rdy_reg = 1'b1; - assign rdy = rdy_reg; - - always @(posedge clk or negedge rst_n) - // - if (rst_n == 1'b0) rdy_reg <= 1'b1; - else begin - if (fsm_state == FSM_STATE_IDLE) rdy_reg <= ~ena_trig; - if (fsm_state == FSM_STATE_STOP) rdy_reg <= 1'b1; - end - - - // - // - // - always @(posedge clk) - // - if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && 
(syst_cnt_latency == syst_cnt_zero)) - mul_q_a_int <= mul_ab_p[0]; - - always @(posedge clk) + case (fsm_state) + FSM_STATE_MULT_A_B_START: ab_addr_ext <= bram_addr_ext_zero; + FSM_STATE_MULT_AB_N_COEFF_START: q_addr <= bram_addr_zero; + FSM_STATE_MULT_Q_N_START: qn_addr_ext <= bram_addr_ext_zero; + + FSM_STATE_MULT_A_B_RELOAD: ab_addr_ext <= ab_addr_ext_next; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: q_addr <= q_addr_next; + FSM_STATE_MULT_Q_N_RELOAD: qn_addr_ext <= qn_addr_ext_next; + + endcase // - if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) - mul_qn_a_int <= mul_q_p[0]; - - always @(posedge clk) + case (fsm_next_state) + FSM_STATE_MULT_AB_N_COEFF_START: ab_addr_ext <= bram_addr_ext_zero; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: ab_addr_ext <= ab_addr_ext_next; + endcase // - if (fsm_state == FSM_STATE_PIPE_RELOAD) - mul_q_a <= mul_q_a_int; // TODO: Add masking! Maybe not needed after all?.. + case (fsm_next_state) + FSM_STATE_MULT_Q_N_START: q_addr <= bram_addr_zero; + FSM_STATE_MULT_Q_N_RELOAD: q_addr <= !q_addr_done ? q_addr_next : q_addr; + endcase - always @(posedge clk) - // - if (fsm_state == FSM_STATE_PIPE_RELOAD) - mul_qn_a <= (mult_cnt_qn < mult_cnt_half) ? 
mul_qn_a_int : 32'd0; - - // - // Debug - // - //always @(posedge clk) begin - // - //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) - //$display("ab[%2d] = %08x", mult_cnt_ab, mul_ab_p[0]); - // - //if ((fsm_state == FSM_STATE_PIPE_CRUNCH) && pe_latency_ab_lsb_done && (syst_cnt_latency == syst_cnt_zero)) - //$display("q[%2d] = %08x", mult_cnt_q, mul_q_p[0]); - // - //if (fsm_state == FSM_STATE_PIPE_RELOAD) - //$display("s[%2d] = %08x", mult_cnt_qn, pe_add_s); // - //if (fsm_state == FSM_STATE_PIPE_RELOAD) - //$display("d[%2d] = %08x", mult_cnt_s, pe_sub_d); - // - //end - + end - wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_rd; - reg [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr; - wire [OPERAND_ADDR_WIDTH-1:0] s_bram_addr_wr_next = s_bram_addr_wr + 1'b1; - reg s_bram_en; - - wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_rd; - reg [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr; - wire [OPERAND_ADDR_WIDTH-1:0] sn_bram_addr_wr_next = sn_bram_addr_wr + 1'b1; - reg sn_bram_en; - - assign s_bram_addr_rd = s_bram_addr; - assign sn_bram_addr_rd = s_bram_addr; - - wire [31: 0] s_bram_din; - wire [31: 0] s_bram_dout; - - wire [31: 0] sn_bram_din; - wire [31: 0] sn_bram_dout; - - assign s_bram_din = pe_add_s; - assign sn_bram_din = pe_sub_d; - - always @(posedge clk) + always @(posedge clk) begin // - s_bram_en <= pe_add_ce && (mult_cnt_qn > mult_cnt_half); - - always @(posedge clk) + if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) begin + ab_wren <= shreg_done_latency_dly; + ab_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + ab_wren <= 1'b0; + ab_data_in <= 32'hXXXXXXXX; + end // - sn_bram_en <= pe_sub_ce && (mult_cnt_s > mult_cnt_half); - - always @(posedge clk) begin + if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) begin + q_wren <= shreg_done_latency_dly; + q_data_in <= shreg_done_latency_dly ? 
pe_p[0] : 32'hXXXXXXXX; + end else begin + q_wren <= 1'b0; + q_data_in <= 32'hXXXXXXXX; + end // - if (pe_add_ce && (mult_cnt_qn == mult_cnt_half)) s_bram_addr_wr <= bram_addr_zero; - if (s_bram_en && (s_bram_addr_wr < bram_addr_last)) s_bram_addr_wr <= s_bram_addr_wr_next; - end + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) begin + qn_wren <= shreg_done_latency_dly; + qn_data_in <= shreg_done_latency_dly ? pe_p[0] : 32'hXXXXXXXX; + end else begin + qn_wren <= 1'b0; + qn_data_in <= 32'hXXXXXXXX; + end - always @(posedge clk) begin // - if (pe_sub_ce && (mult_cnt_s == mult_cnt_half)) sn_bram_addr_wr <= bram_addr_zero; - if (sn_bram_en && (sn_bram_addr_wr < bram_addr_last)) sn_bram_addr_wr <= sn_bram_addr_wr_next; end - bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_s (.clk(clk), - .a_addr(s_bram_addr_wr), .a_wr(s_bram_en), .a_in(s_bram_din), .a_out(), - .b_addr(s_bram_addr_rd), .b_out(s_bram_dout)); - - bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_sn (.clk(clk), - .a_addr(sn_bram_addr_wr), .a_wr(sn_bram_en), .a_in(sn_bram_din), .a_out(), - .b_addr(sn_bram_addr_rd), .b_out(sn_bram_dout)); - - - reg r_bram_en; always @(posedge clk) // - case (fsm_state) - FSM_STATE_SAVE_ZERO_ADDR, - FSM_STATE_SAVE_NEXT_ADDR: r_bram_en <= 1'b1; - default: r_bram_en <= 1'b0; + case (fsm_next_state) + FSM_STATE_MULT_A_B_START, + FSM_STATE_MULT_AB_N_COEFF_START, + FSM_STATE_MULT_Q_N_START, + FSM_STATE_MULT_A_B_RELOAD, + FSM_STATE_MULT_AB_N_COEFF_RELOAD, + FSM_STATE_MULT_Q_N_RELOAD: + // + syst_cnt_load <= syst_cnt_zero; + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: + // + syst_cnt_load <= !syst_cnt_load_done ? 
syst_cnt_load_next : syst_cnt_load; + endcase + + always @(posedge clk) + // + case (fsm_state) + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: begin - reg r_bram_wr_reg; - - assign r_bram_wr = r_bram_wr_reg; + if (shreg_done_latency) syst_cnt_unload <= syst_cnt_zero; + else if (shreg_now_unloading) + syst_cnt_unload <= !syst_cnt_unload_done ? syst_cnt_unload_next : syst_cnt_unload; + + end + endcase always @(posedge clk) // - r_bram_wr_reg <= r_bram_en; - - - wire r_select_s_over_sn = pe_sub_b_out && !pe_add_c_out; - + case (fsm_state) + FSM_STATE_MULT_A_B_CRUNCH, + FSM_STATE_MULT_AB_N_COEFF_CRUNCH, + FSM_STATE_MULT_Q_N_CRUNCH: begin - reg [31: 0] r_bram_in_reg; - - assign r_bram_in = r_bram_in_reg; + if (shreg_now_unloading) + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + pe_c_out_mem[syst_cnt_unload][j] <= pe_c_out[j]; + + if (shreg_now_unloading) begin + + for (j=1; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + pe_t_mem[syst_cnt_unload][j-1] <= pe_p[j]; + + if (syst_cnt_unload > syst_cnt_zero) + pe_t_mem[syst_cnt_unload-1'b1][SYSTOLIC_ARRAY_LENGTH-1] <= pe_p[0]; + else + pe_t_mem[syst_cnt_last][SYSTOLIC_ARRAY_LENGTH-1] <= 32'd0; + + end + end + endcase - always @(posedge clk) + // - if (r_bram_en) - r_bram_in_reg <= r_select_s_over_sn ? s_bram_dout : sn_bram_dout; - - always @(posedge clk) + // T and C_IN can be moved to a separate code block + // + always @(posedge clk) begin + // + if (fsm_state == FSM_STATE_MULT_A_B_CRUNCH) + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + // + if (shreg_now_loading) begin + pe_a[j] <= (ab_addr_ext > {1'b0, a_addr}) ? 32'd0 : a_bram_out; + pe_b[j] <= loader_dout[j]; + pe_t[j] <= (a_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j]; + pe_c_in[j] <= (a_addr == bram_addr_zero) ? 
32'd0 : pe_c_out_mem[syst_cnt_load_dly][j]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + pe_t[j] <= 32'hXXXXXXXX; + pe_c_in[j] <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_AB_N_COEFF_CRUNCH) + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + // + if (shreg_now_loading) begin + pe_a[j] <= ab_data_out; + pe_b[j] <= loader_dout[j]; + pe_t[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j]; + pe_c_in[j] <= (ab_addr_ext == bram_addr_ext_zero) ? 32'd0 : pe_c_out_mem[syst_cnt_load_dly][j]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + pe_t[j] <= 32'hXXXXXXXX; + pe_c_in[j] <= 32'hXXXXXXXX; + end + // + if (fsm_state == FSM_STATE_MULT_Q_N_CRUNCH) + // + for (j=0; j<SYSTOLIC_ARRAY_LENGTH; j=j+1) + // + if (shreg_now_loading) begin + pe_a[j] <= (qn_addr_ext > {1'b0, q_addr}) ? 32'd0 : q_data_out; + pe_b[j] <= loader_dout[j]; + pe_t[j] <= (q_addr == bram_addr_zero) ? 32'd0 : pe_t_mem[syst_cnt_load_dly][j]; + pe_c_in[j] <= (q_addr == bram_addr_zero) ? 
32'd0 : pe_c_out_mem[syst_cnt_load_dly][j]; + end else begin + pe_a[j] <= 32'hXXXXXXXX; + pe_b[j] <= 32'hXXXXXXXX; + pe_t[j] <= 32'hXXXXXXXX; + pe_c_in[j] <= 32'hXXXXXXXX; + end // - if (r_bram_en) - r_bram_addr_reg <= s_bram_addr_dly; - // - // FSM Transition Logic + end + + + // + // FSM Process // always @(posedge clk or negedge rst_n) // if (rst_n == 1'b0) fsm_state <= FSM_STATE_IDLE; else fsm_state <= fsm_next_state; + + // + // FSM Transition Logic + // always @* begin // fsm_next_state = FSM_STATE_STOP; // case (fsm_state) - - FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_INIT_ZERO_ADDR; + + FSM_STATE_IDLE: if (ena_trig) fsm_next_state = FSM_STATE_LOAD_B_START; else fsm_next_state = FSM_STATE_IDLE; - - FSM_STATE_INIT_ZERO_ADDR: fsm_next_state = FSM_STATE_INIT_NEXT_ADDR; - - FSM_STATE_INIT_NEXT_ADDR: if (b_bram_addr_done) fsm_next_state = FSM_STATE_INIT_LAST_ADDR; - else fsm_next_state = FSM_STATE_INIT_NEXT_ADDR; - - FSM_STATE_INIT_LAST_ADDR: fsm_next_state = FSM_STATE_PIPE_CRUNCH; - - FSM_STATE_PIPE_CRUNCH: if (syst_cnt_done) fsm_next_state = pe_latency_ab_msb_done ? 
- FSM_STATE_PIPE_RELOAD : FSM_STATE_PIPE_CRUNCH; - else fsm_next_state = FSM_STATE_PIPE_CRUNCH; - - FSM_STATE_PIPE_RELOAD: if (mult_cnt_s_done) fsm_next_state = FSM_STATE_SAVE_ZERO_ADDR; - else fsm_next_state = FSM_STATE_PIPE_CRUNCH; - - FSM_STATE_SAVE_ZERO_ADDR: fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR; - - FSM_STATE_SAVE_NEXT_ADDR: if (s_bram_addr_done) fsm_next_state = FSM_STATE_SAVE_LAST_ADDR; - else fsm_next_state = FSM_STATE_SAVE_NEXT_ADDR; - - FSM_STATE_SAVE_LAST_ADDR: fsm_next_state = FSM_STATE_STOP; - + // + FSM_STATE_LOAD_B_START: fsm_next_state = FSM_STATE_LOAD_B_SHIFT; + FSM_STATE_LOAD_B_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_B_WRITE; + else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; + FSM_STATE_LOAD_B_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_B_FINAL; + else fsm_next_state = FSM_STATE_LOAD_B_SHIFT; + FSM_STATE_LOAD_B_FINAL: fsm_next_state = FSM_STATE_LOAD_N_COEFF_START; + // + FSM_STATE_LOAD_N_COEFF_START: fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; + FSM_STATE_LOAD_N_COEFF_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_WRITE; + else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; + FSM_STATE_LOAD_N_COEFF_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_COEFF_FINAL; + else fsm_next_state = FSM_STATE_LOAD_N_COEFF_SHIFT; + FSM_STATE_LOAD_N_COEFF_FINAL: fsm_next_state = FSM_STATE_LOAD_N_START; + // + FSM_STATE_LOAD_N_START: fsm_next_state = FSM_STATE_LOAD_N_SHIFT; + FSM_STATE_LOAD_N_SHIFT: if (mult_cnt_done) fsm_next_state = FSM_STATE_LOAD_N_WRITE; + else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; + FSM_STATE_LOAD_N_WRITE: if (syst_cnt_init_done) fsm_next_state = FSM_STATE_LOAD_N_FINAL; + else fsm_next_state = FSM_STATE_LOAD_N_SHIFT; + FSM_STATE_LOAD_N_FINAL: fsm_next_state = FSM_STATE_MULT_A_B_START; + // + FSM_STATE_MULT_A_B_START: fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; + FSM_STATE_MULT_A_B_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_A_B_RELOAD; + else 
fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; + FSM_STATE_MULT_A_B_RELOAD: if (ab_addr_ext_done) fsm_next_state = FSM_STATE_MULT_A_B_FINAL; + else fsm_next_state = FSM_STATE_MULT_A_B_CRUNCH; + FSM_STATE_MULT_A_B_FINAL: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_START; + // + FSM_STATE_MULT_AB_N_COEFF_START: fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; + FSM_STATE_MULT_AB_N_COEFF_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_RELOAD; + else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; + FSM_STATE_MULT_AB_N_COEFF_RELOAD: if (q_addr_done) fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_FINAL; + else fsm_next_state = FSM_STATE_MULT_AB_N_COEFF_CRUNCH; + FSM_STATE_MULT_AB_N_COEFF_FINAL: fsm_next_state = FSM_STATE_MULT_Q_N_START; + // + FSM_STATE_MULT_Q_N_START: fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; + FSM_STATE_MULT_Q_N_CRUNCH: if (shreg_done_unload) fsm_next_state = FSM_STATE_MULT_Q_N_RELOAD; + else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; + FSM_STATE_MULT_Q_N_RELOAD: if (qn_addr_ext_done) fsm_next_state = FSM_STATE_MULT_Q_N_FINAL; + else fsm_next_state = FSM_STATE_MULT_Q_N_CRUNCH; + FSM_STATE_MULT_Q_N_FINAL: fsm_next_state = FSM_STATE_STOP; + // FSM_STATE_STOP: fsm_next_state = FSM_STATE_IDLE; - + endcase - end + // + end endmodule diff --git a/src/rtl/pe/modexpa7_adder32.v b/src/rtl/pe/modexpa7_adder32.v index ad296b1..04f8a18 100644 --- a/src/rtl/pe/modexpa7_adder32.v +++ b/src/rtl/pe/modexpa7_adder32.v @@ -51,7 +51,7 @@ module modexpa7_adder32 // // Include Primitive Selector // - `include "modexpa7_lowlevel_settings.v" + `include "modexpa7_primitive_switch.v" // diff --git a/src/rtl/pe/modexpa7_lowlevel_settings.v b/src/rtl/pe/modexpa7_lowlevel_settings.v deleted file mode 100644 index 93f5f34..0000000 --- a/src/rtl/pe/modexpa7_lowlevel_settings.v +++ /dev/null @@ -1,15 +0,0 @@ -//`define USE_VENDOR_PRIMITIVES - -`ifdef USE_VENDOR_PRIMITIVES - -`define ADDER32_PRIMITIVE adder32_artix7 -`define SUBTRACTOR32_PRIMITIVE 
subtractor32_artix7 -`define SYSTOLIC_PE_PRIMITIVE systolic_pe_artix7 - -`else - -`define ADDER32_PRIMITIVE adder32_generic -`define SUBTRACTOR32_PRIMITIVE subtractor32_generic -`define SYSTOLIC_PE_PRIMITIVE systolic_pe_generic - -`endif diff --git a/src/rtl/pe/modexpa7_primitive_switch.v b/src/rtl/pe/modexpa7_primitive_switch.v new file mode 100644 index 0000000..d38069b --- /dev/null +++ b/src/rtl/pe/modexpa7_primitive_switch.v @@ -0,0 +1,16 @@ +//`define USE_VENDOR_PRIMITIVES + +`ifdef USE_VENDOR_PRIMITIVES + +`define ADDER32_PRIMITIVE adder32_artix7 +`define SUBTRACTOR32_PRIMITIVE subtractor32_artix7 +`define SYSTOLIC_PE_PRIMITIVE systolic_pe_artix7 + +`else + +`define ADDER32_PRIMITIVE adder32_generic +`define SUBTRACTOR32_PRIMITIVE subtractor32_generic +`define SYSTOLIC_PE_PRIMITIVE systolic_pe_generic + + +`endif diff --git a/src/rtl/pe/modexpa7_subtractor32.v b/src/rtl/pe/modexpa7_subtractor32.v index 75b9c13..a43d670 100644 --- a/src/rtl/pe/modexpa7_subtractor32.v +++ b/src/rtl/pe/modexpa7_subtractor32.v @@ -51,7 +51,7 @@ module modexpa7_subtractor32 // // Include Primitive Selector // - `include "modexpa7_lowlevel_settings.v" + `include "modexpa7_primitive_switch.v" // diff --git a/src/rtl/pe/modexpa7_systolic_pe.v b/src/rtl/pe/modexpa7_systolic_pe.v index 22e6874..b284134 100644 --- a/src/rtl/pe/modexpa7_systolic_pe.v +++ b/src/rtl/pe/modexpa7_systolic_pe.v @@ -51,7 +51,7 @@ module modexpa7_systolic_pe // // Include Primitive Selector // - `include "modexpa7_lowlevel_settings.v" + `include "modexpa7_primitive_switch.v" // -- cgit v1.2.3