From 5807b0bfd7efe8dd3f83d679730241847517980b Mon Sep 17 00:00:00 2001 From: "Pavel V. Shatov (Meister)" Date: Thu, 23 Jan 2020 13:08:59 +0300 Subject: Reworked core selector generation script. The core selector is now multi-cycle (see /core/platform/alpha commit 35359243a63cac4a9e8cce6bd718f17756ce8a98 message for more details). In short, for write operations, every core now has its own copy of chip select, address and write data registers. For read operations we should never ever need the combinational readback multiplexer again: it just won't meet timing with so many complex cores. Cores with combinational outputs, primarily those that don't have block memory inside, always have additional output registers. Moreover, the readback multiplexer is now registered too; this is required to get the multicycle constraint to work properly (again, refer to the aforementioned commit message). --- config/core_config.py | 341 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 222 insertions(+), 119 deletions(-) diff --git a/config/core_config.py b/config/core_config.py index 4033279..61e77d5 100755 --- a/config/core_config.py +++ b/config/core_config.py @@ -5,7 +5,7 @@ Generate core_selector.v and core_vfiles.mk for a set of cores. """ #======================================================================= -# Copyright (c) 2015-2017, NORDUnet A/S All rights reserved. +# Copyright (c) 2015-2017, 2019 NORDUnet A/S All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -34,21 +34,6 @@ Generate core_selector.v and core_vfiles.mk for a set of cores. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#======================================================================= -# The modexpa7 core drags in a one clock cycle delay to other cores, -# to compensate for the extra clock cycle consumed by the block -# memories used in the modexpa7 core. We probably want a general -# solution for this, because we're going to run into this problem for -# any core that handles arguments big enough to require block memory. - -# To Do: -# -# - Consider automating the one-clock-cycle delay stuff by adding -# another boolean flag to the config file. Default would be no -# delay, if any included core sets the "I use block memories" flag, -# all other cores would get the delay. Slightly tedious but -# something we can calculate easily enough, and probably an -# improvement over wiring in the delay when nothing needs it. - def main(): """ Parse arguments and config file, generate core list, generate output. @@ -82,7 +67,7 @@ def main(): Core.modexp = cfg.get(board_section, "modexp") if Core.extra_wires: # restore formatting - Core.extra_wires = Core.extra_wires.replace("\n", "\n ") + "\n" + Core.extra_wires = Core.extra_wires.replace("\n", "\n ") + "\n" if args.core: cores = args.core @@ -98,7 +83,6 @@ def main(): except ValueError: if core not in cores: cores.append(core) - cores.insert(0, "board_regs") cores.insert(1, "comm_regs") @@ -111,21 +95,41 @@ def main(): core_number = 0 for core in cores: core_number = core.assign_core_number(core_number) - + + for i, core in enumerate(cores): + core.assign_seq_number(i) + + # On the unused piece of code below: we really should not try to + # optimize out the delay. This may have worked earlier, when we only + # had a small set of simple cores. There are a lot of complex cores + # by now, so the readback multiplexer gets pretty wide and will never + # meet timing if we make it purely combinatorial. Moreover, it turns + # out that additional delays are necessary to make it work at higher + # clock speeds. 
if False: # For some reason, attempting to optimize out the delay # code entirely results in a non-working bitstream. Don't # know why, disabling the optimization works, so just do # that for now. - + Core.need_one_cycle_delay = any(core.block_memory for core in cores) + # longest core/subcore instance name + max_name_len = 0 + for core in cores: + if len(core.instance_name) > max_name_len: + max_name_len = len(core.instance_name) + for subcore in core.subcores: + if len(subcore.instance_name) > max_name_len: + max_name_len = len(subcore.instance_name) + args.verilog.write(createModule_template.format( + core_count = len(cores), core = cores[0], - addrs = "".join(core.createAddr() for core in cores), - insts = "".join(core.createInstance() for core in cores), - muxes = "".join(core.createMux() for core in cores) )) + addrs = "".join(core.createAddr(max_name_len) for core in cores), + insts = "".join(core.createInstance() for core in cores), + muxes = "".join(core.createMux() for core in cores) )) args.makefile.write(listVfiles_template.format( vfiles = "".join(core.listVfiles() for core in cores))) @@ -193,6 +197,7 @@ class Core(object): self.name = name self.cfg_section = "core " + name self.core_number = None + self.seq_number = None self.vfiles = [] self.error_wire = True self.block_memory = False @@ -211,6 +216,9 @@ class Core(object): subcore.assign_core_number(n + i + 1) return n + self.blocks + def assign_seq_number(self, n): + self.seq_number = n + def configure(self, cfg): if self.instance_number == 0: self.vfiles.extend(cfg.getvalues(self.cfg_section, "vfiles")) @@ -221,7 +229,7 @@ class Core(object): self.block_memory = cfg.getboolean(self.cfg_section, "block memory", self.block_memory) self.extra_ports = cfg.get(self.cfg_section, "extra ports") if self.extra_ports: - self.extra_ports = self.extra_ports.replace("\n", "\n ") + "\n" + self.extra_ports = self.extra_ports.replace("\n", "\n ") + "\n" self.blocks = int(cfg.get(self.cfg_section, "core blocks") 
or 1) self.block_max = self.blocks - 1 if self.blocks > 1: @@ -257,27 +265,43 @@ class Core(object): @property def error_wire_decl(self): - return "\n wire error_{core.instance_name};".format(core = self) if self.error_wire else "" + return "\n wire error_{core.instance_name};".format(core = self) if self.error_wire else "" @property def error_port(self): - return ",\n .error(error_{core.instance_name})".format(core = self) if self.error_wire else "" + return ",\n .error(error_{core.instance_name})".format(core = self) if self.error_wire else "" @property def one_cycle_delay(self): return one_cycle_delay_template.format(core = self) if self.need_one_cycle_delay and not self.block_memory else "" + @property + def extra_pipeline_stage(self): + return extra_pipeline_stage_template.format(core = self) + @property def mux_core_addr(self): if self.blocks == 1 or self.subcores: return "CORE_ADDR_{core.upper_instance_name}".format(core=self) else: - return ",\n ".join("CORE_ADDR_{core.upper_instance_name} + {0}".format(i, core=self) for i in range(self.blocks)) + return ",\n ".join("CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{0:04X}".format(i, core=self) for i in range(self.blocks)) @property - def mux_data_reg(self): - return "read_data_" + self.instance_name + ("_reg" if self.need_one_cycle_delay and not self.block_memory else "") + def reg_data_out(self): + return "reg_read_data_" + self.instance_name + + @property + def comb_data_out(self): + return "comb_read_data_" + self.instance_name + + @property + def wire_data_out(self): + return self.comb_data_out if self.need_one_cycle_delay and not self.block_memory else self.reg_data_out + @property + def pipe_data_out(self): + return "pipe_read_data_" + self.instance_name + @property def mux_error_reg(self): return "error_" + self.instance_name if self.error_wire else "0" @@ -293,10 +317,10 @@ class Core(object): template = createInstance_template_dummy if self.dummy else createInstance_template_generic if 
self.blocks == 1 else createInstance_template_multi_block return template.format(core = self) - def createAddr(self): + def createAddr(self, max_name_len): if self.dummy: return "" - return createAddr_template.format(core = self) + "".join(subcore.createAddr() for subcore in self.subcores) + return createAddr_template.format(core = self, name_pad = max_name_len) + "".join(subcore.createAddr(max_name_len) for subcore in self.subcores) def createMux(self): if self.dummy: @@ -328,32 +352,44 @@ class SubCore(Core): # Template used by .createAddr() methods. createAddr_template = """\ - localparam CORE_ADDR_{core.upper_instance_name:21s} = {core.addr_width}'h{core.core_number:02x}; + localparam CORE_ADDR_{core.upper_instance_name:{name_pad}s} = {core.addr_width}'h{core.core_number:02x}; """ # Template used by Core.createInstance(). createInstance_template_generic = """\ - //---------------------------------------------------------------- - // {core.upper_instance_name} - //---------------------------------------------------------------- - wire enable_{core.instance_name} = (addr_core_num == CORE_ADDR_{core.upper_instance_name}); - wire [31: 0] read_data_{core.instance_name};{core.error_wire_decl} - - {core.module_name} {core.parameters}{core.instance_name}_inst - ( - .clk(sys_clk), - .{core.reset_name}(sys_rst_n), + //---------------------------------------------------------------- + // {core.upper_instance_name} + //---------------------------------------------------------------- + wire enable_{core.instance_name} = (addr_core_num == CORE_ADDR_{core.upper_instance_name}); + wire [31: 0] {core.wire_data_out};{core.error_wire_decl} + + reg select_{core.instance_name} = 1'b0; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg write_{core.instance_name} = 1'b0; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [31: 0] write_data_{core.instance_name}; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [ 7: 0] 
addr_{core.instance_name}; + + always @(posedge sys_clk) begin + select_{core.instance_name} <= enable_{core.instance_name} && sys_{core.bus_name}_cs; + write_{core.instance_name} <= sys_{core.bus_name}_wr; + write_data_{core.instance_name} <= sys_write_data; + addr_{core.instance_name} <= addr_core_reg; + end + + {core.module_name} {core.parameters}{core.instance_name}_inst + ( + .clk(sys_clk), + .{core.reset_name}(sys_rst_n_fanout[{core.seq_number}]), {core.extra_ports} - .cs(enable_{core.instance_name} & (sys_{core.bus_name}_rd | sys_{core.bus_name}_wr)), - .we(sys_{core.bus_name}_wr), - - .address(addr_core_reg), - .write_data(sys_write_data), - .read_data(read_data_{core.instance_name}){core.error_port} - ); + .cs(select_{core.instance_name}), + .we(write_{core.instance_name}), + .address(addr_{core.instance_name}), + .write_data(write_data_{core.instance_name}), + .read_data({core.wire_data_out}){core.error_port} + ); {core.one_cycle_delay} +{core.extra_pipeline_stage} """ @@ -361,27 +397,39 @@ createInstance_template_generic = """\ # enough from the base template that it's easier to make this separate. 
createInstance_template_multi_block = """\ - //---------------------------------------------------------------- - // {core.upper_instance_name} - //---------------------------------------------------------------- - wire enable_{core.instance_name} = (addr_core_num >= CORE_ADDR_{core.upper_instance_name}) && (addr_core_num <= CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{core.block_max:02x}); - wire [31: 0] read_data_{core.instance_name};{core.error_wire_decl} - wire [{core.block_bit_max}:0] {core.instance_name}_prefix = addr_core_num[{core.block_bit_max}:0] - CORE_ADDR_{core.upper_instance_name}; - - {core.module_name} {core.parameters}{core.instance_name}_inst - ( - .clk(sys_clk), - .{core.reset_name}(sys_rst_n), + //---------------------------------------------------------------- + // {core.upper_instance_name} + //---------------------------------------------------------------- + wire enable_{core.instance_name} = (addr_core_num >= CORE_ADDR_{core.upper_instance_name}) && (addr_core_num <= (CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{core.block_max:02x})); + wire [31: 0] {core.wire_data_out};{core.error_wire_decl} + wire [{core.block_bit_max:>2}: 0] prefix_{core.instance_name} = addr_core_num[{core.block_bit_max}:0] - CORE_ADDR_{core.upper_instance_name}[{core.block_bit_max}:0]; + + reg select_{core.instance_name} = 1'b0; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg write_{core.instance_name} = 1'b0; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [ 31: 0] write_data_{core.instance_name}; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.block_bits}+7: 0] addr_{core.instance_name}; + + always @(posedge sys_clk) begin + select_{core.instance_name} <= enable_{core.instance_name} && sys_{core.bus_name}_cs; + write_{core.instance_name} <= sys_{core.bus_name}_wr; + write_data_{core.instance_name} <= sys_write_data; + addr_{core.instance_name} <= 
{{prefix_{core.instance_name}, addr_core_reg}}; + end + + {core.module_name} {core.parameters}{core.instance_name}_inst + ( + .clk(sys_clk), + .{core.reset_name}(sys_rst_n_fanout[{core.seq_number}]), {core.extra_ports} - .cs(enable_{core.instance_name} & (sys_{core.bus_name}_rd | sys_{core.bus_name}_wr)), - .we(sys_{core.bus_name}_wr), - - .address({{{core.instance_name}_prefix, addr_core_reg}}), - .write_data(sys_write_data), - .read_data(read_data_{core.instance_name}){core.error_port} - ); + .cs(select_{core.instance_name}), + .we(write_{core.instance_name}), + .address(addr_{core.instance_name}), + .write_data(write_data_{core.instance_name}), + .read_data({core.wire_data_out}){core.error_port} + ); {core.one_cycle_delay} +{core.extra_pipeline_stage} """ @@ -395,19 +443,28 @@ createInstance_template_dummy = """\ # Template for one-cycle delay code. one_cycle_delay_template = """\ - reg [31: 0] read_data_{core.instance_name}_reg; - always @(posedge sys_clk) - read_data_{core.instance_name}_reg <= read_data_{core.instance_name}; + (* SHREG_EXTRACT="NO" *) + reg [31: 0] {core.reg_data_out}; + always @(posedge sys_clk) + {core.reg_data_out} <= {core.wire_data_out}; +""" + +# Template for an extra delay cycle code. + +extra_pipeline_stage_template = """\ + (* SHREG_EXTRACT="NO" *) + reg [31: 0] {core.pipe_data_out}; + always @(posedge sys_clk) + {core.pipe_data_out} <= {core.reg_data_out}; """ # Template for .createMux() methods. createMux_template = """\ - {core.mux_core_addr}: - begin - sys_read_data_mux = {core0.mux_data_reg}; - sys_error_mux = {core0.mux_error_reg}; - end + {core.mux_core_addr}: begin + sys_read_data_mux <= {core0.pipe_data_out}; + sys_error_mux <= {core0.mux_error_reg}; + end """ # Top-level (createModule) template. @@ -416,56 +473,102 @@ createModule_template = """\ // NOTE: This file is generated; do not edit. 
module core_selector - ( - input wire sys_clk, - input wire sys_rst_n, - - input wire [{core.bus_max}: 0] sys_{core.bus_name}_addr, - input wire sys_{core.bus_name}_wr, - input wire sys_{core.bus_name}_rd, - output wire [31: 0] sys_read_data, - input wire [31: 0] sys_write_data, - output wire sys_error, -{core.extra_wires} - input wire noise, - output wire [7 : 0] debug - ); - - - //---------------------------------------------------------------- - // Address Decoder - //---------------------------------------------------------------- - // upper {core.addr_width} bits specify core being addressed - wire [{core.addr_max:>2}: 0] addr_core_num = sys_{core.bus_name}_addr[{core.bus_max}: 8]; - // lower 8 bits specify register offset in core - wire [ 7: 0] addr_core_reg = sys_{core.bus_name}_addr[ 7: 0]; +( + input wire sys_clk, + input wire sys_rst_n, + + input wire [{core.bus_max}: 0] sys_{core.bus_name}_addr, + input wire sys_{core.bus_name}_wr, + input wire sys_{core.bus_name}_rd, + output wire [31: 0] sys_read_data, + input wire [31: 0] sys_write_data, + output wire sys_error, + {core.extra_wires} + input wire noise, + output wire [ 7 :0] debug +); + + + //---------------------------------------------------------------- + // Localized Resets Generator + //---------------------------------------------------------------- + wire [{core_count}-1:0] sys_rst_n_fanout; + reset_replicator # + ( + .SHREG_WIDTH(8), + .FANOUT_WIDTH({core_count}) + ) + reset_replicator_inst + ( + .sys_clk_in (sys_clk), + .sys_rst_n_in (sys_rst_n), + .sys_rst_n_out (sys_rst_n_fanout) + ); + + + //---------------------------------------------------------------- + // Address Decoder + //---------------------------------------------------------------- + // upper {core.addr_width} bits specify core being addressed + // lower 8 bits specify register offset in core + wire [{core.addr_max:>2}: 0] addr_core_num = sys_{core.bus_name}_addr[{core.bus_max}: 8]; + wire [ 7: 0] addr_core_reg = 
sys_{core.bus_name}_addr[ 7: 0]; + + + //---------------------------------------------------------------- + // Core Address Table + //---------------------------------------------------------------- +{addrs} - //---------------------------------------------------------------- - // Core Address Table - //---------------------------------------------------------------- -{addrs} + //---------------------------------------------------------------- + // Core Instances + //---------------------------------------------------------------- + wire sys_{core.bus_name}_cs = sys_{core.bus_name}_rd || sys_{core.bus_name}_wr; {insts} - //---------------------------------------------------------------- - // Output (Read Data) Multiplexer - //---------------------------------------------------------------- - reg [31: 0] sys_read_data_mux; - assign sys_read_data = sys_read_data_mux; - reg sys_error_mux; - assign sys_error = sys_error_mux; - always @* - - case (addr_core_num) + + //---------------------------------------------------------------- + // Output (Read Data) Multiplexer + //---------------------------------------------------------------- + (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly1 = 1'b0; + (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly2 = 1'b0; + (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly3 = 1'b0; + + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly1; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly2; + (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly3; + + always @(posedge sys_clk) begin + sys_{core.bus_name}_cs_dly1 <= sys_{core.bus_name}_cs; + sys_{core.bus_name}_cs_dly2 <= sys_{core.bus_name}_cs_dly1; + sys_{core.bus_name}_cs_dly3 <= sys_{core.bus_name}_cs_dly2; + end + + always @(posedge sys_clk) begin + if (sys_{core.bus_name}_cs) 
addr_core_num_dly1 <= addr_core_num; + if (sys_{core.bus_name}_cs_dly1) addr_core_num_dly2 <= addr_core_num_dly1; + if (sys_{core.bus_name}_cs_dly2) addr_core_num_dly3 <= addr_core_num_dly2; + end + + reg [31: 0] sys_read_data_mux; + reg sys_error_mux; + + assign sys_read_data = sys_read_data_mux; + assign sys_error = sys_error_mux; + + always @(posedge sys_clk) + + if (sys_{core.bus_name}_cs_dly3) + + case (addr_core_num_dly3) {muxes} - default: - begin - sys_read_data_mux = {{32{{1'b0}}}}; - sys_error_mux = 1; - end - endcase - + default: begin + sys_read_data_mux <= {{32{{1'b0}}}}; + sys_error_mux <= 1'b1; + end + endcase endmodule -- cgit v1.2.3