aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bench/tb_core_selector.v192
-rw-r--r--config/core.cfg50
-rwxr-xr-xconfig/core_config.py341
-rw-r--r--extra/reset_replicator.v93
4 files changed, 550 insertions, 126 deletions
diff --git a/bench/tb_core_selector.v b/bench/tb_core_selector.v
new file mode 100644
index 0000000..a8a174b
--- /dev/null
+++ b/bench/tb_core_selector.v
@@ -0,0 +1,192 @@
+`timescale 1ns / 1ps
+
+module tb_core_selector;
+
+
+ //
+ // System Clock (100 MHz, toggled every half period) and System Reset (active-low, asserted from time zero)
+ //
+ `define SYS_CLK_FREQUENCY_MHZ ( 100.0 )
+ `define SYS_CLK_PERIOD_NS (1000.0 / `SYS_CLK_FREQUENCY_MHZ)
+ `define SYS_CLK_PERIOD_HALF_NS ( 0.5 * `SYS_CLK_PERIOD_NS )
+
+ reg sys_clk = 1'b0; // clock starts low
+ initial forever #`SYS_CLK_PERIOD_HALF_NS sys_clk = ~sys_clk; // free-running clock generator
+
+ `define SYS_RST_N_ACTIVE 1'b0
+ `define SYS_RST_N_INACTIVE 1'b1
+
+ reg sys_rst_n = `SYS_RST_N_ACTIVE; // released later by the test script
+
+
+ //
+ // System Bus: address, strobes and write data are registers driven by the bus tasks of this testbench;
+ // read data and the error flag are wires connected to the UUT ports of the same name
+ reg [23: 0] sys_fmc_addr;
+ reg sys_fmc_wr = 1'b0;
+ reg sys_fmc_rd = 1'b0;
+ wire [31: 0] sys_read_data;
+ reg [31: 0] sys_write_data;
+ wire sys_error; // connected to the UUT, but not checked by the script in this file
+
+
+ //
+ // UUT: the core_selector instance under test. mkm_sclk, mkm_cs_n, mkm_di and debug are left unconnected;
+ // mkm_do, core_clk and noise are tied to constant zero.
+ core_selector uut
+ (
+ .sys_clk (sys_clk),
+ .sys_rst_n (sys_rst_n),
+
+ .sys_fmc_addr (sys_fmc_addr),
+ .sys_fmc_wr (sys_fmc_wr),
+ .sys_fmc_rd (sys_fmc_rd),
+ .sys_read_data (sys_read_data),
+ .sys_write_data (sys_write_data),
+ .sys_error (sys_error),
+
+ .mkm_sclk (),
+ .mkm_cs_n (),
+ .mkm_do (1'b0),
+ .mkm_di (),
+
+ .core_clk (1'b0),
+
+ .noise (1'b0),
+ .debug ()
+ );
+
+
+ //
+ // Script
+ //
+
+ //
+ // Here's what the following routine does. We know that at address 0 there's always a BOARD_REGS core, which
+ // has a 32-bit dummy register at offset 255. ECDSA cores also have a 32-bit dummy register at offset 15.
+ // We write some values into the two dummy registers to test the address decoding logic (we write to two cores
+ // with different numbers, the offsets of registers are also different). Then we do a readback and compare
+ // the read value with the written one. The code assumes that the default "hsm" core configuration is used,
+ // where the core number of ECDSA-256 is 0x37. If this is not the case, adapt the first parameter passed to
+ // the sys_bus_write() and sys_bus_read() calls that use 16'h0037.
+ //
+
+ localparam [31:0] MAGIC_1 = 32'hCCAA5533; // test pattern for core 0x0000, register 255
+ localparam [31:0] MAGIC_2 = 32'hCA5335AC; // test pattern for core 0x0037, register 15
+
+ reg [31:0] wr, rd; // wr = value written / expected, rd = value read back
+ initial begin
+
+ wait_sys_clk_ticks(200); // keep reset asserted for 200 clock periods
+ sys_rst_n = `SYS_RST_N_INACTIVE;
+ wait_sys_clk_ticks(100); // idle for 100 clock periods after releasing reset
+
+ wr = MAGIC_1;
+ sys_bus_write(16'h0000, 8'd255, wr);
+ wait_sys_clk_ticks(10);
+
+ wr = MAGIC_2;
+ sys_bus_write(16'h0037, 8'd15, wr);
+ wait_sys_clk_ticks(10);
+
+ wr = MAGIC_1; // expected value for the first readback
+ sys_bus_read(16'h0000, 8'd255, rd);
+ wait_sys_clk_ticks(10);
+ if (rd !== wr) begin // case inequality: X/Z bits in rd also count as a mismatch
+ $display("ERROR: wr = 0x%08x, rd = 0x%08x", wr, rd);
+ wait_sys_clk_ticks(100);
+ $finish;
+ end
+
+ wr = MAGIC_2; // expected value for the second readback
+ sys_bus_read(16'h0037, 8'd15, rd);
+ wait_sys_clk_ticks(10);
+ if (rd !== wr) begin // case inequality: X/Z bits in rd also count as a mismatch
+ $display("ERROR: wr = 0x%08x, rd = 0x%08x", wr, rd);
+ wait_sys_clk_ticks(100);
+ $finish;
+ end
+
+ $display("Test passed."); // only reached when both readbacks matched
+ $finish;
+
+ end
+
+
+ //
+ // _wait_half_sys_clk_tick() - delays the caller by half a system clock period
+ //
+ task _wait_half_sys_clk_tick;
+ #`SYS_CLK_PERIOD_HALF_NS;
+ endtask
+
+
+ //
+ // wait_sys_clk_tick() - delays the caller by one full system clock period (two half-period waits)
+ //
+ task wait_sys_clk_tick;
+ begin
+ _wait_half_sys_clk_tick;
+ _wait_half_sys_clk_tick;
+ end
+ endtask
+
+
+ //
+ // wait_sys_clk_ticks() - delays the caller by _num_ticks full system clock periods
+ //
+ task wait_sys_clk_ticks;
+ input integer _num_ticks; // number of clock periods to wait; the loop body is skipped when this is <= 0
+ integer _n; // loop counter
+ for (_n=0; _n<_num_ticks; _n=_n+1)
+ wait_sys_clk_tick;
+ endtask
+
+
+ //
+ // _sys_bus_drive() - updates all four bus stimulus registers with one non-blocking concatenated assignment
+ //
+ task _sys_bus_drive;
+ input [23: 0] _addr; // value for sys_fmc_addr
+ input _wr; // value for sys_fmc_wr
+ input _rd; // value for sys_fmc_rd
+ input [31: 0] _write_data; // value for sys_write_data
+ {sys_fmc_addr, sys_fmc_wr, sys_fmc_rd, sys_write_data} <=
+ { _addr, _wr, _rd, _write_data} ;
+ endtask
+
+
+ //
+ // sys_bus_read() - one-period read strobe at address {_num, _reg}, then samples sys_read_data three periods later
+ //
+ task sys_bus_read;
+ input [15:0] _num; // upper 16 address bits (core number)
+ input [ 7:0] _reg; // lower 8 address bits (register offset)
+ output [31:0] _data; // value sampled from sys_read_data
+ begin
+ _sys_bus_drive({_num, _reg}, 1'b0, 1'b1, {32{1'bX}}); // read strobe on, write data is don't-care
+ wait_sys_clk_tick;
+ _sys_bus_drive(24'hXXXX, 1'b0, 1'b0, {32{1'bX}}); // bus idle: both strobes off, address and data X
+ wait_sys_clk_ticks(3); // NOTE(review): fixed wait, presumably sized to the read latency of core_selector - confirm if the pipeline depth changes
+ _data = sys_read_data;
+ _sys_bus_drive(24'hXXXX, 1'b0, 1'b0, {32{1'bX}}); // drives the same idle values again
+ end
+ endtask
+
+
+ //
+ // sys_bus_write() - one-period write strobe at address {_num, _reg} with _data on the write data bus
+ //
+ task sys_bus_write;
+ input [15:0] _num; // upper 16 address bits (core number)
+ input [ 7:0] _reg; // lower 8 address bits (register offset)
+ input [31:0] _data; // value to write
+ begin
+ _sys_bus_drive({_num, _reg}, 1'b1, 1'b0, _data); // write strobe on
+ wait_sys_clk_tick;
+ _sys_bus_drive(24'hXXXX, 1'b0, 1'b0, {32{1'bX}}); // bus idle: both strobes off, address and data X
+ end
+ endtask
+
+
+endmodule
diff --git a/config/core.cfg b/config/core.cfg
index 6520393..697f8bc 100644
--- a/config/core.cfg
+++ b/config/core.cfg
@@ -36,8 +36,9 @@ modexp = modexpa7
extra wires =
output wire mkm_sclk,
output wire mkm_cs_n,
- input wire mkm_do,
+ input wire mkm_do,
output wire mkm_di,
+ input wire core_clk,
requires = mkmif/dummy-mkmif
[board dev-bridge]
@@ -73,6 +74,10 @@ cores = trng
# for testing just the Modular Exponentiation
cores = modexp
+[project modexpng]
+# for testing just the ModExpNG
+cores = modexpng
+
[project mkmif]
# for testing just the Master Key Memory Interface
cores = mkmif
@@ -86,13 +91,13 @@ cores = sha256 aes trng modexp mkmif
# Make me one with everything, except we want two modexp cores for parallel CRT
cores = sha1 sha256 sha512 aes trng modexp modexp mkmif ecdsa256 ecdsa384
-[project keywrap]
-# for testing Joachim's keywrap core with RSA signing
-cores = mkmif sha256 aes trng modexp modexp ecdsa256 ecdsa384 keywrap
+[project hsm_ng]
+# Make me one with everything, except we want the new ModExpNG core
+cores = sha1 sha256 sha512 aes trng modexp modexp modexpng mkmif ecdsa256 ecdsa384
-[project keywrap]
-# for testing Joachim's keywrap core with RSA signing
-cores = mkmif sha256 aes trng modexp modexp ecdsa256 ecdsa384 keywrap
+[project hsm_ng_keywrap]
+# everything, with the full range of modexp and keywrap options
+cores = sha1 sha256 sha512 aes keywrap trng modexp modexp modexpng mkmif ecdsa256 ecdsa384
# [core] sections
#
@@ -219,6 +224,37 @@ vfiles =
lib/lowlevel/artix7/dsp48e1_wrapper.v
lib/lowlevel/artix7/dsp48e1_wrapper_modexp.v
+[core modexpng]
+# ModExpNG for Xilinx Artix-7
+core blocks = 16
+block memory = yes
+error wire = no
+module name = modexpng_wrapper
+reset name = rst_n
+extra ports =
+ .clk_core(core_clk),
+vfiles =
+ ../user/shatov/modexpng/rtl/modexpng_wrapper.v
+ ../user/shatov/modexpng/rtl/modexpng_core_top.v
+ ../user/shatov/modexpng/rtl/modexpng_general_worker.v
+ ../user/shatov/modexpng/rtl/modexpng_mmm_dual.v
+ ../user/shatov/modexpng/rtl/modexpng_reductor.v
+ ../user/shatov/modexpng/rtl/modexpng_dsp_array_block.v
+ ../user/shatov/modexpng/rtl/modexpng_io_block.v
+ ../user/shatov/modexpng/rtl/modexpng_io_manager.v
+ ../user/shatov/modexpng/rtl/modexpng_storage_block.v
+ ../user/shatov/modexpng/rtl/modexpng_storage_manager.v
+ ../user/shatov/modexpng/rtl/modexpng_uop_rom.v
+ ../user/shatov/modexpng/rtl/modexpng_uop_engine.v
+ ../user/shatov/modexpng/rtl/modexpng_recombinator_block.v
+ ../user/shatov/modexpng/rtl/modexpng_recombinator_cell.v
+ ../user/shatov/modexpng/rtl/modexpng_dsp_slice_mult_wrapper_xilinx.v
+ ../user/shatov/modexpng/rtl/modexpng_dsp_slice_addsub_wrapper_xilinx.v
+ ../user/shatov/modexpng/rtl/modexpng_sdp_36k_x18_wrapper_xilinx.v
+ ../user/shatov/modexpng/rtl/modexpng_sdp_36k_x16_x32_wrapper_xilinx.v
+ ../user/shatov/modexpng/rtl/modexpng_sdp_36k_x32_x16_wrapper_xilinx.v
+ ../user/shatov/modexpng/rtl/modexpng_tdp_36k_x16_x32_wrapper_xilinx.v
+
[core modexps6]
# ModExp for Xilinx Spartan-6
core blocks = 4
diff --git a/config/core_config.py b/config/core_config.py
index d84f8f1..d511228 100755
--- a/config/core_config.py
+++ b/config/core_config.py
@@ -5,7 +5,7 @@ Generate core_selector.v and core_vfiles.mk for a set of cores.
"""
#=======================================================================
-# Copyright (c) 2015-2017, NORDUnet A/S All rights reserved.
+# Copyright (c) 2015-2017, 2019 NORDUnet A/S All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -34,21 +34,6 @@ Generate core_selector.v and core_vfiles.mk for a set of cores.
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#=======================================================================
-# The modexpa7 core drags in a one clock cycle delay to other cores,
-# to compensate for the extra clock cycle consumed by the block
-# memories used in the modexpa7 core. We probably want a general
-# solution for this, because we're going to run into this problem for
-# any core that handles arguments big enough to require block memory.
-
-# To Do:
-#
-# - Consider automating the one-clock-cycle delay stuff by adding
-# another boolean flag to the config file. Default would be no
-# delay, if any included core sets the "I use block memories" flag,
-# all other cores would get the delay. Slightly tedious but
-# something we can calculate easily enough, and probably an
-# improvement over wiring in the delay when nothing needs it.
-
def main():
"""
Parse arguments and config file, generate core list, generate output.
@@ -82,7 +67,7 @@ def main():
Core.modexp = cfg.get(board_section, "modexp")
if Core.extra_wires:
# restore formatting
- Core.extra_wires = Core.extra_wires.replace("\n", "\n ") + "\n"
+ Core.extra_wires = Core.extra_wires.replace("\n", "\n ") + "\n"
if args.core:
cores = args.core
@@ -98,7 +83,6 @@ def main():
except ValueError:
if core not in cores:
cores.append(core)
-
cores.insert(0, "board_regs")
cores.insert(1, "comm_regs")
@@ -111,21 +95,41 @@ def main():
core_number = 0
for core in cores:
core_number = core.assign_core_number(core_number)
-
+
+ for i, core in enumerate(cores):
+ core.assign_seq_number(i)
+
+ # On the unused piece of code below: we really should not try to
+ # optimize out the delay. This may have worked earlier, when we only
+ # had a small set of simple cores. There are a lot of complex cores
+ # by now, so the readback multiplexer gets pretty wide and will never
+ # meet timing if we make it purely combinatorial. Moreover, it turns
+ # out that additional delays are necessary to make it work at higher
+ # clock speeds.
if False:
# For some reason, attempting to optimize out the delay
# code entirely results in a non-working bitstream. Don't
# know why, disabling the optimization works, so just do
# that for now.
-
+
Core.need_one_cycle_delay = any(core.block_memory for core in cores)
+ # longest core/subcore instance name
+ max_name_len = 0
+ for core in cores:
+ if len(core.instance_name) > max_name_len:
+ max_name_len = len(core.instance_name)
+ for subcore in core.subcores:
+ if len(subcore.instance_name) > max_name_len:
+ max_name_len = len(subcore.instance_name)
+
args.verilog.write(createModule_template.format(
+ core_count = len(cores),
core = cores[0],
- addrs = "".join(core.createAddr() for core in cores),
- insts = "".join(core.createInstance() for core in cores),
- muxes = "".join(core.createMux() for core in cores) ))
+ addrs = "".join(core.createAddr(max_name_len) for core in cores),
+ insts = "".join(core.createInstance() for core in cores),
+ muxes = "".join(core.createMux() for core in cores) ))
args.makefile.write(listVfiles_template.format(
vfiles = "".join(core.listVfiles() for core in cores)))
@@ -193,6 +197,7 @@ class Core(object):
self.name = name
self.cfg_section = "core " + name
self.core_number = None
+ self.seq_number = None
self.vfiles = []
self.error_wire = True
self.block_memory = False
@@ -211,6 +216,9 @@ class Core(object):
subcore.assign_core_number(n + i + 1)
return n + self.blocks
+ def assign_seq_number(self, n):
+ self.seq_number = n
+
def configure(self, cfg):
if self.instance_number == 0:
self.vfiles.extend(cfg.getvalues(self.cfg_section, "vfiles"))
@@ -221,7 +229,7 @@ class Core(object):
self.block_memory = cfg.getboolean(self.cfg_section, "block memory", self.block_memory)
self.extra_ports = cfg.get(self.cfg_section, "extra ports")
if self.extra_ports:
- self.extra_ports = self.extra_ports.replace("\n", "\n ") + "\n"
+ self.extra_ports = self.extra_ports.replace("\n", "\n ") + "\n"
self.blocks = int(cfg.get(self.cfg_section, "core blocks") or 1)
self.block_max = self.blocks - 1
if self.blocks > 1:
@@ -257,28 +265,44 @@ class Core(object):
@property
def error_wire_decl(self):
- return "\n wire error_{core.instance_name};".format(core = self) if self.error_wire else ""
+ return "\n wire error_{core.instance_name};".format(core = self) if self.error_wire else ""
@property
def error_port(self):
- return ",\n .error(error_{core.instance_name})".format(core = self) if self.error_wire else ""
+ return ",\n .error(error_{core.instance_name})".format(core = self) if self.error_wire else ""
@property
def one_cycle_delay(self):
return one_cycle_delay_template.format(core = self) if self.need_one_cycle_delay and not self.block_memory else ""
@property
+ def extra_pipeline_stage(self):
+ return extra_pipeline_stage_template.format(core = self)
+
+ @property
def mux_core_addr(self):
if self.blocks == 1 or self.subcores:
return "CORE_ADDR_{core.upper_instance_name}".format(core=self)
else:
- return ",\n ".join("CORE_ADDR_{core.upper_instance_name} + {0}".format(i, core=self) for i in range(self.blocks))
+ return ",\n ".join("CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{0:04X}".format(i, core=self) for i in range(self.blocks))
@property
- def mux_data_reg(self):
- return "read_data_" + self.instance_name + ("_reg" if self.need_one_cycle_delay and not self.block_memory else "")
+ def reg_data_out(self):
+ return "reg_read_data_" + self.instance_name
+
+ @property
+ def comb_data_out(self):
+ return "comb_read_data_" + self.instance_name
+
+ @property
+ def wire_data_out(self):
+ return self.comb_data_out if self.need_one_cycle_delay and not self.block_memory else self.reg_data_out
@property
+ def pipe_data_out(self):
+ return "pipe_read_data_" + self.instance_name
+
+ @property
def mux_error_reg(self):
return "error_" + self.instance_name if self.error_wire else "0"
@@ -293,10 +317,10 @@ class Core(object):
template = createInstance_template_dummy if self.dummy else createInstance_template_generic if self.blocks == 1 else createInstance_template_multi_block
return template.format(core = self)
- def createAddr(self):
+ def createAddr(self, max_name_len):
if self.dummy:
return ""
- return createAddr_template.format(core = self) + "".join(subcore.createAddr() for subcore in self.subcores)
+ return createAddr_template.format(core = self, name_pad = max_name_len) + "".join(subcore.createAddr(max_name_len) for subcore in self.subcores)
def createMux(self):
if self.dummy:
@@ -328,32 +352,44 @@ class SubCore(Core):
# Template used by .createAddr() methods.
createAddr_template = """\
- localparam CORE_ADDR_{core.upper_instance_name:21s} = {core.addr_width}'h{core.core_number:02x};
+ localparam CORE_ADDR_{core.upper_instance_name:{name_pad}s} = {core.addr_width}'h{core.core_number:02x};
"""
# Template used by Core.createInstance().
createInstance_template_generic = """\
- //----------------------------------------------------------------
- // {core.upper_instance_name}
- //----------------------------------------------------------------
- wire enable_{core.instance_name} = (addr_core_num == CORE_ADDR_{core.upper_instance_name});
- wire [31: 0] read_data_{core.instance_name};{core.error_wire_decl}
-
- {core.module_name} {core.parameters}{core.instance_name}_inst
- (
- .clk(sys_clk),
- .{core.reset_name}(sys_rst_n),
+ //----------------------------------------------------------------
+ // {core.upper_instance_name}
+ //----------------------------------------------------------------
+ wire enable_{core.instance_name} = (addr_core_num == CORE_ADDR_{core.upper_instance_name});
+ wire [31: 0] {core.wire_data_out};{core.error_wire_decl}
+
+ reg select_{core.instance_name} = 1'b0;
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg write_{core.instance_name} = 1'b0;
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [31: 0] write_data_{core.instance_name};
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [ 7: 0] addr_{core.instance_name};
+
+ always @(posedge sys_clk) begin
+ select_{core.instance_name} <= enable_{core.instance_name} && sys_{core.bus_name}_cs;
+ write_{core.instance_name} <= sys_{core.bus_name}_wr;
+ write_data_{core.instance_name} <= sys_write_data;
+ addr_{core.instance_name} <= addr_core_reg;
+ end
+
+ {core.module_name} {core.parameters}{core.instance_name}_inst
+ (
+ .clk(sys_clk),
+ .{core.reset_name}(sys_rst_n_fanout[{core.seq_number}]),
{core.extra_ports}
- .cs(enable_{core.instance_name} & (sys_{core.bus_name}_rd | sys_{core.bus_name}_wr)),
- .we(sys_{core.bus_name}_wr),
-
- .address(addr_core_reg),
- .write_data(sys_write_data),
- .read_data(read_data_{core.instance_name}){core.error_port}
- );
+ .cs(select_{core.instance_name}),
+ .we(write_{core.instance_name}),
+ .address(addr_{core.instance_name}),
+ .write_data(write_data_{core.instance_name}),
+ .read_data({core.wire_data_out}){core.error_port}
+ );
{core.one_cycle_delay}
+{core.extra_pipeline_stage}
"""
@@ -361,27 +397,39 @@ createInstance_template_generic = """\
# enough from the base template that it's easier to make this separate.
createInstance_template_multi_block = """\
- //----------------------------------------------------------------
- // {core.upper_instance_name}
- //----------------------------------------------------------------
- wire enable_{core.instance_name} = (addr_core_num >= CORE_ADDR_{core.upper_instance_name}) && (addr_core_num <= CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{core.block_max:02x});
- wire [31: 0] read_data_{core.instance_name};{core.error_wire_decl}
- wire [{core.block_bit_max}:0] {core.instance_name}_prefix = addr_core_num[{core.block_bit_max}:0] - CORE_ADDR_{core.upper_instance_name};
-
- {core.module_name} {core.parameters}{core.instance_name}_inst
- (
- .clk(sys_clk),
- .{core.reset_name}(sys_rst_n),
+ //----------------------------------------------------------------
+ // {core.upper_instance_name}
+ //----------------------------------------------------------------
+ wire enable_{core.instance_name} = (addr_core_num >= CORE_ADDR_{core.upper_instance_name}) && (addr_core_num <= (CORE_ADDR_{core.upper_instance_name} + {core.addr_width}'h{core.block_max:02x}));
+ wire [31: 0] {core.wire_data_out};{core.error_wire_decl}
+ wire [{core.block_bit_max:>2}: 0] prefix_{core.instance_name} = addr_core_num[{core.block_bit_max}:0] - CORE_ADDR_{core.upper_instance_name}[{core.block_bit_max}:0];
+
+ reg select_{core.instance_name} = 1'b0;
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg write_{core.instance_name} = 1'b0;
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [ 31: 0] write_data_{core.instance_name};
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.block_bits}+7: 0] addr_{core.instance_name};
+
+ always @(posedge sys_clk) begin
+ select_{core.instance_name} <= enable_{core.instance_name} && sys_{core.bus_name}_cs;
+ write_{core.instance_name} <= sys_{core.bus_name}_wr;
+ write_data_{core.instance_name} <= sys_write_data;
+ addr_{core.instance_name} <= {{prefix_{core.instance_name}, addr_core_reg}};
+ end
+
+ {core.module_name} {core.parameters}{core.instance_name}_inst
+ (
+ .clk(sys_clk),
+ .{core.reset_name}(sys_rst_n_fanout[{core.seq_number}]),
{core.extra_ports}
- .cs(enable_{core.instance_name} & (sys_{core.bus_name}_rd | sys_{core.bus_name}_wr)),
- .we(sys_{core.bus_name}_wr),
-
- .address({{{core.instance_name}_prefix, addr_core_reg}}),
- .write_data(sys_write_data),
- .read_data(read_data_{core.instance_name}){core.error_port}
- );
+ .cs(select_{core.instance_name}),
+ .we(write_{core.instance_name}),
+ .address(addr_{core.instance_name}),
+ .write_data(write_data_{core.instance_name}),
+ .read_data({core.wire_data_out}){core.error_port}
+ );
{core.one_cycle_delay}
+{core.extra_pipeline_stage}
"""
@@ -395,19 +443,28 @@ createInstance_template_dummy = """\
# Template for one-cycle delay code.
one_cycle_delay_template = """\
- reg [31: 0] read_data_{core.instance_name}_reg;
- always @(posedge sys_clk)
- read_data_{core.instance_name}_reg <= read_data_{core.instance_name};
+ (* SHREG_EXTRACT="NO" *)
+ reg [31: 0] {core.reg_data_out};
+ always @(posedge sys_clk)
+ {core.reg_data_out} <= {core.wire_data_out};
+"""
+
+# Template for an extra delay cycle code.
+
+extra_pipeline_stage_template = """\
+ (* SHREG_EXTRACT="NO" *)
+ reg [31: 0] {core.pipe_data_out};
+ always @(posedge sys_clk)
+ {core.pipe_data_out} <= {core.reg_data_out};
"""
# Template for .createMux() methods.
createMux_template = """\
- {core.mux_core_addr}:
- begin
- sys_read_data_mux = {core0.mux_data_reg};
- sys_error_mux = {core0.mux_error_reg};
- end
+ {core.mux_core_addr}: begin
+ sys_read_data_mux <= {core0.pipe_data_out};
+ sys_error_mux <= {core0.mux_error_reg};
+ end
"""
# Top-level (createModule) template.
@@ -416,56 +473,102 @@ createModule_template = """\
// NOTE: This file is generated; do not edit.
module core_selector
- (
- input wire sys_clk,
- input wire sys_rst_n,
-
- input wire [{core.bus_max}: 0] sys_{core.bus_name}_addr,
- input wire sys_{core.bus_name}_wr,
- input wire sys_{core.bus_name}_rd,
- output wire [31: 0] sys_read_data,
- input wire [31: 0] sys_write_data,
- output wire sys_error,
-{core.extra_wires}
- input wire noise,
- output wire [7 : 0] debug
- );
-
-
- //----------------------------------------------------------------
- // Address Decoder
- //----------------------------------------------------------------
- // upper {core.addr_width} bits specify core being addressed
- wire [{core.addr_max:>2}: 0] addr_core_num = sys_{core.bus_name}_addr[{core.bus_max}: 8];
- // lower 8 bits specify register offset in core
- wire [ 7: 0] addr_core_reg = sys_{core.bus_name}_addr[ 7: 0];
+(
+ input wire sys_clk,
+ input wire sys_rst_n,
+
+ input wire [{core.bus_max}: 0] sys_{core.bus_name}_addr,
+ input wire sys_{core.bus_name}_wr,
+ input wire sys_{core.bus_name}_rd,
+ output wire [31: 0] sys_read_data,
+ input wire [31: 0] sys_write_data,
+ output wire sys_error,
+ {core.extra_wires}
+ input wire noise,
+ output wire [ 7 :0] debug
+);
+
+
+ //----------------------------------------------------------------
+ // Localized Resets Generator
+ //----------------------------------------------------------------
+ wire [{core_count}-1:0] sys_rst_n_fanout;
+ reset_replicator #
+ (
+ .SHREG_WIDTH(8),
+ .FANOUT_WIDTH({core_count})
+ )
+ reset_replicator_inst
+ (
+ .sys_clk_in (sys_clk),
+ .sys_rst_n_in (sys_rst_n),
+ .sys_rst_n_out (sys_rst_n_fanout)
+ );
+
+
+ //----------------------------------------------------------------
+ // Address Decoder
+ //----------------------------------------------------------------
+ // upper {core.addr_width} bits specify core being addressed
+ // lower 8 bits specify register offset in core
+ wire [{core.addr_max:>2}: 0] addr_core_num = sys_{core.bus_name}_addr[{core.bus_max}: 8];
+ wire [ 7: 0] addr_core_reg = sys_{core.bus_name}_addr[ 7: 0];
+
+
+ //----------------------------------------------------------------
+ // Core Address Table
+ //----------------------------------------------------------------
+{addrs}
- //----------------------------------------------------------------
- // Core Address Table
- //----------------------------------------------------------------
-{addrs}
+ //----------------------------------------------------------------
+ // Core Instances
+ //----------------------------------------------------------------
+ wire sys_{core.bus_name}_cs = sys_{core.bus_name}_rd || sys_{core.bus_name}_wr;
{insts}
- //----------------------------------------------------------------
- // Output (Read Data) Multiplexer
- //----------------------------------------------------------------
- reg [31: 0] sys_read_data_mux;
- assign sys_read_data = sys_read_data_mux;
- reg sys_error_mux;
- assign sys_error = sys_error_mux;
- always @*
-
- case (addr_core_num)
+
+ //----------------------------------------------------------------
+ // Output (Read Data) Multiplexer
+ //----------------------------------------------------------------
+ (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly1 = 1'b0;
+ (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly2 = 1'b0;
+ (* SHREG_EXTRACT="NO" *) reg sys_{core.bus_name}_cs_dly3 = 1'b0;
+
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly1;
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly2;
+ (* SHREG_EXTRACT="NO" *) (* EQUIVALENT_REGISTER_REMOVAL="NO" *) reg [{core.addr_max:>2}: 0] addr_core_num_dly3;
+
+ always @(posedge sys_clk) begin
+ sys_{core.bus_name}_cs_dly1 <= sys_{core.bus_name}_cs;
+ sys_{core.bus_name}_cs_dly2 <= sys_{core.bus_name}_cs_dly1;
+ sys_{core.bus_name}_cs_dly3 <= sys_{core.bus_name}_cs_dly2;
+ end
+
+ always @(posedge sys_clk) begin
+ if (sys_{core.bus_name}_cs) addr_core_num_dly1 <= addr_core_num;
+ if (sys_{core.bus_name}_cs_dly1) addr_core_num_dly2 <= addr_core_num_dly1;
+ if (sys_{core.bus_name}_cs_dly2) addr_core_num_dly3 <= addr_core_num_dly2;
+ end
+
+ reg [31: 0] sys_read_data_mux;
+ reg sys_error_mux;
+
+ assign sys_read_data = sys_read_data_mux;
+ assign sys_error = sys_error_mux;
+
+ always @(posedge sys_clk)
+
+ if (sys_{core.bus_name}_cs_dly3)
+
+ case (addr_core_num_dly3)
{muxes}
- default:
- begin
- sys_read_data_mux = {{32{{1'b0}}}};
- sys_error_mux = 1;
- end
- endcase
-
+ default: begin
+ sys_read_data_mux <= {{32{{1'b0}}}};
+ sys_error_mux <= 1'b1;
+ end
+ endcase
endmodule
diff --git a/extra/reset_replicator.v b/extra/reset_replicator.v
new file mode 100644
index 0000000..ccb704b
--- /dev/null
+++ b/extra/reset_replicator.v
@@ -0,0 +1,93 @@
+//======================================================================
+//
+// reset_replicator.v
+// ------------------
+//
+// Generates localized copies of the system-wide reset so that each core can
+// have its own copy. This way there's more room for the placer to do its job.
+//
+// Author: Pavel Shatov
+// Copyright (c) 2016, 2018-2019 NORDUnet A/S All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// - Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// - Neither the name of the NORDUnet nor the names of its contributors may
+// be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//======================================================================
+
+module reset_replicator
+(
+ sys_clk_in,
+ sys_rst_n_in,
+ sys_rst_n_out
+);
+
+ //
+ // Parameters: SHREG_WIDTH is the length of each per-copy shift register, FANOUT_WIDTH the number of reset copies.
+ // NOTE(review): SHREG_WIDTH-2 is used as a slice bound below, so values below 2 look unsupported - confirm.
+ parameter integer SHREG_WIDTH = 8;
+ parameter integer FANOUT_WIDTH = 8;
+
+ //
+ // Ports: clock, active-low reset input, and one active-low reset output bit per copy
+ //
+ input sys_clk_in;
+ input sys_rst_n_in;
+ output [FANOUT_WIDTH-1:0] sys_rst_n_out;
+
+ //
+ // Internals: per-copy reset used as the active-high asynchronous clear of the shift registers
+ //
+ wire [FANOUT_WIDTH-1:0] sys_rst_int;
+
+ //
+ // Localized Reset Replication: every loop iteration builds an independent LUT + shift register pair
+ //
+ genvar i;
+ generate for (i=0; i<FANOUT_WIDTH; i=i+1)
+ //
+ begin : gen_sys_rst_n_out
+ // LUT1 is not defined in this file; presumably the Xilinx primitive, where INIT=2'b01 acts as an inverter - confirm
+ LUT1 #(.INIT(2'b01)) LUT1_inst
+ ( .I0(sys_rst_n_in),
+ .O(sys_rst_int[i])
+ );
+ // synthesis attributes on the register below; its initial value is all zeroes, i.e. the output starts asserted
+ (* SHREG_EXTRACT="NO" *)
+ (* EQUIVALENT_REGISTER_REMOVAL="NO" *)
+ reg [SHREG_WIDTH-1:0] sys_rst_n_shreg_copy = {SHREG_WIDTH{1'b0}};
+ // asynchronous clear while sys_rst_int[i] is high, otherwise shift a 1 in at the LSB on every clock edge
+ always @(posedge sys_clk_in or posedge sys_rst_int[i])
+ //
+ if (sys_rst_int[i]) sys_rst_n_shreg_copy <= {SHREG_WIDTH{1'b0}};
+ else sys_rst_n_shreg_copy <= {sys_rst_n_shreg_copy[SHREG_WIDTH-2:0], 1'b1};
+ // the MSB is the output, so it goes high only after SHREG_WIDTH clock edges without a clear
+ assign sys_rst_n_out[i] = sys_rst_n_shreg_copy[SHREG_WIDTH-1];
+ //
+ end
+ //
+ endgenerate
+
+endmodule