path: root/modexpng_fpga_model.py



#!/usr/bin/python3
#
#
# ModExpNG core math model.
#
#
# Copyright (c) 2019, NORDUnet A/S
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# - Neither the name of the NORDUnet nor the names of its contributors may
#   be used to endorse or promote products derived from this software
#   without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#


# -------
# Imports
# -------

import sys
import importlib
from enum import Enum, auto


# --------------
# Model Settings
# --------------

# public key length in bits
KEY_LENGTH = 1024

# number of multipliers operating in parallel
NUM_MULTS = 8


# ---------------
# Internal Values
# ---------------

# half of the key length
_KEY_LENGTH_HALF = KEY_LENGTH // 2

# word widths of the internal math pipeline: regular and extended
_WORD_WIDTH = 16
_WORD_WIDTH_EXT = 18

# bit masks: regular word, extended word, and the extension bits that
# lie above a regular word
_WORD_MASK = (1 << _WORD_WIDTH) - 1
_WORD_MASK_EXT = (1 << _WORD_WIDTH_EXT) - 1
_CARRY_MASK = _WORD_MASK_EXT & ~_WORD_MASK

# folder with test vector scripts (appended to the script directory)
_VECTOR_PATH = "/vector"

# name of the class to load from a test vector script
_VECTOR_CLASS = "Vector"


# ------------------
# Debugging Settings
# ------------------

# ladder step at which to print the debug vector
DUMP_LADDER_INDEX = -1

# print entire debug vector components
DUMP_VECTORS = False

# print indices of words at MAC inputs
DUMP_INDICES = False

# print MAC input words
DUMP_MACS_INPUTS = False

# print MAC clearing bitmaps
DUMP_MACS_CLEARING = False

# print MAC accumulator contents
DUMP_MACS_ACCUMULATION = False

# print multiplication output parts
DUMP_MULT_PARTS = False

# print recombination internals
DUMP_RECOMBINATION = False

# print reduction internals
DUMP_REDUCTION = False

# force the rarely seen internal overflow situation to verify how its
# handler works
FORCE_OVERFLOW = False

# update the progress indicator once per this many ladder steps
DUMP_PROGRESS_FACTOR = 16

#
# Multi-Precision Integer
#
class ModExpNG_Operand():
    """Multi-precision integer stored as a list of words, least significant first.

    An operand is created either from an integer (`number`, with `length`
    given in bits) or from a ready-made sequence of words (`words`, with
    `length` given as the number of words; `number` is ignored then).
    Words given directly may be up to _WORD_WIDTH_EXT bits wide.
    """

    def __init__(self, number, length, words=None):
        """Build the operand from `number` or, when given, from `words`.

        Raises Exception when `length` does not fit the chosen input form or
        when the value does not fit into the requested size.
        """

        if words is None:

            # length must be divisible by word width
            if (length % _WORD_WIDTH) > 0:
                raise Exception("Bad number length!")

            self._init_from_number(number, length)

        else:

            # length must match words count
            if len(words) != length:
                raise Exception("Bad words count!")

            self._init_from_words(words, length)

    def format_verilog_concat(self, name):
        """Print the words as Verilog assignments to `name[i]`, four per line."""

        for i in range(len(self.words)):
            if i > 0:
                if (i % 4) == 0: print("")
                else:            print(" ", end='')
            print("%s[%3d] = 18'h%05x;" % (name, i, self.words[i]), end='')
        print("")

    def _init_from_words(self, words, count):
        """Store a copy of the first-checked `count` words.

        Raises Exception if any word needs more than _WORD_WIDTH_EXT bits.
        """

        limit = 2 ** _WORD_WIDTH_EXT
        for i in range(count):

            # word must not exceed the extended word width
            if words[i] >= limit:
                raise Exception("Word is too large!")

        self.words = list(words)

    def _init_from_number(self, number, length):
        """Split a non-negative integer into `length` bits worth of words.

        Raises Exception if `number` is negative or does not fit in `length`
        bits.
        """

        # one hexadecimal digit always carries four bits, regardless of the
        # configured word width
        bits_per_hexchar = 4
        num_hexchars_per_word = _WORD_WIDTH // bits_per_hexchar
        num_hexchars_total = length // bits_per_hexchar

        # a sign character would otherwise leak into the hex digits
        if number < 0:
            raise Exception("Number is negative!")

        value_hex = format(number, 'x')

        # value must not be larger than specified, but it can be smaller, so
        # we may need to prepend it with zeroes
        if len(value_hex) > num_hexchars_total:
            raise Exception("Number is too large!")
        value_hex = value_hex.zfill(num_hexchars_total)

        # cut the string into words starting from the least significant end
        self.words = [
            int(value_hex[max(0, end - num_hexchars_per_word):end], 16)
            for end in range(num_hexchars_total, 0, -num_hexchars_per_word)
        ]

    def number(self):
        """Return the integer value, treating each word as _WORD_WIDTH bits apart.

        Words wider than _WORD_WIDTH simply overlap into the next position,
        because the contributions are added rather than concatenated.
        """
        ret = 0
        shift = 0
        for word in self.words:
            ret += word << shift
            shift += _WORD_WIDTH
        return ret

    def _get_half(self, part):
        """Return the lower (`part` false) or upper (`part` true) half as a new operand."""
        num_words = len(self.words)
        num_words_half = num_words // 2
        if not part: return ModExpNG_Operand(None, num_words_half, self.words[:num_words_half])
        else:        return ModExpNG_Operand(None, num_words_half, self.words[num_words_half:])

    def lower_half(self):
        """Return the less significant half of the words as a new operand."""
        return self._get_half(False)

    def upper_half(self):
        """Return the more significant half of the words as a new operand."""
        return self._get_half(True)

#
# Test Vector
#
class ModExpNG_TestVector():
    """Test vector loaded from a script in the vector folder.

    The script module is named after KEY_LENGTH and must provide a class
    called _VECTOR_CLASS whose instance exposes the attributes read below.
    Every attribute is wrapped into a ModExpNG_Operand of the matching size.
    """

    def __init__(self):
        """Import the vector script for the configured key length and wrap its values."""

        # format target filename
        filename = "vector_" + str(KEY_LENGTH) + "_randomized"

        # add ./vector to import search path; do it only once so that
        # repeated instantiation does not pile up duplicate entries
        vector_dir = sys.path[0] + _VECTOR_PATH
        if vector_dir not in sys.path:
            sys.path.insert(1, vector_dir)

        # import from filename
        vector_module = importlib.import_module(filename)

        # get vector class
        vector_class = getattr(vector_module, _VECTOR_CLASS)

        # instantiate vector class
        vector_inst = vector_class()

        # obtain parts of vector; the *_coeff values are one word longer
        # than the quantity they belong to
        self.m        = ModExpNG_Operand(vector_inst.m,         KEY_LENGTH)
        self.n        = ModExpNG_Operand(vector_inst.n,         KEY_LENGTH)
        self.d        = ModExpNG_Operand(vector_inst.d,         KEY_LENGTH)
        self.p        = ModExpNG_Operand(vector_inst.p,        _KEY_LENGTH_HALF)
        self.q        = ModExpNG_Operand(vector_inst.q,        _KEY_LENGTH_HALF)
        self.dp       = ModExpNG_Operand(vector_inst.dp,       _KEY_LENGTH_HALF)
        self.dq       = ModExpNG_Operand(vector_inst.dq,       _KEY_LENGTH_HALF)
        self.qinv     = ModExpNG_Operand(vector_inst.qinv,     _KEY_LENGTH_HALF)
        self.n_factor = ModExpNG_Operand(vector_inst.n_factor,  KEY_LENGTH)
        self.p_factor = ModExpNG_Operand(vector_inst.p_factor, _KEY_LENGTH_HALF)
        self.q_factor = ModExpNG_Operand(vector_inst.q_factor, _KEY_LENGTH_HALF)
        self.n_coeff  = ModExpNG_Operand(vector_inst.n_coeff,   KEY_LENGTH      + _WORD_WIDTH)
        self.p_coeff  = ModExpNG_Operand(vector_inst.p_coeff,  _KEY_LENGTH_HALF + _WORD_WIDTH)
        self.q_coeff  = ModExpNG_Operand(vector_inst.q_coeff,  _KEY_LENGTH_HALF + _WORD_WIDTH)
        self.x        = ModExpNG_Operand(vector_inst.x,         KEY_LENGTH)
        self.y        = ModExpNG_Operand(vector_inst.y,         KEY_LENGTH)

class ModExpNG_WideBankEnum(Enum):
    """Selectors for the named slots of a ModExpNG_WideBank."""

    A = auto()
    B = auto()
    C = auto()
    D = auto()
    E = auto()
    N = auto()
    L = auto()
    H = auto()

class ModExpNG_NarrowBankEnum(Enum):
    """Selectors for the named slots of a ModExpNG_NarrowBank."""

    A = auto()
    B = auto()
    C = auto()
    D = auto()
    E = auto()
    N_COEFF = auto()
    I = auto()

class ModExpNG_CoreInputEnum(Enum):
    """Selectors for the values held by ModExpNG_CoreInput."""

    M = auto()

    N = auto()
    P = auto()
    Q = auto()

    N_COEFF = auto()
    P_COEFF = auto()
    Q_COEFF = auto()

    N_FACTOR = auto()
    P_FACTOR = auto()
    Q_FACTOR = auto()

    X = auto()
    Y = auto()

    QINV = auto()

class ModExpNG_CoreOutputEnum(Enum):
    """Selectors for the values held by ModExpNG_CoreOutput."""

    XM = auto()
    YM = auto()
    S = auto()

class ModExpNG_WideBank():
    """Container with one slot per ModExpNG_WideBankEnum member.

    Each slot is an attribute named after the lowercase member name and
    starts out as None.
    """

    def __init__(self):
        self.a = self.b = self.c = self.d = None
        self.e = self.n = self.l = self.h = None

    def _get_value(self, sel):
        """Return the slot chosen by `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_WideBankEnum):
            raise Exception("ModExpNG_WideBank._get_value(): Invalid selector!")
        # member names map one-to-one onto the lowercase attribute names
        return getattr(self, sel.name.lower())

    def _set_value(self, sel, value):
        """Store `value` in the slot chosen by `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_WideBankEnum):
            raise Exception("ModExpNG_WideBank._set_value(): Invalid selector!")
        setattr(self, sel.name.lower(), value)

class ModExpNG_NarrowBank():
    """Container with one slot per ModExpNG_NarrowBankEnum member.

    Slot `i` is fixed at construction time: it can be read through the I
    selector but cannot be written through _set_value(). All other slots
    start out as None.
    """

    def __init__(self, i):
        self.a = self.b = self.c = self.d = self.e = None
        self.n_coeff = None
        self.i = i

    def _get_value(self, sel):
        """Return the slot chosen by `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_NarrowBankEnum):
            raise Exception("ModExpNG_NarrowBank._get_value(): Invalid selector!")
        # member names map one-to-one onto the lowercase attribute names
        return getattr(self, sel.name.lower())

    def _set_value(self, sel, value):
        """Store `value` in the slot chosen by `sel`.

        Raises Exception for a foreign selector and for the read-only I slot.
        """
        writable = isinstance(sel, ModExpNG_NarrowBankEnum) and sel is not ModExpNG_NarrowBankEnum.I
        if not writable:
            raise Exception("ModExpNG_NarrowBank._set_value(): Invalid selector!")
        setattr(self, sel.name.lower(), value)

class ModExpNG_CoreInput():
    """Holder for the input values of the core, one per ModExpNG_CoreInputEnum member.

    Each value lives in a private attribute named "_" plus the lowercase
    member name and starts out as None.
    """

    def __init__(self):
        for slot in ("_m",
                     "_n", "_p", "_q",
                     "_n_coeff", "_p_coeff", "_q_coeff",
                     "_n_factor", "_p_factor", "_q_factor",
                     "_x", "_y",
                     "_qinv"):
            setattr(self, slot, None)

    def set_value(self, sel, value):
        """Store `value` under `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_CoreInputEnum):
            raise Exception("ModExpNG_CoreInput.set_value(): invalid selector!")
        setattr(self, "_" + sel.name.lower(), value)

    def _get_value(self, sel):
        """Return the value stored under `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_CoreInputEnum):
            raise Exception("ModExpNG_CoreInput._get_value(): invalid selector!")
        return getattr(self, "_" + sel.name.lower())

class ModExpNG_CoreOutput():
    """Holder for the output values of the core, one per ModExpNG_CoreOutputEnum member.

    Each value lives in a private attribute named "_" plus the lowercase
    member name and starts out as None.
    """

    def __init__(self):
        self._xm = self._ym = self._s = None

    def _set_value(self, sel, value):
        """Store `value` under `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_CoreOutputEnum):
            raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
        setattr(self, "_" + sel.name.lower(), value)

    def get_value(self, sel):
        """Return the value stored under `sel`; raise Exception for a foreign selector."""
        if not isinstance(sel, ModExpNG_CoreOutputEnum):
            raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
        return getattr(self, "_" + sel.name.lower())

class ModExpNG_BanksPair():
    """One wide bank and one narrow bank bundled together.

    The constructor argument is handed to the narrow bank.
    """

    def __init__(self, i):
        self.wide, self.narrow = ModExpNG_WideBank(), ModExpNG_NarrowBank(i)

    def _get_wide(self, sel):
        """Read the slot `sel` of the wide bank."""
        bank = self.wide
        return bank._get_value(sel)

    def _get_narrow(self, sel):
        """Read the slot `sel` of the narrow bank."""
        bank = self.narrow
        return bank._get_value(sel)

    def _set_wide(self, sel, value):
        """Write `value` into the slot `sel` of the wide bank."""
        bank = self.wide
        bank._set_value(sel, value)

    def _set_narrow(self, sel, value):
        """Write `value` into the slot `sel` of the narrow bank."""
        bank = self.narrow
        bank._set_value(sel, value)

class ModExpNG_BanksLadder():
    """Two bank pairs, `ladder_x` and `ladder_y`, built from the same argument."""

    def __init__(self, i):
        self.ladder_x, self.ladder_y = ModExpNG_BanksPair(i), ModExpNG_BanksPair(i)

class ModExpNG_BanksCRT():
    """Two ladder bank sets, `crt_x` and `crt_y`, built from the same argument."""

    def __init__(self, i):
        self.crt_x, self.crt_y = ModExpNG_BanksLadder(i), ModExpNG_BanksLadder(i)

class ModExpNG_PartRecombinator():
    """Turns wide multiplier output parts into a sequence of 16-bit words.

    Parts are pushed one per tick through a three-cell pipeline
    (z0, y0, x0). Every push returns the lower word of the rightmost cell
    as it was before the push, so the first tick after a flush yields a
    null result.
    """

    def _bit_select(self, x, msb, lsb):
        """Return bits msb..lsb (inclusive) of `x`, shifted down to bit 0."""
        width = msb - lsb + 1
        if width <= 0:
            # empty bit range selects nothing
            return 0
        return (x >> lsb) & ((1 << width) - 1)

    def _flush_pipeline(self, dump):
        """Zero all three pipeline cells."""
        self.z0 = 0
        self.y0 = 0
        self.x0 = 0
        if dump and DUMP_RECOMBINATION:
            print("RCMB -> flush()")

    def _push_pipeline(self, part, dump):
        """Push one part into the pipeline and return the word that falls out."""

        # slice the incoming part into its three fields
        field_hi  = self._bit_select(part, 46, 32)
        field_mid = self._bit_select(part, 31, 16)
        field_lo  = self._bit_select(part, 15,  0)

        # the rightmost cell delivers its lower 16 bits as the output word,
        # everything above is carried into the next value of that cell
        # IMPORTANT: This carry can be up to two bits wide!!
        word_out = self.x0 & _WORD_MASK
        carry = self.x0 >> _WORD_WIDTH

        # shift to the right: every cell takes the matching field of the new
        # part plus the previous content of its left neighbour
        self.z0, self.y0, self.x0 = (field_hi,
                                     field_mid + self.z0,
                                     field_lo + self.y0 + carry)

        if dump and DUMP_RECOMBINATION:
            print("RCMB -> push(): part = 0x%012x, word = 0x%04x" % (part, word_out))

        return word_out

    def _run_pipeline(self, parts, first, count, ticks, dump):
        """Flush, then run `ticks` pushes and collect all words but the first.

        The pipeline is fed with parts[first], parts[first + 1], ... for the
        first `count` ticks and with zeroes afterwards.
        """
        self._flush_pipeline(dump)
        collected = []
        for tick in range(ticks):
            feed = parts[first + tick] if tick < count else 0
            word = self._push_pipeline(feed, dump)
            # the very first tick only produces the null result
            if tick > 0:
                collected.append(word)
        return collected

    def _merge_halves(self, lower, upper, ab_num_words, upper_count):
        """Join both halves, adding the two lower-half spill words to the upper half."""
        merged = [lower[pos] for pos in range(ab_num_words)]
        for pos in range(upper_count):
            word = upper[pos]
            if pos < 2:
                # the lower half spilled two words into the upper half
                word += lower[pos + ab_num_words]
            merged.append(word)
        return merged

    def recombine_square(self, parts, ab_num_words, dump):
        """Recombine 2n-1 parts (n = ab_num_words) into 2n words."""

        # lower half: n parts; the first tick produces a null result, the
        # last part produces three words and needs two extra ticks
        lower = self._run_pipeline(parts, 0, ab_num_words, ab_num_words + 1 + 2, dump)

        # upper half: n-1 parts; the first tick produces a null result
        upper = self._run_pipeline(parts, ab_num_words, ab_num_words - 1, ab_num_words + 1, dump)

        return self._merge_halves(lower, upper, ab_num_words, ab_num_words)

    def recombine_triangle(self, parts, ab_num_words, dump):
        """Recombine the lower n+1 parts (n = ab_num_words) into n+1 words."""

        # n+1 parts; the first tick produces a null result, so n+2 ticks are
        # needed and only the words of the last n+1 ticks are kept
        return self._run_pipeline(parts, 0, ab_num_words + 1, ab_num_words + 2, dump)

    def recombine_rectangle(self, parts, ab_num_words, dump):
        """Recombine 2n parts (n = ab_num_words) into 2n+1 words."""

        # lower half: n parts; the first tick produces a null result, the
        # last part produces three words and needs two extra ticks
        lower = self._run_pipeline(parts, 0, ab_num_words, ab_num_words + 1 + 2, dump)

        # upper half: n parts; the first tick produces a null result, the
        # last part produces two words and needs an extra tick
        upper = self._run_pipeline(parts, ab_num_words, ab_num_words, ab_num_words + 2, dump)

        return self._merge_halves(lower, upper, ab_num_words, ab_num_words + 1)

class ModExpNG_WordMultiplier():
    """Model of NUM_MULTS parallel multiply-accumulate units (MACs) plus one auxiliary MAC.

    Multiplication proceeds column by column; a column covers NUM_MULTS
    consecutive output parts. Within a column, MAC x starts at a-word index
    col * NUM_MULTS + x and the index moves down by one every tick, wrapping
    around to the top, while all MACs share the current b-word. A MAC holds
    a finished lower part at tick t == col * NUM_MULTS + x.
    """

    def __init__(self):
        """Create zeroed MACs and indices, plus the auxiliary MAC and index."""

        self._macs = [0] * NUM_MULTS
        self._indices = [0] * NUM_MULTS

        # the auxiliary unit is kept in one-element lists
        self._mac_aux = [0]
        self._index_aux = [0]

    def _clear_all_macs(self, t, col, dump):
        """Zero every regular MAC accumulator."""
        for x in range(NUM_MULTS):
            self._macs[x] = 0
        if dump and DUMP_MACS_CLEARING:
            print("t=%2d, col=%2d > clear > all" % (t, col))

    def _clear_one_mac(self, x, t, col, dump):
        """Zero the accumulator of MAC `x`."""
        self._macs[x] = 0
        if dump and DUMP_MACS_CLEARING:
            print("t=%2d, col=%2d > clear > x=%d" % (t, col, x))

    def _clear_mac_aux(self, t, col, dump):
        """Zero the auxiliary accumulator (the dump line always reports t = 0)."""
        self._mac_aux[0] = 0
        if dump and DUMP_MACS_CLEARING:
            print("t= 0, col=%2d > clear > aux" % (col))

    def _update_one_mac(self, x, t, col, a, b, dump, need_aux=False):
        """Add a * b to MAC `x`.

        Raises Exception if `a` exceeds the extended word width or `b` the
        regular word width. `need_aux` only suppresses the line break after
        the last MAC in the input dump, so that the auxiliary MAC can finish
        the line.
        """

        if a >= (2 ** _WORD_WIDTH_EXT):
            raise Exception("a > 0x3FFFF!")

        if b >= (2 ** _WORD_WIDTH):
            raise Exception("b > 0xFFFF!")

        p = a * b
        if dump and DUMP_MACS_INPUTS:
            if x == 0: print("t=%2d, col=%2d > b=%05x > " % (t, col, b), end='')
            if x > 0: print("; ", end='')
            print("MAC[%d]: a=%05x" % (x, a), end='')
            if x == (NUM_MULTS-1) and not need_aux: print("")

        self._macs[x] += p

    def _update_mac_aux(self, y, col, a, b, dump):
        """Add a * b to the auxiliary MAC; same operand limits as _update_one_mac().

        `y` and `col` are accepted for symmetry but not used.
        """

        if a >= (2 ** _WORD_WIDTH_EXT):
            raise Exception("a > 0x3FFFF!")

        if b >= (2 ** _WORD_WIDTH):
            raise Exception("b > 0xFFFF!")

        p = a * b
        if dump and DUMP_MACS_INPUTS:
            print("; AUX: a=%05x" % a)

        self._mac_aux[0] += p

    def _preset_indices(self, col):
        """Point MAC x at a-word col * <number of MACs> + x."""
        for x in range(len(self._indices)):
            self._indices[x] = col * len(self._indices) + x

    def _preset_index_aux(self, num_cols):
        """Point the auxiliary MAC at the a-word just past the last column."""
        self._index_aux[0] = num_cols * len(self._indices)

    def _dump_macs_helper(self, t, col, aux=False):
        """Print all accumulators on one line, optionally with the auxiliary one."""
        print("t=%2d, col=%2d > "% (t, col), end='')
        for i in range(NUM_MULTS):
            if i > 0: print(" | ", end='')
            print("mac[%d]: 0x%012x" % (i, self._macs[i]), end='')
        if aux:
            print(" | mac_aux[ 0]: 0x%012x" % (self._mac_aux[0]), end='')
        print("")

    def _dump_macs(self, t, col):
        """Print the regular accumulators."""
        self._dump_macs_helper(t, col)

    def _dump_macs_with_aux(self, t, col):
        """Print the regular accumulators followed by the auxiliary one."""
        self._dump_macs_helper(t, col, True)

    def _dump_indices_helper(self, t, col, aux=False):
        """Print all a-word indices on one line, optionally with the auxiliary one."""
        print("t=%2d, col=%2d > indices:" % (t, col), end='')
        for i in range(NUM_MULTS):
            print(" %2d" % self._indices[i], end='')
        if aux:
            print(" %2d" % self._index_aux[0], end='')
        print("")

    def _dump_indices(self, t, col):
        """Print the regular a-word indices."""
        self._dump_indices_helper(t, col)

    def _dump_indices_with_aux(self, t, col):
        """Print the regular a-word indices followed by the auxiliary one."""
        self._dump_indices_helper(t, col, True)

    def _rotate_indices(self, num_words):
        """Step every index down by one, wrapping from 0 to num_words - 1."""
        for x in range(len(self._indices)):
            if self._indices[x] > 0:
                self._indices[x] -= 1
            else:
                self._indices[x] = num_words - 1

    def _rotate_index_aux(self):
        """Step the auxiliary index down by one (no wrap-around)."""
        self._index_aux[0] -= 1

    def _mult_store_part(self, parts, time, column, part_index, mac_index, dump):
        """Copy the accumulator of MAC `mac_index` into parts[part_index]."""
        parts[part_index] = self._macs[mac_index]
        if dump and DUMP_MULT_PARTS:
            print("t=%2d, col=%2d > parts[%2d]: mac[%d] = 0x%012x" %
                (time, column, part_index, mac_index, parts[part_index]))

    def _mult_store_part_aux(self, parts, time, column, part_index, dump):
        """Copy the auxiliary accumulator into parts[part_index]."""
        parts[part_index] = self._mac_aux[0]
        if dump and DUMP_MULT_PARTS:
            print("t=%2d, col=%2d > parts[%2d]: mac_aux[%d] = 0x%012x" %
                (time, column, part_index, 0, parts[part_index]))

    def multiply_square(self, a_wide, b_narrow, ab_num_words, dump=False):
        """Multiply n words of `a_wide` by n words of `b_narrow` (n = ab_num_words).

        Returns a list of 2n-1 parts; part k is the sum of all a[i] * b[j]
        with i + j == k. Words of `b_narrow` may carry extension bits, which
        are propagated into the following word on the fly.
        """

        num_cols = ab_num_words // NUM_MULTS

        parts = [0] * (2 * ab_num_words - 1)

        for col in range(num_cols):

            b_carry = 0

            for t in range(ab_num_words):

                # take care of indices
                if t == 0: self._preset_indices(col)
                else:      self._rotate_indices(ab_num_words)

                # take care of macs: a MAC that delivered its lower part
                # during the previous tick starts over for the upper part
                if t == 0:
                    self._clear_all_macs(t, col, dump)
                else:
                    t1 = t - 1
                    if (t1 // NUM_MULTS) == col:
                        self._clear_one_mac(t1 % NUM_MULTS, t, col, dump)

                # debug output
                if dump and DUMP_INDICES: self._dump_indices(t, col)

                # current b-word
                # multiplier's b-input is limited to 16-bit words, so we need to propagate
                # carries on the fly here, carry can be up to two bits
                bt = b_narrow.words[t] + b_carry
                b_carry = (bt & _CARRY_MASK) >> _WORD_WIDTH
                if dump and b_carry > 1:
                    print("Rare overflow case was detected and then successfully corrected.")
                bt &= _WORD_MASK

                # multiply by a-words
                for x in range(NUM_MULTS):
                    ax = a_wide.words[self._indices[x]]
                    self._update_one_mac(x, t, col, ax, bt, dump)

                    # lower part is complete once the index has reached zero
                    if t == (col * NUM_MULTS + x):
                        part_index = t
                        self._mult_store_part(parts, t, col, part_index, x, dump)

                # debug output
                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs(t, col)

                # save the upper parts of product at end of column,
                # for the last column don't save the very last part
                if t == (ab_num_words - 1):
                    for x in range(NUM_MULTS):
                        if not (col == (num_cols - 1) and x == (NUM_MULTS - 1)):
                            part_index = ab_num_words + col * NUM_MULTS + x
                            self._mult_store_part(parts, t, col, part_index, x, dump)

        return parts

    def multiply_triangle(self, a_wide, b_narrow, ab_num_words, dump=False):
        """Compute only the lower n+1 parts of a product (n = ab_num_words).

        Both operands are indexed up to word n. Parts 0..n-1 come from the
        regular MACs; part n comes from the auxiliary MAC, which runs during
        the last column only.
        """

        num_cols = ab_num_words // NUM_MULTS

        parts = [0] * (ab_num_words + 1)

        for col in range(num_cols):

            last_col = col == (num_cols - 1)

            for t in range(ab_num_words + 1):

                # take care of indices
                if t == 0: self._preset_indices(col)
                else:      self._rotate_indices(ab_num_words)

                # take care of auxiliary index
                if last_col:
                    if t == 0: self._preset_index_aux(num_cols)
                    else:      self._rotate_index_aux()

                # take care of macs
                if t == 0: self._clear_all_macs(t, col, dump)

                # take care of auxiliary mac
                if last_col:
                    if t == 0: self._clear_mac_aux(t, col, dump)

                # debug output
                if dump and DUMP_INDICES: self._dump_indices_with_aux(t, col)

                # current b-word
                bt = b_narrow.words[t]

                # multiply by a-words
                for x in range(NUM_MULTS):
                    ax = a_wide.words[self._indices[x]]
                    self._update_one_mac(x, t, col, ax, bt, dump, last_col)

                    if t == (col * NUM_MULTS + x):
                        part_index = t
                        self._mult_store_part(parts, t, col, part_index, x, dump)

                # aux multiplier
                if last_col:
                    ax = a_wide.words[self._index_aux[0]]
                    self._update_mac_aux(t, col, ax, bt, dump)

                    if t == ab_num_words:
                        part_index = t
                        self._mult_store_part_aux(parts, t, col, part_index, dump)

                # debug output
                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs_with_aux(t, col)

                # shortcut: all parts of this column are stored already
                if not last_col:
                    if t == (NUM_MULTS * (col + 1) - 1): break

        return parts

    def multiply_rectangle(self, a_wide, b_narrow, ab_num_words, dump=False):
        """Multiply n words of `a_wide` by n+1 words of `b_narrow` (n = ab_num_words).

        Returns a list of 2n parts; part k is the sum of all a[i] * b[j]
        with i + j == k.
        """

        num_cols = ab_num_words // NUM_MULTS

        parts = [0] * (2 * ab_num_words)

        for col in range(num_cols):

            for t in range(ab_num_words + 1):

                # take care of indices
                if t == 0: self._preset_indices(col)
                else:      self._rotate_indices(ab_num_words)

                # take care of macs: a MAC that delivered its lower part
                # during the previous tick starts over for the upper part
                if t == 0:
                    self._clear_all_macs(t, col, dump)
                else:
                    t1 = t - 1
                    if (t1 // NUM_MULTS) == col:
                        self._clear_one_mac(t1 % NUM_MULTS, t, col, dump)

                # debug output
                if dump and DUMP_INDICES: self._dump_indices(t, col)

                # current b-word
                bt = b_narrow.words[t]

                # multiply by a-words
                for x in range(NUM_MULTS):
                    ax = a_wide.words[self._indices[x]]
                    self._update_one_mac(x, t, col, ax, bt, dump)

                    # don't save one value for the very last time instant per column
                    if t < ab_num_words and t == (col * NUM_MULTS + x):
                        part_index = t
                        self._mult_store_part(parts, t, col, part_index, x, dump)

                # debug output
                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs(t, col)

                # save the upper parts of product at end of column
                if t == ab_num_words:
                    for x in range(NUM_MULTS):
                        part_index = ab_num_words + col * NUM_MULTS + x
                        self._mult_store_part(parts, t, col, part_index, x, dump)

        return parts

class ModExpNG_LowlevelOperator():
    """Single-word addition and subtraction with carry and borrow handling."""

    def _check_word(self, a):
        """Raise Exception unless `a` fits into a regular (unextended) word."""
        if not (0 <= a <= _WORD_MASK):
            raise Exception("Word out of range!")

    def _check_carry_borrow(self, cb):
        """Raise Exception unless `cb` is 0 or 1."""
        if not (0 <= cb <= 1):
            raise Exception("Carry or borrow out of range!")

    def add_words(self, a, b, c_in):
        """Return (carry_out, sum_word) of a + b + c_in.

        Raises Exception when an input is out of range.
        """

        self._check_word(a)
        self._check_word(b)
        self._check_carry_borrow(c_in)

        total = a + b + c_in

        # lower bits form the result word, the rest is the outgoing carry
        carry_out = total >> _WORD_WIDTH
        word_out = total & _WORD_MASK

        return (carry_out, word_out)

    def sub_words(self, a, b, b_in):
        """Return (borrow_out, difference_word) of a - b - b_in.

        Raises Exception when an input is out of range.
        """

        self._check_word(a)
        self._check_word(b)
        self._check_carry_borrow(b_in)

        delta = a - b - b_in

        # no borrow needed when the difference is not negative
        if delta >= 0:
            return (0, delta)

        # otherwise borrow one full word from the next position
        return (1, delta + 2 ** _WORD_WIDTH)

class ModExpNG_Worker():
    """Multi-word arithmetic on ModExpNG_Operand values.

    Combines the single-word low-level operator, the word multiplier and
    the part recombinator into the operations the core needs: modular
    subtraction, uneven addition, (modular) multiplication and carry
    propagation.
    """

    def __init__(self):
        self.lowlevel     = ModExpNG_LowlevelOperator()
        self.multiplier   = ModExpNG_WordMultiplier()
        self.recombinator = ModExpNG_PartRecombinator()

    def serial_subtract_modular(self, a, b, n, ab_num_words):
        """Compute A - B word by word, adding N back if the result is negative.

        Two candidate results are produced in a single pass: the plain
        difference A - B and the corrected value A - B + N.  The final
        borrow of the subtraction selects which one is returned.

        Args:
            a, b: operands, each providing at least ab_num_words words.
            n: modulus operand with at least ab_num_words words.
            ab_num_words: number of words to process.

        Returns:
            ModExpNG_Operand with ab_num_words words.
        """
        c_in = 0
        b_in = 0
        ab = list()
        ab_n = list()
        for x in range(ab_num_words):
            (b_out, d_out) = self.lowlevel.sub_words(a.words[x], b.words[x], b_in)
            (c_out, s_out) = self.lowlevel.add_words(d_out, n.words[x], c_in)
            ab.append(d_out)
            ab_n.append(s_out)
            (c_in, b_in) = (c_out, b_out)

        # b_in now holds the borrow out of the most significant word (or 0
        # if there were no words at all); a set borrow means A < B, so the
        # copy with the modulus added back is the right one
        d = ab_n if b_in else ab
        return ModExpNG_Operand(None, ab_num_words, d)

    def serial_add_uneven(self, a, b, ab_num_words):
        """Add a short operand to one that is twice as long.

        Only the first ab_num_words words of `a` are used (it is treated
        as zero above that), while 2 * ab_num_words words of `b` are read.
        The carry out of the most significant word is discarded.

        Returns:
            ModExpNG_Operand with 2 * ab_num_words words.
        """
        c_in = 0
        ab = list()
        for x in range(2 * ab_num_words):
            a_word = a.words[x] if x < ab_num_words else 0
            b_word = b.words[x]
            (c_out, s_out) = self.lowlevel.add_words(a_word, b_word, c_in)
            ab.append(s_out)
            c_in = c_out
        return ModExpNG_Operand(None, 2*ab_num_words, ab)

    def multipurpose_multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_crt="", dump_ladder=""):
        """Multiply and/or reduce, depending on the flags.

        The full sequence is AB = A * B, Q = LSB(AB) * N_COEFF, M = Q * N,
        R = AB + M, of which only the upper part of R is returned (this has
        the shape of a Montgomery-style reduction).

        Args:
            a: first factor (ignored when reduce_only is set).
            b: second factor; when reduce_only is set, this is used
                directly as the double-width value AB to be reduced.
            n: modulus (not used when multiply_only is set).
            n_coeff: reduction coefficient (not used when multiply_only
                is set).
            ab_num_words: operand length in words.
            reduce_only: skip the multiplication, only reduce `b`.
            multiply_only: return AB right after the multiplication.
            dump: enable debug output (further gated by the DUMP_* flags).
            dump_crt, dump_ladder: labels used in the debug output.

        Returns:
            With multiply_only, the operand AB (2 * ab_num_words words when
            it was actually multiplied).  Otherwise a ModExpNG_Operand of
            ab_num_words words; its words are plain sums and are not
            carry-propagated here.
        """

        #
        # 1. AB = A * B
        #
        if dump: print("multiply_square(%s_%s)" % (dump_crt, dump_ladder))

        if reduce_only:
            ab = b
        else:
            ab_parts = self.multiplier.multiply_square(a, b, ab_num_words, dump)
            ab_words = self.recombinator.recombine_square(ab_parts, ab_num_words, dump)
            ab = ModExpNG_Operand(None, 2 * ab_num_words, ab_words)

        if dump and DUMP_VECTORS:
            ab.format_verilog_concat("%s_%s_AB" % (dump_crt, dump_ladder))

        # return the operand itself: this also stays well-defined when
        # reduce_only is set and no word list was produced above
        if multiply_only:
            return ab

        #
        # 2. Q = LSB(AB) * N_COEFF
        #
        if dump: print("multiply_triangle(%s_%s)" % (dump_crt, dump_ladder))

        q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words, dump)
        q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
        q = ModExpNG_Operand(None, ab_num_words + 1, q_words)

        if dump and DUMP_VECTORS:
            q.format_verilog_concat("%s_%s_Q" % (dump_crt, dump_ladder))

        #
        # 3. M = Q * N
        #
        if dump: print("multiply_rectangle(%s_%s)" % (dump_crt, dump_ladder))

        m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words, dump)
        m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
        m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)

        if dump and DUMP_VECTORS:
            m.format_verilog_concat("%s_%s_M" % (dump_crt, dump_ladder))

        #
        # 4. R = AB + M
        #

        #
        # 4a. compute carry (actual sum is all zeroes and need not be stored)
        #

        r_cy = 0 # this can be up to two bits, since we're adding extended words!!
        for i in range(ab_num_words + 1):
            s = ab.words[i] + m.words[i] + r_cy
            r_cy_new = s >> _WORD_WIDTH

            # the part of the sum below the carry is shown in brackets, it
            # is split off with the same word width the carry is taken at
            if dump and DUMP_REDUCTION:
                print("[%2d] 0x%05x + 0x%05x + 0x%x => {0x%x, [0x%05x]}" %
                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & _WORD_MASK))

            r_cy = r_cy_new

        #
        # 4b. compute the actual upper part of sum (take carry into account)
        #

        R = list()
        for i in range(ab_num_words):

            # AB has no word matching the topmost word of M, use zero there
            ab_word = ab.words[ab_num_words + i + 1] if i < (ab_num_words - 1) else 0
            m_word = m.words[ab_num_words + i + 1]

            # the carry from the lower part only enters the very first word
            cy_word = r_cy if i == 0 else 0

            r_word = cy_word + ab_word + m_word

            if dump and DUMP_REDUCTION:
                print("[%2d] 0x%05x + 0x%05x + 0x%x = 0x%05x" %
                    (i, ab_word, m_word, cy_word, r_word))

            R.append(r_word)

        return ModExpNG_Operand(None, ab_num_words, R)

    def convert_nonredundant(self, a, num_words):
        """Propagate carries through the first num_words words of `a`.

        Modifies a.words in place so that each processed word fits into
        _WORD_MASK, and returns the carry out of the last processed word.
        """
        carry = 0
        for x in range(num_words):
            a.words[x] += carry
            carry = a.words[x] >> _WORD_WIDTH
            a.words[x] &= _WORD_MASK
        return carry

class ModExpNG_Core():
    """Model of the core: a worker, the CRT operand banks and the I/O banks.

    The operand storage is organised as self.bnk.crt_{x,y}.ladder_{x,y},
    each ladder offering WIDE and NARROW banks addressed by selectors.

    Notation used in the method descriptions:
        CRT_(X|Y)     either CRT_X or CRT_Y (the bank passed by the caller)
        CRT_{X,Y}     both CRT_X and CRT_Y
        LADDER_{X,Y}  both LADDER_X and LADDER_Y
    """

    def __init__(self, i):
        """Create worker, banks and I/O; `i` is handed to ModExpNG_BanksCRT."""
        self.wrk = ModExpNG_Worker()
        self.bnk = ModExpNG_BanksCRT(i)
        self.inp = ModExpNG_CoreInput()
        self.out = ModExpNG_CoreOutput()

    def set_output_from_narrow(self, sel_output, bank_crt, sel_narrow):
        """Copy from CRT_(X|Y).LADDER_X.NARROW to OUTPUT."""
        self.out._set_value(sel_output, bank_crt.ladder_x._get_narrow(sel_narrow))

    def set_narrow_from_input(self, bank_crt, sel_narrow, sel_input):
        """Copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.NARROW."""
        bank_crt.ladder_x._set_narrow(sel_narrow, self.inp._get_value(sel_input))
        bank_crt.ladder_y._set_narrow(sel_narrow, self.inp._get_value(sel_input))

    def set_wide_from_input(self, bank_crt, sel_wide, sel_input):
        """Copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.WIDE."""
        bank_crt.ladder_x._set_wide(sel_wide, self.inp._get_value(sel_input))
        bank_crt.ladder_y._set_wide(sel_wide, self.inp._get_value(sel_input))

    def copy_crt_y2x(self, sel_wide, sel_narrow):
        """Copy from CRT_Y.LADDER_{X,Y}.{WIDE,NARROW} to CRT_X.LADDER_{X,Y}.{WIDE,NARROW}."""

        self.bnk.crt_x.ladder_x._set_wide(sel_wide, self.bnk.crt_y.ladder_x._get_wide(sel_wide))
        self.bnk.crt_x.ladder_y._set_wide(sel_wide, self.bnk.crt_y.ladder_y._get_wide(sel_wide))

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow))
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow))

    def copy_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
        """Copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{X,Y}.LADDER_Y.{WIDE,NARROW}."""

        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))

        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))

    def copy_ladders_y2x(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
        """Copy from CRT_{X,Y}.LADDER_Y.{WIDE,NARROW} to CRT_{X,Y}.LADDER_X.{WIDE,NARROW}."""

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, self.bnk.crt_x.ladder_y._get_wide(sel_wide_in))
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, self.bnk.crt_y.ladder_y._get_wide(sel_wide_in))

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in))
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in))

    def cross_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):
        """Copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{Y,X}.LADDER_Y.{WIDE,NARROW}.

        The CRT halves are swapped on the way: CRT_X.LADDER_Y receives
        what CRT_Y.LADDER_X holds and vice versa.
        """

        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))

        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))

    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True), d=False):
        """Modular multiply sel_wide_in by sel_narrow_in in all four ladders.

        Needs modulus WIDE.N and speed-up coefficients NARROW.N_COEFF to be
        filled (both are taken from LADDER_X of the respective CRT half).
        Places two copies of the resulting quantity in sel_wide_out and
        sel_narrow_out.  sel_*_in and sel_*_out can overlap (overwriting of
        input operands is ok), since all inputs are read before any output
        is written.

        Per the original author's note the hardware keeps the intermediate
        product in WIDE.L and WIDE.H; this method does not write those
        banks.

        Args:
            sel_wide_in: selector of the wide factor (one per ladder).
            sel_narrow_in: selector of the narrow factor.
            sel_wide_out, sel_narrow_out: selectors receiving the result.
            num_words: operand length in words.
            mode: pair of flags (for CRT_X, for CRT_Y); a false flag makes
                both ladders of that half multiply by LADDER_X's narrow
                operand, a true flag by LADDER_Y's.
            d: enable debug dumps in the worker.
        """

        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)

        xxa       = self.bnk.crt_x.ladder_x._get_wide(sel_wide_in)
        xya       = self.bnk.crt_x.ladder_y._get_wide(sel_wide_in)

        yxa       = self.bnk.crt_y.ladder_x._get_wide(sel_wide_in)
        yya       = self.bnk.crt_y.ladder_y._get_wide(sel_wide_in)

        xxb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        xyb       = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in)

        yxb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
        yyb       = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in)

        # pick the common narrow factor for each CRT half
        if not mode[0]: xb = xxb
        else:           xb = xyb

        if not mode[1]: yb = yxb
        else:           yb = yyb

        xxp = self.wrk.multipurpose_multiply(xxa, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="X")
        xyp = self.wrk.multipurpose_multiply(xya, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="Y")

        yxp = self.wrk.multipurpose_multiply(yxa, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="X")
        yyp = self.wrk.multipurpose_multiply(yya, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="Y")

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xxp)
        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xyp)
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yxp)
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yyp)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxp)
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyp)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxp)
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyp)

    def modular_subtract(self, sel_narrow_in, sel_narrow_out, sel_wide_out, num_words):
        """Modular subtract values in sel_narrow_in (X-Y).

        For each CRT half, LADDER_Y's narrow operand is subtracted from
        LADDER_X's modulo WIDE.N.  Two copies of the result are stored in
        sel_narrow_out and sel_wide_out of LADDER_X only.
        """

        xa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        xb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in)
        xn = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        ya = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
        yb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in)
        yn = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xd = self.wrk.serial_subtract_modular(xa, xb, xn, num_words)
        yd = self.wrk.serial_subtract_modular(ya, yb, yn, num_words)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xd)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yd)

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xd)
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yd)

    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):
        """Modular reduce sel_narrow_in.

        The input is read from LADDER_X of each CRT half and reduced using
        that half's WIDE.N and NARROW.N_COEFF.  Two copies of the result
        are stored in sel_wide_out and sel_narrow_out of both ladders.
        """

        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)

        xb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        yb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)

        xp = self.wrk.multipurpose_multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
        yp = self.wrk.multipurpose_multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xp)
        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xp)
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yp)
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yp)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xp)
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xp)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yp)
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yp)

    def propagate_carries(self, sel_narrow, num_words):
        """Propagate carries (convert to non-redundant representation) in sel_narrow.

        Works on all four ladders and overwrites the input value in place;
        the carry out of the last word is discarded.
        """
        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_x._get_narrow(sel_narrow), num_words)
        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_y._get_narrow(sel_narrow), num_words)
        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_x._get_narrow(sel_narrow), num_words)
        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_y._get_narrow(sel_narrow), num_words)

    def merge_lha(self, sel_narrow, num_words):
        """Copy from CRT_{X,Y}.LADDER_{X,Y}.WIDE.{H,L} to CRT_{X,Y}.LADDER_{X,Y}.NARROW.

        The words of WIDE.L followed by the words of WIDE.H are stored as
        one operand of 2 * num_words words in sel_narrow.
        """
        xx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
        xy_lsb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.L)
        yx_lsb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
        yy_lsb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.L)

        xx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
        xy_msb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.H)
        yx_msb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
        yy_msb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.H)

        # lower half first, upper half appended after it
        xx = xx_lsb.words + xx_msb.words
        xy = xy_lsb.words + xy_msb.words
        yx = yx_lsb.words + yx_msb.words
        yy = yy_lsb.words + yy_msb.words

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xx))
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xy))
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yx))
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yy))

    def regular_multiply(self, sel_wide_in, sel_narrow_in, num_words):
        """Multiply sel_wide_in by sel_narrow_in (no reduction).

        Each ladder's wide operand is multiplied by the narrow operand of
        LADDER_X of the same CRT half.  The twice larger product is stored
        in WIDE.L (lower half) and WIDE.H (upper half).  No modulus or
        coefficient is involved.
        """

        xxa       = self.bnk.crt_x.ladder_x._get_wide(sel_wide_in)
        xya       = self.bnk.crt_x.ladder_y._get_wide(sel_wide_in)

        yxa       = self.bnk.crt_y.ladder_x._get_wide(sel_wide_in)
        yya       = self.bnk.crt_y.ladder_y._get_wide(sel_wide_in)

        xb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        yb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)

        xxp = self.wrk.multipurpose_multiply(xxa, xb, None, None, num_words, multiply_only=True)
        xyp = self.wrk.multipurpose_multiply(xya, xb, None, None, num_words, multiply_only=True)

        yxp = self.wrk.multipurpose_multiply(yxa, yb, None, None, num_words, multiply_only=True)
        yyp = self.wrk.multipurpose_multiply(yya, yb, None, None, num_words, multiply_only=True)

        xxp_lsb = xxp.lower_half()
        xxp_msb = xxp.upper_half()

        xyp_lsb = xyp.lower_half()
        xyp_msb = xyp.upper_half()

        yxp_lsb = yxp.lower_half()
        yxp_msb = yxp.upper_half()

        yyp_lsb = yyp.lower_half()
        yyp_msb = yyp.upper_half()

        self.bnk.crt_x.ladder_x._set_wide(ModExpNG_WideBankEnum.L, xxp_lsb)
        self.bnk.crt_x.ladder_y._set_wide(ModExpNG_WideBankEnum.L, xyp_lsb)
        self.bnk.crt_y.ladder_x._set_wide(ModExpNG_WideBankEnum.L, yxp_lsb)
        self.bnk.crt_y.ladder_y._set_wide(ModExpNG_WideBankEnum.L, yyp_lsb)

        self.bnk.crt_x.ladder_x._set_wide(ModExpNG_WideBankEnum.H, xxp_msb)
        self.bnk.crt_x.ladder_y._set_wide(ModExpNG_WideBankEnum.H, xyp_msb)
        self.bnk.crt_y.ladder_x._set_wide(ModExpNG_WideBankEnum.H, yxp_msb)
        self.bnk.crt_y.ladder_y._set_wide(ModExpNG_WideBankEnum.H, yyp_msb)

    def regular_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
        """Add sel_narrow_a_in to sel_narrow_b_in in all four ladders.

        The addition is delegated to the worker's serial_add_uneven() with
        the A operand first; the result is stored in sel_narrow_out.
        """
        xxa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_a_in)
        xya = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_a_in)
        yxa = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_a_in)
        yya = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_a_in)

        xxb = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_b_in)
        xyb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_b_in)
        yxb = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_b_in)
        yyb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_b_in)

        xxc = self.wrk.serial_add_uneven(xxa, xxb, num_words)
        xyc = self.wrk.serial_add_uneven(xya, xyb, num_words)
        yxc = self.wrk.serial_add_uneven(yxa, yxb, num_words)
        yyc = self.wrk.serial_add_uneven(yya, yyb, num_words)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxc)
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyc)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxc)
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyc)

    def dump_before_step_using_crt(self, pq, m):
        """Dump working variables before ladder step.

        Prints the word count, the two ladder mode flags, NARROW.C of all
        four ladders, and the modulus and coefficient of each CRT half
        (labels prefixed X_ come from CRT_X, labels prefixed Y_ from CRT_Y).

        Args:
            pq: number of words, printed as num_words.
            m: pair of ladder mode flags (for X, for Y).
        """
        print("num_words = %d" % pq)
        print("\rladder_mode_x = %d" % m[0])
        print("\rladder_mode_y = %d" % m[1])
        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")
        # each CRT half has its own modulus and coefficient, so the Y_*
        # vectors must be read from CRT_Y
        self.bnk.crt_x.ladder_x._get_wide(W.N).format_verilog_concat("X_N")
        self.bnk.crt_y.ladder_x._get_wide(W.N).format_verilog_concat("Y_N")
        self.bnk.crt_x.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("X_N_COEFF")
        self.bnk.crt_y.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("Y_N_COEFF")

    def dump_after_step_using_crt(self):
        """Dump working variables (NARROW.C of all four ladders) after ladder step."""
        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")

    def _force_overflow(self, bank_crt, sel_narrow):
        """Deliberately convert a narrow operand into redundant representation.

        Only LADDER_X of the given CRT bank is touched.  Wherever the
        carry bits of a word are empty, low bits of the following word are
        moved into them, so the value stays the same while individual
        words no longer fit the regular word width.
        """

        # original words
        T = bank_crt.ladder_x._get_narrow(sel_narrow).words

        # loop through upper N-1 words
        for i in range(1, len(T)):

            # get msbs of the previous word
            upper_bits = T[i-1] & _CARRY_MASK

            # if the previous msbs are empty, force lsbs of the current word
            # into them and then wipe the current lsbs
            if upper_bits == 0:
                lower_bits = T[i] & (_CARRY_MASK >> _WORD_WIDTH)
                T[i] ^= lower_bits
                T[i-1] |= (lower_bits << _WORD_WIDTH)

        # overwrite original words
        bank_crt.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, len(T), T))

        print("Forced overflow.")

#
# read content of core's output bank and compare it against known good values
#
def compare_signature():
    """Check the core's S, XM and YM outputs against the known good values.

    Reads the module-level `core`, `s_known`, `xm_known` and `ym_known`
    and prints one line per checked quantity: either an error message or
    a confirmation.
    """

    expected = (s_known, xm_known, ym_known)

    # fetch all three outputs up front, then compare them one by one
    actual = [core.out.get_value(sel) for sel in (O.S, O.XM, O.YM)]

    reports = (
        ("ERROR: core_s != s!",   "s is OK"),
        ("ERROR: core_xm != xm!", "x_mutated is OK"),
        ("ERROR: core_ym != ym!", "y_mutated is OK"),
    )

    for value, known, (msg_bad, msg_good) in zip(actual, expected, reports):
        print(msg_bad if value.number() != known else msg_good)

#
# get current ladder mode based on two exponents' bits
#
def get_ladder_mode_using_crt(v, bit):
    """Return the ladder mode pair for the given exponent bit index.

    The result is a tuple of two booleans: whether bit number `bit` is
    set in v.dp.number() and whether it is set in v.dq.number().
    """

    p_bit_set = bool((v.dp.number() >> bit) & 1)
    q_bit_set = bool((v.dq.number() >> bit) & 1)

    return (p_bit_set, q_bit_set)

#
# get current ladder mode based on private exponent's bit
#
def get_ladder_mode_without_crt(v, bit):
    """Return the ladder mode pair derived from one bit of v.d.number().

    The second element tells whether bit number `bit` is set, the first
    element is its complement.
    """

    d_bit_set = bool((v.d.number() >> bit) & 1)

    return (not d_bit_set, d_bit_set)

#
# print current exponentiation progress
#
def print_ladder_progress(current, total):
    """Print the exponentiation progress as a percentage on one line.

    Args:
        current: index of the bit being processed (counting down to 0).
        total: operand length in words; multiplied by _WORD_WIDTH it
            gives the number of bits the percentage is relative to.
    """

    # only every DUMP_PROGRESS_FACTOR-th index is reported; the very last
    # index is zero, which always qualifies, so the final report always
    # reads "100.0%"
    if current % DUMP_PROGRESS_FACTOR == 0:
        num_bits = _WORD_WIDTH * total
        pct = float((num_bits - current) / num_bits) * 100.0
        print("\rdone: %5.1f%%" % pct, end='')

    # finish the line once the last bit is done
    if current == 0:
        print("")

#
# try to exponentiate using the quad-multiplier (dual-core, dual-ladder) scheme
#
def sign_using_crt():

    c  = core
    v  = vector
    n  = n_num_words
    pq = pq_num_words

    ff = (False, False)
                                                                   #
                                                                   # A / B => different content in banks (A in WIDE, B in NARROW)
                                                                   # [XY]Z => different content in ladders (XZ in X, YZ in Y)
                                                                   # ..    => temporarily half-filled bank (omitted to save space)
                                                                   # *     => "crossed" content (X.Y == Y.X and Y.Y == X.X)
                                                                   #
                                                                   # +------------------------+-------+------------------+---------+-----------+
                                                                   # |  A                     |  B    |  C               |  D      |  E        |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input   (c.bnk.crt_x, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_y, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_x, W.A,       I.X)          # |  ..                    |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_y, W.A,       I.Y)          # | [XY] / ?               |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_x, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | .. / ?    |
    c.set_wide_from_input   (c.bnk.crt_y, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input (c.bnk.crt_x, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_y, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_x, N.A,       I.N_FACTOR)   # | [XY] / ..              |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_y, N.A,       I.N_FACTOR)   # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_x, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ..   |
    c.set_narrow_from_input (c.bnk.crt_y, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.A, N.A, W.B, N.B, n)                      # | [XY] / N_FACTOR        | [XY]F |  ?               |  ?      | M         | [XY]F = [XY] * N_FACTOR
    c.modular_multiply(W.B, N.B, W.C, N.C, n, mode=ff)             # | [XY] / N_FACTOR        | [XY]F | [XY]YM           |  ?      | M         | [XY]MF = [XY]F * [XY]F
    c.modular_multiply(W.C, N.I, W.D, N.D, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]YM           | [XY]M   | M         | [XY]M = [XY]MF * 1
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.D, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]YM           | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]YM           | [XY]M   | M         |
    c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]YM           | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.E, N.B, W.C, N.C, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         | [XY]MB = M * [XY]F
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.C, n_num_words)                          # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.copy_crt_y2x(W.C, N.C)                                       # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input  (c.bnk.crt_x, W.N,       I.P)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_y, W.N,       I.Q)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_x, W.A,       I.P_FACTOR)    # | ...         / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_y, W.A,       I.Q_FACTOR)    # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | ..        |
    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.P_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.Q_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_x, N.A,       I.P_FACTOR)    # | [PQ]_FACTOR / ...      | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_y, N.A,       I.Q_FACTOR)    # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / .. |
    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_reduce(N.C, W.D, N.D, pq)                            # | [PQ]_FACTOR            | [XY]F |  YMB             | [PQ]MBZ | QINV      | [PQ]MBZ = YMB mod [PQ]
    c.modular_multiply(W.D, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBZ | QINV      | [PQ]MB = [PQ]MBZ * [PQ]_FACTOR
    c.modular_multiply(W.C, N.A, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBF | QINV      | [PQ]MBF = [PQ]MB * [PQ]_FACTOR
    c.modular_multiply(W.A, N.I, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]IF           | [PQ]MBF | QINV      | [PQ]IF = 1 * [PQ]_FACTOR
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.copy_ladders_x2y(W.D, N.D, W.C, N.C)                         # | [PQ]_FACTOR            | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    ###########################                                    # |                        |       |                  |         |           |
    # Begin Montgomery Ladder #                                    # |                        |       |                  |         |           |
    ###########################                                    # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
    for bit in range(_WORD_WIDTH * pq - 1, -1, -1):                # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
        m  = get_ladder_mode_using_crt(v, bit)                     # |                        |       |                  |         |           |
        dbg = bit == DUMP_LADDER_INDEX                             # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
        if dbg:                                                    # |                        |       |                  |         |           |
            if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C) # |                        |       |                  |         |           |
            if DUMP_VECTORS: c.dump_before_step_using_crt(pq, m)   # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
        c.modular_multiply(W.C, N.C, W.C, N.C, pq, mode=m, d=dbg)  # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]MBF | QINV      | <LADDER>
                                                                   # +------------------------+-------+------------------+---------+-----------+
        if dbg and DUMP_VECTORS: c.dump_after_step_using_crt()     # |                        |       |                  |         |           |
        print_ladder_progress(bit, pq)                             # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
    #########################                                      # |                        |       |                  |         |           |
    # End Montgomery Ladder #                                      # |                        |       |                  |         |           |
    #########################                                      # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.C, N.I, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      | [PQ]SB = [PQ]SBF * 1
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.D, pq)                                   # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.cross_ladders_x2y(W.D, N.D, W.D, N.D)                        # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB* | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_subtract(N.D, N.C, W.C, pq)                          # | [PQ]_FACTOR            | [XY]F |  RSB             | [PQ]SB* | QINV      | RSB = PSB - QSB
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.C, N.E, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBIZ           | [PQ]SB* | QINV      | RSBIZ = RSB * QINV
    c.modular_multiply(W.C, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | QINV      | RSBI = RSBIZ * P_FACTOR
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR / N_FACTOR | [XY]F |  RSBI            | [PQ]SB* | ..        |
    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR / N_FACTOR | [XY]F |  RSBI            | [PQ]SB* | Q / QINV  |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / ..    |
    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.regular_multiply(W.E, N.C, pq)                               # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         | = RSBI * Q
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.merge_lha(N.A, pq)                                           # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.A, n)                                    # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.copy_crt_y2x(W.D, N.D)                                       # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            |  QSB*   | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.regular_add(N.D, N.A, N.C, pq)                               # | [PQ]_FACTOR / QRSBI    | [XY]F |  SB              |  QSB*   | Q         | SB = QSB + RSBI
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input  (c.bnk.crt_x, W.N, I.N)                 # |                        |       |                  |         |           |
    c.set_wide_from_input  (c.bnk.crt_y, W.N, I.N)                 # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.B, N.C, W.A, N.A, n, ff)                  # |  S                     |       |                  |         |           | S = XF * SB
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.A, n)                                    # |  S                     |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_output_from_narrow(O.S, c.bnk.crt_x, N.A)                # |  S                     |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+

#
# try to exponentiate using only half of the quad-multiplier (one dual-ladder core)
#
def sign_without_crt():
    """Run the full-length (non-CRT) signing sequence on the global core.

    Works on the module-level names `core`, `vector` and `n_num_words` and on
    the W / N / I / O enum shortcuts, all of which are set up by the
    __main__ block.  Takes no arguments and returns nothing; the results are
    stored in the core outputs:

      * O.XM and O.YM - taken from narrow bank D of crt_x and crt_y,
      * O.S           - taken from narrow bank B of crt_y.

    Every operation below is issued with the full word count `n`.
    """

    c  = core
    v  = vector
    n  = n_num_words

    # mode value passed explicitly to the two multiplications tagged mode=ff
    # NOTE(review): the meaning of the pair is defined by modular_multiply(),
    # which is not visible from here
    ff = (False, False)

    # load wide banks, same selectors for crt_x and crt_y: N <- N, E <- M,
    # while A receives X in crt_x and Y in crt_y
    c.set_wide_from_input   (c.bnk.crt_x, W.N,       I.N)
    c.set_wide_from_input   (c.bnk.crt_y, W.N,       I.N)
    c.set_wide_from_input   (c.bnk.crt_x, W.A,       I.X)
    c.set_wide_from_input   (c.bnk.crt_y, W.A,       I.Y)
    c.set_wide_from_input   (c.bnk.crt_x, W.E,       I.M)
    c.set_wide_from_input   (c.bnk.crt_y, W.E,       I.M)

    # load narrow banks, identical for crt_x and crt_y:
    # N_COEFF <- N_COEFF, A <- N_FACTOR, E <- M
    c.set_narrow_from_input (c.bnk.crt_x, N.N_COEFF, I.N_COEFF)
    c.set_narrow_from_input (c.bnk.crt_y, N.N_COEFF, I.N_COEFF)
    c.set_narrow_from_input (c.bnk.crt_x, N.A,       I.N_FACTOR)
    c.set_narrow_from_input (c.bnk.crt_y, N.A,       I.N_FACTOR)
    c.set_narrow_from_input (c.bnk.crt_x, N.E,       I.M)
    c.set_narrow_from_input (c.bnk.crt_y, N.E,       I.M)

    c.modular_multiply(W.A, N.A, W.B, N.B, n)           # [XY]F = [XY] * N_FACTOR
    c.modular_multiply(W.B, N.B, W.C, N.C, n, mode=ff)  # [XY]MF = [XY]F * [XY]F
    c.modular_multiply(W.C, N.I, W.D, N.D, n)           # [XY]M = [XY]MF * 1

    c.propagate_carries(N.D, n)

    # publish XM / YM from narrow bank D of crt_x / crt_y
    c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)
    c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)

    c.modular_multiply(W.E, N.B, W.C, N.C, n)   # [XY]MB = M * [XY]F

    # NOTE(review): XF is read here but never referenced again in this function
    XF = c.bnk.crt_x.ladder_x._get_narrow(N.B)

    # wide bank A held X / Y so far, it is now overwritten with N_FACTOR
    c.set_wide_from_input(c.bnk.crt_x, W.A, I.N_FACTOR)
    c.set_wide_from_input(c.bnk.crt_y, W.A, I.N_FACTOR)

    c.modular_multiply(W.C, N.A, W.D, N.D, n)   # MBF = MB * N_FACTOR
    c.modular_multiply(W.A, N.I, W.C, N.C, n)   # IF = 1 * N_FACTOR

    c.copy_ladders_x2y(W.D, N.D, W.C, N.C)

    ###########################
    # Begin Montgomery Ladder #
    ###########################

    # one step per bit index, counting down from _WORD_WIDTH * n - 1 to 0
    for bit in range(_WORD_WIDTH * n - 1, -1, -1):

        # per-step mode derived from the test vector for this bit index
        m  = get_ladder_mode_without_crt(v, bit)

        # debug output is produced only at the step selected by DUMP_LADDER_INDEX
        dbg = bit == DUMP_LADDER_INDEX

        if dbg:
            if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C)
            if DUMP_VECTORS: c.dump_before_step_without_crt(n, m)

        # the ladder step itself: bank C is both the source and the destination
        c.modular_multiply(W.C, N.C, W.C, N.C, n, mode=m, d=dbg)

        if dbg and DUMP_VECTORS: c.dump_after_step_without_crt()
        print_ladder_progress(bit, n)

    #########################
    # End Montgomery Ladder #
    #########################

    c.cross_ladders_x2y(W.B, N.B, W.B, N.B)

    c.modular_multiply(W.C, N.I, W.D, N.D, n)           # SB = SBF * 1
    c.modular_multiply(W.B, N.D, W.A, N.A, n, mode=ff)  # S = XF * SB

    c.copy_ladders_y2x(W.A, N.A, W.B, N.B)

    c.propagate_carries(N.B, n)

    # the signature is taken from narrow bank B of crt_y
    c.set_output_from_narrow(O.S, c.bnk.crt_y, N.B)


#
# main()
#
if __name__ == "__main__":

    # handy shortcuts (module-level names, also used by sign_using_crt()
    # and sign_without_crt())
    W = ModExpNG_WideBankEnum
    N = ModExpNG_NarrowBankEnum
    I = ModExpNG_CoreInputEnum
    O = ModExpNG_CoreOutputEnum

    # set helper quantity: the number 1 as a KEY_LENGTH-bit operand
    i = ModExpNG_Operand(1, KEY_LENGTH)

    # instantiate core (receives the helper quantity) and load test vector
    core   = ModExpNG_Core(i)
    vector = ModExpNG_TestVector()

    # transfer numbers from vector to core: message
    core.inp.set_value(I.M,        vector.m)

    # ...moduli
    core.inp.set_value(I.N,        vector.n)
    core.inp.set_value(I.P,        vector.p)
    core.inp.set_value(I.Q,        vector.q)

    # ...per-modulus coefficients
    core.inp.set_value(I.N_COEFF,  vector.n_coeff)
    core.inp.set_value(I.P_COEFF,  vector.p_coeff)
    core.inp.set_value(I.Q_COEFF,  vector.q_coeff)

    # ...per-modulus factors
    core.inp.set_value(I.N_FACTOR, vector.n_factor)
    core.inp.set_value(I.P_FACTOR, vector.p_factor)
    core.inp.set_value(I.Q_FACTOR, vector.q_factor)

    # ...blinding quantities
    core.inp.set_value(I.X,        vector.x)
    core.inp.set_value(I.Y,        vector.y)

    # ...QINV
    core.inp.set_value(I.QINV,     vector.qinv)

    # set numbers of words: full length for N, half of that for P and Q
    n_num_words  = KEY_LENGTH  // _WORD_WIDTH
    pq_num_words = n_num_words // 2

    # obtain known good reference value with built-in math: m ** d mod n
    s_known = pow(vector.m.number(), vector.d.number(), vector.n.number())

    # mutate blinding quantities with built-in math: x ** 2 mod n, y ** 2 mod n
    xm_known = pow(vector.x.number(), 2, vector.n.number())
    ym_known = pow(vector.y.number(), 2, vector.n.number())

    # sign using CRT and check
    print("Signing using CRT...")
    sign_using_crt()
    compare_signature()

    # sign without CRT and check
    print("Signing without CRT...")
    sign_without_crt()
    compare_signature()


#
# End-of-File
#
#!/usr/bin/python3
#
#
# ModExpNG core math model.
#
#
# Copyright (c) 2019, NORDUnet A/S
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# - Neither the name of the NORDUnet nor the names of its contributors may
#   be used to endorse or promote products derived from this software
#   without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#


# -------
# Imports
# -------

import sys
import importlib
from enum import Enum, auto


# --------------
# Model Settings
# --------------

# length of public key (divided by the word width it gives the number of words)
KEY_LENGTH = 1024

# number of multipliers working in parallel
NUM_MULTS = 8


# ---------------
# Internal Values
# ---------------

# P and Q related quantities are half as long as the key
_KEY_LENGTH_HALF = KEY_LENGTH // 2

# internal math pipeline: regular and extended word widths
_WORD_WIDTH, _WORD_WIDTH_EXT = 16, 18

# all-ones masks for both widths; the carry mask keeps only those bits of an
# extended word that lie above the regular word width
_WORD_MASK     = (1 << _WORD_WIDTH)     - 1
_WORD_MASK_EXT = (1 << _WORD_WIDTH_EXT) - 1
_CARRY_MASK    = _WORD_MASK_EXT & ~_WORD_MASK

# test vector scripts are looked up in this folder
_VECTOR_PATH = "/vector"

# class that every test vector script is expected to define
_VECTOR_CLASS = "Vector"


# ------------------
# Debugging Settings
# ------------------

# ladder step at which the debug vector gets printed
DUMP_LADDER_INDEX = -1

# what to print
DUMP_VECTORS           = False  # entire debug vector components
DUMP_INDICES           = False  # indices of words at MAC inputs
DUMP_MACS_INPUTS       = False  # MAC input words
DUMP_MACS_CLEARING     = False  # MAC clearing bitmaps
DUMP_MACS_ACCUMULATION = False  # MAC accumulators contents
DUMP_MULT_PARTS        = False  # multiplication output parts
DUMP_RECOMBINATION     = False  # recombination internals
DUMP_REDUCTION         = False  # reduction internals

# force rarely seen internal overflow situation to verify how its handler works
FORCE_OVERFLOW = False

# the progress indicator is updated once per this many ladder steps
DUMP_PROGRESS_FACTOR = 16

#
# Multi-Precision Integer
#
class ModExpNG_Operand():
    """Multi-precision integer kept as a list of words, least significant first.

    An operand is built in one of two ways:

      * from a Python integer: `words` is None, `number` is the value and
        `length` is the operand size in bits (a multiple of _WORD_WIDTH);
      * from ready-made words: `words` is a sequence, `length` is the number
        of words it must contain and `number` is not used.
    """

    def __init__(self, number, length, words = None):
        """Create the operand, see the class description for the two modes.

        Raises Exception when `length` does not fit the chosen mode or when
        the supplied value does not fit into the operand.
        """

        if words is None:

            # length must be divisible by word width
            if (length % _WORD_WIDTH) > 0:
                raise Exception("Bad number length!")

            self._init_from_number(number, length)

        else:

            # length must match words count
            if len(words) != length:
                raise Exception("Bad words count!")

            self._init_from_words(words, length)

    def format_verilog_concat(self, name):
        """Print one `name[index] = 18'h.....;` assignment per word, four per line."""

        for i, word in enumerate(self.words):
            if i > 0:
                # start a new line after every fourth assignment,
                # otherwise just separate with a space
                if (i % 4) == 0: print("")
                else:            print(" ", end='')
            print("%s[%3d] = 18'h%05x;" % (name, i, word), end='')
        print("")

    def _init_from_words(self, words, count):
        """Store a copy of `words` after checking the first `count` of them."""

        # word must not exceed the extended word width
        limit = 1 << _WORD_WIDTH_EXT
        for i in range(count):
            if words[i] >= limit:
                raise Exception("Word is too large!")

        self.words = list(words)

    def _init_from_number(self, number, length):
        """Split `number` into words of _WORD_WIDTH bits covering `length` bits."""

        # a negative value would carry a '-' sign into the hex string below
        # and then either end up as a negative word or trip the hex parser
        # with an unrelated error, so reject it explicitly up front
        if number < 0:
            raise Exception("Number must not be negative!")

        # one hex digit carries four bits; the total digit count follows from
        # the operand length, not from the number of digits per word
        bits_per_hexchar = 4
        num_hexchars_per_word = _WORD_WIDTH // bits_per_hexchar
        num_hexchars_total = length // bits_per_hexchar

        value_hex = format(number, 'x')

        # value must not be larger than specified, but it can be smaller, so
        # we may need to prepend it with zeroes
        if len(value_hex) > num_hexchars_total:
            raise Exception("Number is too large!")
        value_hex = value_hex.rjust(num_hexchars_total, "0")

        # fill in words starting from the least significant end of the string
        self.words = list()
        while len(value_hex) > 0:
            value_hex_part = value_hex[-num_hexchars_per_word:]
            value_hex = value_hex[:-num_hexchars_per_word]
            self.words.append(int(value_hex_part, 16))

    def number(self):
        """Return the operand as a Python integer.

        Words are summed rather than OR-ed together, so words wider than
        _WORD_WIDTH bits spill into the positions of the following words.
        """
        return sum(word << (index * _WORD_WIDTH)
                   for index, word in enumerate(self.words))

    def _get_half(self, part):
        """Return a new operand made of the lower (part is false) or upper half of the words."""
        num_words = len(self.words)
        num_words_half = num_words // 2
        if not part: return ModExpNG_Operand(None, num_words_half, self.words[:num_words_half])
        else:        return ModExpNG_Operand(None, num_words_half, self.words[num_words_half:])

    def lower_half(self):
        """Return the less significant half of the words as a new operand."""
        return self._get_half(False)

    def upper_half(self):
        """Return the more significant half of the words as a new operand."""
        return self._get_half(True)

#
# Test Vector
#
class ModExpNG_TestVector():
    """Test vector read from a vector script, every quantity wrapped in ModExpNG_Operand."""

    def __init__(self):
        """Import the vector module for the current KEY_LENGTH and convert its fields."""

        # the module to import is named after the key length
        module_name = "vector_%s_randomized" % (KEY_LENGTH,)

        # make the vector folder (relative to sys.path[0]) importable
        search_dir = sys.path[0] + _VECTOR_PATH
        sys.path.insert(1, search_dir)

        # import the module, look up the vector class in it and instantiate it
        source = getattr(importlib.import_module(module_name), _VECTOR_CLASS)()

        full = KEY_LENGTH
        half = _KEY_LENGTH_HALF

        # field name and operand length; the *_coeff quantities are one word
        # longer than the modulus they belong to
        layout = (
            ("m",        full),
            ("n",        full),
            ("d",        full),
            ("p",        half),
            ("q",        half),
            ("dp",       half),
            ("dq",       half),
            ("qinv",     half),
            ("n_factor", full),
            ("p_factor", half),
            ("q_factor", half),
            ("n_coeff",  full + _WORD_WIDTH),
            ("p_coeff",  half + _WORD_WIDTH),
            ("q_coeff",  half + _WORD_WIDTH),
            ("x",        full),
            ("y",        full),
        )

        # obtain parts of vector
        for field, length in layout:
            setattr(self, field, ModExpNG_Operand(getattr(source, field), length))

class ModExpNG_WideBankEnum(Enum):
    # selectors of the slots in a wide bank, numbered consecutively from 1
    A = 1
    B = 2
    C = 3
    D = 4
    E = 5
    N = 6
    L = 7
    H = 8

class ModExpNG_NarrowBankEnum(Enum):
    # selectors of the slots in a narrow bank, numbered consecutively from 1
    A = 1
    B = 2
    C = 3
    D = 4
    E = 5
    N_COEFF = 6
    I = 7

class ModExpNG_CoreInputEnum(Enum):
    # selectors of the core inputs, numbered consecutively from 1

    # message
    M = 1

    # moduli
    N = 2
    P = 3
    Q = 4

    # per-modulus coefficients
    N_COEFF = 5
    P_COEFF = 6
    Q_COEFF = 7

    # per-modulus factors
    N_FACTOR = 8
    P_FACTOR = 9
    Q_FACTOR = 10

    # X and Y
    X = 11
    Y = 12

    # QINV
    QINV = 13

class ModExpNG_CoreOutputEnum(Enum):
    # selectors of the core outputs, numbered consecutively from 1
    XM = 1
    YM = 2
    S = 3

class ModExpNG_WideBank():
    """Wide bank: eight slots addressed through ModExpNG_WideBankEnum selectors."""

    def __init__(self):
        # all slots start out empty
        self.a = self.b = self.c = self.d = None
        self.e = self.n = self.l = self.h = None

    def _slot_for(self, sel):
        """Return the attribute name of the slot chosen by `sel`, or None for an unknown selector."""
        E = ModExpNG_WideBankEnum
        slots = ((E.A, "a"), (E.B, "b"), (E.C, "c"), (E.D, "d"),
                 (E.E, "e"), (E.N, "n"), (E.L, "l"), (E.H, "h"))
        for member, slot in slots:
            if sel == member:
                return slot
        return None

    def _get_value(self, sel):
        """Return the content of the selected slot; raises Exception on an unknown selector."""
        slot = self._slot_for(sel)
        if slot is None:
            raise Exception("ModExpNG_WideBank._get_value(): Invalid selector!")
        return getattr(self, slot)

    def _set_value(self, sel, value):
        """Store `value` in the selected slot; raises Exception on an unknown selector."""
        slot = self._slot_for(sel)
        if slot is None:
            raise Exception("ModExpNG_WideBank._set_value(): Invalid selector!")
        setattr(self, slot, value)

class ModExpNG_NarrowBank():
    """Narrow bank: seven slots addressed through ModExpNG_NarrowBankEnum selectors.

    Slot I is filled by the constructor and can only be read back, there is
    no way to overwrite it through _set_value().
    """

    def __init__(self, i):
        # writable slots start out empty
        self.a = self.b = self.c = self.d = self.e = None
        self.n_coeff = None
        # slot I keeps whatever the creator passed in
        self.i = i

    def _slot_for(self, sel, readable_only):
        """Return the attribute name of the slot chosen by `sel`, or None for an unknown selector.

        Slot I is only part of the lookup when `readable_only` is true.
        """
        E = ModExpNG_NarrowBankEnum
        slots = [(E.A, "a"), (E.B, "b"), (E.C, "c"), (E.D, "d"),
                 (E.E, "e"), (E.N_COEFF, "n_coeff")]
        if readable_only:
            slots.append((E.I, "i"))
        for member, slot in slots:
            if sel == member:
                return slot
        return None

    def _get_value(self, sel):
        """Return the content of the selected slot; raises Exception on an unknown selector."""
        slot = self._slot_for(sel, True)
        if slot is None:
            raise Exception("ModExpNG_NarrowBank._get_value(): Invalid selector!")
        return getattr(self, slot)

    def _set_value(self, sel, value):
        """Store `value` in the selected slot; raises Exception on an unknown selector, I included."""
        slot = self._slot_for(sel, False)
        if slot is None:
            raise Exception("ModExpNG_NarrowBank._set_value(): Invalid selector!")
        setattr(self, slot, value)

class ModExpNG_CoreInput():
    """Values fed into the core, one slot per ModExpNG_CoreInputEnum selector."""

    def __init__(self):
        # every input starts out unset
        for slot in ("_m",
                     "_n", "_p", "_q",
                     "_n_coeff", "_p_coeff", "_q_coeff",
                     "_n_factor", "_p_factor", "_q_factor",
                     "_x", "_y",
                     "_qinv"):
            setattr(self, slot, None)

    def _slot_for(self, sel):
        """Return the attribute name of the input chosen by `sel`, or None for an unknown selector."""
        E = ModExpNG_CoreInputEnum
        slots = (
            (E.M,        "_m"),

            (E.N,        "_n"),
            (E.P,        "_p"),
            (E.Q,        "_q"),

            (E.N_COEFF,  "_n_coeff"),
            (E.P_COEFF,  "_p_coeff"),
            (E.Q_COEFF,  "_q_coeff"),

            (E.N_FACTOR, "_n_factor"),
            (E.P_FACTOR, "_p_factor"),
            (E.Q_FACTOR, "_q_factor"),

            (E.X,        "_x"),
            (E.Y,        "_y"),

            (E.QINV,     "_qinv"),
        )
        for member, slot in slots:
            if sel == member:
                return slot
        return None

    def set_value(self, sel, value):
        """Store `value` as the selected input; raises Exception on an unknown selector."""
        slot = self._slot_for(sel)
        if slot is None:
            raise Exception("ModExpNG_CoreInput.set_value(): invalid selector!")
        setattr(self, slot, value)

    def _get_value(self, sel):
        """Return the selected input; raises Exception on an unknown selector."""
        slot = self._slot_for(sel)
        if slot is None:
            raise Exception("ModExpNG_CoreInput._get_value(): invalid selector!")
        return getattr(self, slot)

class ModExpNG_CoreOutput():
    """Values produced by the core, one slot per ModExpNG_CoreOutputEnum selector."""

    def __init__(self):
        # nothing has been produced yet
        self._xm = self._ym = self._s = None

    def _slot_for(self, sel):
        """Return the attribute name of the output chosen by `sel`, or None for an unknown selector."""
        E = ModExpNG_CoreOutputEnum
        for member, slot in ((E.XM, "_xm"), (E.YM, "_ym"), (E.S, "_s")):
            if sel == member:
                return slot
        return None

    def _set_value(self, sel, value):
        """Store `value` as the selected output; raises Exception on an unknown selector."""
        slot = self._slot_for(sel)
        if slot is None:
            raise Exception("ModExpNG_CoreOutput._set_value(): invalid selector!")
        setattr(self, slot, value)

    def get_value(self, sel):
        """Return the selected output; raises Exception on an unknown selector."""
        slot = self._slot_for(sel)
        if slot is None:
            raise Exception("ModExpNG_CoreOutput.get_value(): invalid selector!")
        return getattr(self, slot)

class ModExpNG_BanksPair():
    """One wide bank plus one narrow bank; reads and writes are forwarded to the matching bank."""

    def __init__(self, i):
        # `i` is handed over to the narrow bank only
        wide_bank = ModExpNG_WideBank()
        narrow_bank = ModExpNG_NarrowBank(i)
        self.wide, self.narrow = wide_bank, narrow_bank

    def _get_wide(self, sel):
        """Read the slot `sel` of the wide bank."""
        bank = self.wide
        return bank._get_value(sel)

    def _get_narrow(self, sel):
        """Read the slot `sel` of the narrow bank."""
        bank = self.narrow
        return bank._get_value(sel)

    def _set_wide(self, sel, value):
        """Write `value` into the slot `sel` of the wide bank."""
        bank = self.wide
        bank._set_value(sel, value)

    def _set_narrow(self, sel, value):
        """Write `value` into the slot `sel` of the narrow bank."""
        bank = self.narrow
        bank._set_value(sel, value)

class ModExpNG_BanksLadder():
    """Two separate bank pairs, ladder_x and ladder_y, both created with the same `i`."""

    def __init__(self, i):
        pair_x = ModExpNG_BanksPair(i)
        pair_y = ModExpNG_BanksPair(i)
        self.ladder_x, self.ladder_y = pair_x, pair_y

class ModExpNG_BanksCRT():
    """Two separate ladder bank sets, crt_x and crt_y, both created with the same `i`."""

    def __init__(self, i):
        banks_x = ModExpNG_BanksLadder(i)
        banks_y = ModExpNG_BanksLadder(i)
        self.crt_x, self.crt_y = banks_x, banks_y

class ModExpNG_PartRecombinator():
    """Turns a sequence of multiplier output parts into a list of words.

    Parts are pushed one per tick through three chained latches (z0, y0, x0).
    Every tick releases the lower _WORD_WIDTH bits of the rightmost latch as
    the next output word.
    """

    def _bit_select(self, x, msb, lsb):
        """Return bits lsb..msb (inclusive) of `x`, moved down so that bit lsb becomes bit 0."""
        span = msb - lsb + 1
        if span <= 0:
            # empty range, nothing is selected
            return 0
        return (x >> lsb) & ((1 << span) - 1)

    def _flush_pipeline(self, dump):
        """Reset all three latches to zero."""
        self.x0 = self.y0 = self.z0 = 0
        if dump and DUMP_RECOMBINATION:
            print("RCMB -> flush()")

    def _push_pipeline(self, part, dump):
        """Feed one part into the pipeline and return the word that leaves it."""

        # slice the part: bits 46..32, 31..16 and 15..0
        hi  = self._bit_select(part, 46, 32)
        mid = self._bit_select(part, 31, 16)
        lo  = self._bit_select(part, 15,  0)

        # the word leaving the pipeline is the lower part of the rightmost latch
        released = self.x0 & _WORD_MASK

        # whatever the rightmost latch holds above the word width is carried
        # into its next value
        # IMPORTANT: This carry can be up to two bits wide!!
        carry = self.x0 >> _WORD_WIDTH

        # everything moves one cell to the right, picking up the new slices
        next_x = lo + self.y0 + carry
        next_y = mid + self.z0
        next_z = hi
        self.z0, self.y0, self.x0 = next_z, next_y, next_x

        if dump and DUMP_RECOMBINATION:
            print("RCMB -> push(): part = 0x%012x, word = 0x%04x" % (part, released))

        return released

    def _drain_pipeline(self, parts, first, num_parts, num_ticks, dump):
        """Flush the pipeline, clock it `num_ticks` times and collect the released words.

        During the first `num_parts` ticks the pipeline is fed with
        parts[first], parts[first + 1], ...; afterwards with zeroes. The word
        released by the very first tick is a null result and is dropped.
        """
        self._flush_pipeline(dump)
        collected = list()
        for tick in range(num_ticks):
            feed = parts[first + tick] if tick < num_parts else 0
            released = self._push_pipeline(feed, dump)
            if tick > 0:
                collected.append(released)
        return collected

    def _join_halves(self, lower, upper, ab_num_words, num_upper_words):
        """Concatenate the halves, folding the two overflow words of `lower` into `upper`.

        The first `ab_num_words` words of `lower` are taken as they are. The
        two words that follow them overlap the first two words of `upper` and
        are added to those; the sums are stored as they are.
        """
        joined = [lower[k] for k in range(ab_num_words)]
        for k in range(num_upper_words):
            word = upper[k]
            if k < 2:
                word += lower[k + ab_num_words]
            joined.append(word)
        return joined

    def recombine_square(self, parts, ab_num_words, dump):
        """Recombine n lower and n-1 upper parts into 2n words (n = ab_num_words)."""

        # lower half (n parts): the first tick produces null result, the last
        # part produces three words and needs two extra ticks
        lower = self._drain_pipeline(parts, 0, ab_num_words, ab_num_words + 1 + 2, dump)

        # upper half (n-1 parts): the first tick produces null result
        upper = self._drain_pipeline(parts, ab_num_words, ab_num_words - 1, ab_num_words + 1, dump)

        return self._join_halves(lower, upper, ab_num_words, ab_num_words)

    def recombine_triangle(self, parts, ab_num_words, dump):
        """Recombine n+1 parts into n+1 words (n = ab_num_words)."""

        # the first tick produces null result, so n + 2 ticks are needed to
        # collect n + 1 words
        return self._drain_pipeline(parts, 0, ab_num_words + 1, ab_num_words + 2, dump)

    def recombine_rectangle(self, parts, ab_num_words, dump):
        """Recombine n lower and n upper parts into 2n+1 words (n = ab_num_words)."""

        # lower half (n parts): the first tick produces null result, the last
        # part produces three words and needs two extra ticks
        lower = self._drain_pipeline(parts, 0, ab_num_words, ab_num_words + 1 + 2, dump)

        # upper half (n parts): the first tick produces null result, the last
        # part produces two words and needs an extra tick
        upper = self._drain_pipeline(parts, ab_num_words, ab_num_words, ab_num_words + 2, dump)

        return self._join_halves(lower, upper, ab_num_words, ab_num_words + 1)

class ModExpNG_WordMultiplier():
    """Model of NUM_MULTS parallel multiply-accumulate units (MACs) plus one
    auxiliary MAC.

    Every MAC has an associated index that selects which word of the wide
    operand it currently multiplies; the same word of the narrow operand is
    fed to all MACs at each time step `t`.  Operands are processed in
    columns of NUM_MULTS words (`ab_num_words // NUM_MULTS` columns).

    The multiply_* methods return lists of accumulated column sums
    ("parts") and take operands that expose a `.words` list.
    """

    def __init__(self):
        """Create NUM_MULTS zeroed MACs and indices plus the auxiliary pair."""

        self._macs = [0] * NUM_MULTS
        self._indices = [0] * NUM_MULTS

        # the auxiliary MAC and its index are kept in one-element lists
        self._mac_aux = [0]
        self._index_aux = [0]

    def _clear_all_macs(self, t, col, dump):
        """Zero all regular MACs (t and col are only used for debug output)."""
        for x in range(NUM_MULTS):
            self._macs[x] = 0
        if dump and DUMP_MACS_CLEARING:
            print("t=%2d, col=%2d > clear > all" % (t, col))

    def _clear_one_mac(self, x, t, col, dump):
        """Zero regular MAC number x (t and col are only used for debug output)."""
        self._macs[x] = 0
        if dump and DUMP_MACS_CLEARING:
            print("t=%2d, col=%2d > clear > x=%d" % (t, col, x))

    def _clear_mac_aux(self, t, col, dump):
        """Zero the auxiliary MAC (t and col are only used for debug output)."""
        self._mac_aux[0] = 0
        if dump and DUMP_MACS_CLEARING:
            # report the actual time step, like the sibling _clear_* methods do
            print("t=%2d, col=%2d > clear > aux" % (t, col))

    def _update_one_mac(self, x, t, col, a, b, dump, need_aux=False):
        """Accumulate a * b into regular MAC number x.

        Raises Exception when a does not fit into _WORD_WIDTH_EXT bits or
        b does not fit into _WORD_WIDTH bits.  When need_aux is set, the
        debug line is left open so that _update_mac_aux() can finish it.
        """

        if a >= (2 ** _WORD_WIDTH_EXT):
            raise Exception("a > 0x3FFFF!")

        if b >= (2 ** _WORD_WIDTH):
            raise Exception("b > 0xFFFF!")

        p = a * b
        if dump and DUMP_MACS_INPUTS:
            if x == 0: print("t=%2d, col=%2d > b=%05x > " % (t, col, b), end='')
            if x > 0: print("; ", end='')
            print("MAC[%d]: a=%05x" % (x, a), end='')
            if x == (NUM_MULTS-1) and not need_aux: print("")

        self._macs[x] += p

    def _update_mac_aux(self, y, col, a, b, dump):
        """Accumulate a * b into the auxiliary MAC.

        Same operand range checks as _update_one_mac(); y and col are
        accepted for symmetry but are not used.
        """

        if a >= (2 ** _WORD_WIDTH_EXT):
            raise Exception("a > 0x3FFFF!")

        if b >= (2 ** _WORD_WIDTH):
            raise Exception("b > 0xFFFF!")

        p = a * b
        if dump and DUMP_MACS_INPUTS:
            print("; AUX: a=%05x" % a)

        self._mac_aux[0] += p

    def _preset_indices(self, col):
        """Point MAC x at word col * NUM_MULTS + x (start of a column)."""
        for x in range(len(self._indices)):
            self._indices[x] = col * len(self._indices) + x

    def _preset_index_aux(self, num_cols):
        """Point the auxiliary MAC at word num_cols * NUM_MULTS."""
        self._index_aux[0] = num_cols * len(self._indices)

    def _dump_macs_helper(self, t, col, aux=False):
        """Print all MAC accumulators (and the auxiliary one when aux is set)."""
        print("t=%2d, col=%2d > "% (t, col), end='')
        for i in range(NUM_MULTS):
            if i > 0: print(" | ", end='')
            print("mac[%d]: 0x%012x" % (i, self._macs[i]), end='')
        if aux:
            print(" | mac_aux[ 0]: 0x%012x" % (self._mac_aux[0]), end='')
        print("")

    def _dump_macs(self, t, col):
        """Print the regular MAC accumulators."""
        self._dump_macs_helper(t, col)

    def _dump_macs_with_aux(self, t, col):
        """Print the regular and the auxiliary MAC accumulators."""
        self._dump_macs_helper(t, col, True)

    def _dump_indices_helper(self, t, col, aux=False):
        """Print all MAC indices (and the auxiliary one when aux is set)."""
        print("t=%2d, col=%2d > indices:" % (t, col), end='')
        for i in range(NUM_MULTS):
            print(" %2d" % self._indices[i], end='')
        if aux:
            print(" %2d" % self._index_aux[0], end='')
        print("")

    def _dump_indices(self, t, col):
        """Print the regular MAC indices."""
        self._dump_indices_helper(t, col)

    def _dump_indices_with_aux(self, t, col):
        """Print the regular and the auxiliary MAC indices."""
        self._dump_indices_helper(t, col, True)

    def _rotate_indices(self, num_words):
        """Decrement every index, wrapping from 0 around to num_words - 1."""
        for x in range(len(self._indices)):
            if self._indices[x] > 0:
                self._indices[x] -= 1
            else:
                self._indices[x] = num_words - 1

    def _rotate_index_aux(self):
        """Decrement the auxiliary index (no wrap-around)."""
        self._index_aux[0] -= 1

    def _mult_store_part(self, parts, time, column, part_index, mac_index, dump):
        """Copy the current value of MAC mac_index into parts[part_index]."""
        parts[part_index] = self._macs[mac_index]
        if dump and DUMP_MULT_PARTS:
            print("t=%2d, col=%2d > parts[%2d]: mac[%d] = 0x%012x" %
                (time, column, part_index, mac_index, parts[part_index]))

    def _mult_store_part_aux(self, parts, time, column, part_index, dump):
        """Copy the current value of the auxiliary MAC into parts[part_index]."""
        parts[part_index] = self._mac_aux[0]
        if dump and DUMP_MULT_PARTS:
            print("t=%2d, col=%2d > parts[%2d]: mac_aux[%d] = 0x%012x" %
                (time, column, part_index, 0, parts[part_index]))

    def multiply_square(self, a_wide, b_narrow, ab_num_words, dump=False):
        """Multiply ab_num_words words of a_wide by ab_num_words words of b_narrow.

        Returns a list of 2 * ab_num_words - 1 parts.  Carries in the words
        of b_narrow are propagated on the fly, because the b-input of a
        multiplier only accepts _WORD_WIDTH-bit values.
        """

        num_cols = ab_num_words // NUM_MULTS

        parts = list()
        for i in range(2 * ab_num_words - 1):
            parts.append(0)

        for col in range(num_cols):

            b_carry = 0

            for t in range(ab_num_words):

                # take care of indices
                if t == 0: self._preset_indices(col)
                else:      self._rotate_indices(ab_num_words)

                # take care of macs: a mac whose lower part was stored during
                # the previous time step starts accumulating its upper part
                # (NUM_MULTS, not a literal 8, so the column test stays
                # consistent with the modulo right below it)
                if t == 0:
                    self._clear_all_macs(t, col, dump)
                else:
                    t1 = t - 1
                    if (t1 // NUM_MULTS) == col:
                        self._clear_one_mac(t1 % NUM_MULTS, t, col, dump)

                # debug output
                if dump and DUMP_INDICES: self._dump_indices(t, col)

                # current b-word
                # multiplier's b-input is limited to 16-bit words, so we need to propagate
                # carries on the fly here, carry can be up to two bits
                bt = b_narrow.words[t] + b_carry
                b_carry = (bt & _CARRY_MASK) >> _WORD_WIDTH
                if dump and b_carry > 1:
                    print("Rare overflow case was detected and then successfully corrected.")
                bt &= _WORD_MASK

                # multiply by a-words
                for x in range(NUM_MULTS):
                    ax = a_wide.words[self._indices[x]]
                    self._update_one_mac(x, t, col, ax, bt, dump)

                    # the lower part number t is complete at this very moment
                    if t == (col * NUM_MULTS + x):
                        part_index = t
                        self._mult_store_part(parts, t, col, part_index, x, dump)

                # debug output
                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs(t, col)

                # save the upper parts of product at end of column,
                # for the last column don't save the very last part
                if t == (ab_num_words - 1):
                    for x in range(NUM_MULTS):
                        if not (col == (num_cols - 1) and x == (NUM_MULTS - 1)):
                            part_index = ab_num_words + col * NUM_MULTS + x
                            self._mult_store_part(parts, t, col, part_index, x, dump)

        return parts

    def multiply_triangle(self, a_wide, b_narrow, ab_num_words, dump=False):
        """Compute only the ab_num_words + 1 lowest parts of a_wide * b_narrow.

        Reads words 0 .. ab_num_words of both operands.  The extra part
        number ab_num_words is accumulated by the auxiliary MAC while the
        last column is processed.
        """

        num_cols = ab_num_words // NUM_MULTS

        parts = list()
        for i in range(ab_num_words + 1):
            parts.append(0)

        for col in range(num_cols):

            last_col = col == (num_cols - 1)

            for t in range(ab_num_words + 1):

                # take care of indices
                if t == 0: self._preset_indices(col)
                else:      self._rotate_indices(ab_num_words)

                # take care of auxiliary index
                if last_col:
                    if t == 0: self._preset_index_aux(num_cols)
                    else:      self._rotate_index_aux()

                # take care of macs
                if t == 0: self._clear_all_macs(t, col, dump)

                # take care of auxiliary mac
                if last_col:
                    if t == 0: self._clear_mac_aux(t, col, dump)

                # debug output
                if dump and DUMP_INDICES: self._dump_indices_with_aux(t, col)

                # current b-word
                bt = b_narrow.words[t]

                # multiply by a-words
                for x in range(NUM_MULTS):
                    ax = a_wide.words[self._indices[x]]
                    self._update_one_mac(x, t, col, ax, bt, dump, last_col)

                    if t == (col * NUM_MULTS + x):
                        part_index = t
                        self._mult_store_part(parts, t, col, part_index, x, dump)

                # aux multiplier
                if last_col:
                    ax = a_wide.words[self._index_aux[0]]
                    self._update_mac_aux(t, col, ax, bt, dump)

                    if t == ab_num_words:
                        part_index = t
                        self._mult_store_part_aux(parts, t, col, part_index, dump)

                # debug output
                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs_with_aux(t, col)

                # shortcut: all the parts of a non-last column are stored
                # once its last mac has been saved
                if not last_col:
                    if t == (NUM_MULTS * (col + 1) - 1): break

        return parts

    def multiply_rectangle(self, a_wide, b_narrow, ab_num_words, dump=False):
        """Multiply ab_num_words words of a_wide by ab_num_words + 1 words of b_narrow.

        Returns a list of 2 * ab_num_words parts.
        """

        num_cols = ab_num_words // NUM_MULTS

        parts = list()
        for i in range(2 * ab_num_words):
            parts.append(0)

        for col in range(num_cols):

            for t in range(ab_num_words + 1):

                # take care of indices
                if t == 0: self._preset_indices(col)
                else:      self._rotate_indices(ab_num_words)

                # take care of macs: a mac whose lower part was stored during
                # the previous time step starts accumulating its upper part
                # (NUM_MULTS, not a literal 8, so the column test stays
                # consistent with the modulo right below it)
                if t == 0:
                    self._clear_all_macs(t, col, dump)
                else:
                    t1 = t - 1
                    if (t1 // NUM_MULTS) == col:
                        self._clear_one_mac(t1 % NUM_MULTS, t, col, dump)

                # debug output
                if dump and DUMP_INDICES: self._dump_indices(t, col)

                # current b-word
                bt = b_narrow.words[t]

                # multiply by a-words
                for x in range(NUM_MULTS):
                    ax = a_wide.words[self._indices[x]]
                    self._update_one_mac(x, t, col, ax, bt, dump)

                    # don't save one value for the very last time instant per column
                    if t < ab_num_words and t == (col * NUM_MULTS + x):
                        part_index = t
                        self._mult_store_part(parts, t, col, part_index, x, dump)

                # debug output
                if dump and DUMP_MACS_ACCUMULATION: self._dump_macs(t, col)

                # save the upper parts of product at end of column
                if t == ab_num_words:
                    for x in range(NUM_MULTS):
                        part_index = ab_num_words + col * NUM_MULTS + x
                        self._mult_store_part(parts, t, col, part_index, x, dump)

        return parts

class ModExpNG_LowlevelOperator():
    """Single-word adder and subtractor with carry / borrow in and out."""

    def _check_word(self, a):
        """Raise Exception unless 0 <= a <= _WORD_MASK."""
        if not (0 <= a <= _WORD_MASK):
            raise Exception("Word out of range!")

    def _check_carry_borrow(self, cb):
        """Raise Exception unless 0 <= cb <= 1."""
        if not (0 <= cb <= 1):
            raise Exception("Carry or borrow out of range!")

    def add_words(self, a, b, c_in):
        """Return (carry_out, sum_word) for a + b + c_in."""

        for word in (a, b):
            self._check_word(word)
        self._check_carry_borrow(c_in)

        total = a + b + c_in

        # everything above the word boundary is the outgoing carry
        return (total >> _WORD_WIDTH, total & _WORD_MASK)

    def sub_words(self, a, b, b_in):
        """Return (borrow_out, difference_word) for a - b - b_in."""

        for word in (a, b):
            self._check_word(word)
        self._check_carry_borrow(b_in)

        difference = a - b - b_in

        if difference >= 0:
            return (0, difference)

        # negative result: borrow from the next word and wrap around
        return (1, difference + 2 ** _WORD_WIDTH)

class ModExpNG_Worker():
    """Multi-word arithmetic built on top of the low-level operator, the
    word multiplier and the part recombinator.
    """

    def __init__(self):
        self.lowlevel     = ModExpNG_LowlevelOperator()
        self.multiplier   = ModExpNG_WordMultiplier()
        self.recombinator = ModExpNG_PartRecombinator()

    def serial_subtract_modular(self, a, b, n, ab_num_words):
        """Return a - b, or a - b + n when the subtraction borrows.

        Both candidates (a - b and a - b + n) are computed word by word in
        a single pass; the final borrow selects which one is returned.  The
        result is an ab_num_words-word ModExpNG_Operand.
        """
        c_in = 0
        b_in = 0
        ab = list()
        ab_n = list()
        for x in range(ab_num_words):
            a_word = a.words[x]
            b_word = b.words[x]
            (b_out, d_out) = self.lowlevel.sub_words(a_word, b_word, b_in)
            (c_out, s_out) = self.lowlevel.add_words(d_out, n.words[x], c_in)
            ab.append(d_out)
            ab_n.append(s_out)
            (c_in, b_in) = (c_out, b_out)
        # b_in now holds the borrow out of the last word (and stays zero when
        # there were no words at all, where reading b_out would fail)
        d = ab if not b_in else ab_n
        return ModExpNG_Operand(None, ab_num_words, d)

    def serial_add_uneven(self, a, b, ab_num_words):
        """Add ab_num_words-word a to 2*ab_num_words-word b.

        Words of a beyond ab_num_words are taken as zero; the carry out of
        the last word is dropped.  Returns a 2*ab_num_words-word operand.
        """
        c_in = 0
        ab = list()
        for x in range(2 * ab_num_words):
            a_word = a.words[x] if x < ab_num_words else 0
            b_word = b.words[x]
            (c_out, s_out) = self.lowlevel.add_words(a_word, b_word, c_in)
            ab.append(s_out)
            c_in = c_out
        return ModExpNG_Operand(None, 2*ab_num_words, ab)

    def multipurpose_multiply(self, a, b, n, n_coeff, ab_num_words, reduce_only=False, multiply_only=False, dump=False, dump_crt="", dump_ladder=""):
        """Multiply and/or reduce, depending on the flags.

        Steps: 1. AB = A * B,  2. Q = LSB(AB) * N_COEFF,  3. M = Q * N,
        4. R = upper part of (AB + M).

        reduce_only   -- skip step 1 and use b itself as AB (a is not used)
        multiply_only -- return AB right after step 1 (n and n_coeff are not
                         used)
        dump, dump_crt, dump_ladder -- debug output switch and the labels
                         used in it

        Returns AB when multiply_only is set, otherwise an ab_num_words-word
        operand R whose words are not carry-propagated.
        """

        #
        # 1. AB = A * B
        #
        if dump: print("multiply_square(%s_%s)" % (dump_crt, dump_ladder))

        if reduce_only:
            ab = b
        else:
            ab_parts = self.multiplier.multiply_square(a, b, ab_num_words, dump)
            ab_words = self.recombinator.recombine_square(ab_parts, ab_num_words, dump)
            ab = ModExpNG_Operand(None, 2 * ab_num_words, ab_words)

        if dump and DUMP_VECTORS:
            ab.format_verilog_concat("%s_%s_AB" % (dump_crt, dump_ladder))

        if multiply_only:
            # ab is the product operand built above (or b itself in
            # reduce_only mode, where no separate word list exists)
            return ab

        #
        # 2. Q = LSB(AB) * N_COEFF
        #
        if dump: print("multiply_triangle(%s_%s)" % (dump_crt, dump_ladder))

        q_parts = self.multiplier.multiply_triangle(ab, n_coeff, ab_num_words, dump)
        q_words = self.recombinator.recombine_triangle(q_parts, ab_num_words, dump)
        q = ModExpNG_Operand(None, ab_num_words + 1, q_words)

        if dump and DUMP_VECTORS:
            q.format_verilog_concat("%s_%s_Q" % (dump_crt, dump_ladder))

        #
        # 3. M = Q * N
        #
        if dump: print("multiply_rectangle(%s_%s)" % (dump_crt, dump_ladder))

        m_parts = self.multiplier.multiply_rectangle(n, q, ab_num_words, dump)
        m_words = self.recombinator.recombine_rectangle(m_parts, ab_num_words, dump)
        m = ModExpNG_Operand(None, 2 * ab_num_words + 1, m_words)

        if dump and DUMP_VECTORS:
            m.format_verilog_concat("%s_%s_M" % (dump_crt, dump_ladder))

        #
        # 4. R = AB + M
        #

        #
        # 4a. compute carry (actual sum is all zeroes and need not be stored)
        #

        r_cy = 0 # this can be up to two bits, since we're adding extended words!!
        for i in range(ab_num_words + 1):
            s = ab.words[i] + m.words[i] + r_cy
            r_cy_new = s >> _WORD_WIDTH

            if dump and DUMP_REDUCTION:
                # show the word-sized part of the sum, i.e. what is left
                # after the carry has been shifted out
                print("[%2d] 0x%05x + 0x%05x + 0x%x => {0x%x, [0x%05x]}" %
                    (i, ab.words[i], m.words[i], r_cy, r_cy_new, s & _WORD_MASK))

            r_cy = r_cy_new


        #
        # 4b. Initialize empty result
        #

        R = [0] * ab_num_words

        #
        # 4c. compute the actual upper part of sum (take carry into account)
        #

        for i in range(ab_num_words):

            if dump and DUMP_REDUCTION:
                print("[%2d]" % i, end='')

            # AB only has 2 * ab_num_words words, so its topmost word here
            # is taken as zero
            ab_word = ab.words[ab_num_words + i + 1] if i < (ab_num_words - 1) else 0
            if dump and DUMP_REDUCTION:
                print(" 0x%05x" % ab_word, end='')

            m_word = m.words[ab_num_words + i + 1]
            if dump and DUMP_REDUCTION:
                print(" + 0x%05x" % m_word, end='')

            # the carry out of the lower part enters the lowest result word only
            R[i] = r_cy if i == 0 else 0

            if dump and DUMP_REDUCTION:
                print(" + 0x%x" % R[i], end='')

            R[i] += ab_word
            R[i] += m_word
            if dump and DUMP_REDUCTION:
                print(" = 0x%05x" % R[i])

        return ModExpNG_Operand(None, ab_num_words, R)

    def convert_nonredundant(self, a, num_words):
        """Propagate carries through the first num_words words of a, in place.

        Every processed word is truncated to _WORD_MASK and its excess bits
        are added to the next word.  Returns the carry out of the last
        processed word.
        """
        carry = 0
        for x in range(num_words):
            a.words[x] += carry
            carry = a.words[x] >> _WORD_WIDTH
            a.words[x] &= _WORD_MASK
        return carry

class ModExpNG_Core():
    """Model of the core: a worker, the CRT banks and the input/output banks.

    The banks are organized as CRT_X / CRT_Y, each holding LADDER_X and
    LADDER_Y, each of which has WIDE and NARROW storage.  Most operations
    below are applied to all four CRT/LADDER combinations at once.
    """

    def __init__(self, i):
        self.wrk = ModExpNG_Worker()
        self.bnk = ModExpNG_BanksCRT(i)
        self.inp = ModExpNG_CoreInput()
        self.out = ModExpNG_CoreOutput()

    #
    # CRT_(X|Y) means either CRT_X or CRT_Y
    # LADDER_{X,Y} means both LADDER_X and LADDER_Y
    #

    #
    # copy from CRT_(X|Y).LADDER_X.NARROW to OUTPUT
    #
    def set_output_from_narrow(self, sel_output, bank_crt, sel_narrow):
        self.out._set_value(sel_output, bank_crt.ladder_x._get_narrow(sel_narrow))

    #
    # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.NARROW
    #
    def set_narrow_from_input(self, bank_crt, sel_narrow, sel_input):
        bank_crt.ladder_x._set_narrow(sel_narrow, self.inp._get_value(sel_input))
        bank_crt.ladder_y._set_narrow(sel_narrow, self.inp._get_value(sel_input))

    #
    # copy from INPUT to CRT_(X|Y).LADDER_{X,Y}.WIDE
    #
    def set_wide_from_input(self, bank_crt, sel_wide, sel_input):
        bank_crt.ladder_x._set_wide(sel_wide, self.inp._get_value(sel_input))
        bank_crt.ladder_y._set_wide(sel_wide, self.inp._get_value(sel_input))

    #
    # copy from CRT_Y.LADDER_{X,Y}.{WIDE,NARROW} to CRT_X.LADDER_{X,Y}.{WIDE,NARROW}
    #
    def copy_crt_y2x(self, sel_wide, sel_narrow):

        self.bnk.crt_x.ladder_x._set_wide(sel_wide, self.bnk.crt_y.ladder_x._get_wide(sel_wide))
        self.bnk.crt_x.ladder_y._set_wide(sel_wide, self.bnk.crt_y.ladder_y._get_wide(sel_wide))

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow))
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow))

    #
    # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{X,Y}.LADDER_Y.{WIDE,NARROW}
    #
    def copy_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):

        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))

        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))

    #
    # copy from CRT_{X,Y}.LADDER_Y.{WIDE,NARROW} to CRT_{X,Y}.LADDER_X.{WIDE,NARROW}
    #
    def copy_ladders_y2x(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, self.bnk.crt_x.ladder_y._get_wide(sel_wide_in))
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, self.bnk.crt_y.ladder_y._get_wide(sel_wide_in))

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in))
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in))

    #
    # copy from CRT_{X,Y}.LADDER_X.{WIDE,NARROW} to CRT_{Y,X}.LADDER_Y.{WIDE,NARROW}
    #
    def cross_ladders_x2y(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out):

        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, self.bnk.crt_y.ladder_x._get_wide(sel_wide_in))
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, self.bnk.crt_x.ladder_x._get_wide(sel_wide_in))

        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in))
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in))

    #
    # modular multiply sel_wide_in by sel_narrow_in
    # stores intermediate result in WIDE.L and WIDE.H
    # needs modulus WIDE.N and speed-up coefficients NARROW.N_COEFF to be filled
    # places two copies of resulting quantity in sel_wide_out and sel_narrow_out
    # sel_*_in and sel_*_out can overlap (overwriting of input operands is ok)
    #
    # mode is a pair of flags (one for CRT_X, one for CRT_Y), d enables
    # debug output in the worker
    #
    def modular_multiply(self, sel_wide_in, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words, mode=(True, True), d=False):

        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)

        xxa       = self.bnk.crt_x.ladder_x._get_wide(sel_wide_in)
        xya       = self.bnk.crt_x.ladder_y._get_wide(sel_wide_in)

        yxa       = self.bnk.crt_y.ladder_x._get_wide(sel_wide_in)
        yya       = self.bnk.crt_y.ladder_y._get_wide(sel_wide_in)

        xxb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        xyb       = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in)

        yxb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
        yyb       = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in)

        # within each CRT half both ladders are multiplied by the same narrow
        # operand; the mode flag selects whether it comes from LADDER_X
        # (flag cleared) or from LADDER_Y (flag set)
        if not mode[0]: xb = xxb
        else:           xb = xyb

        if not mode[1]: yb = yxb
        else:           yb = yyb

        xxp = self.wrk.multipurpose_multiply(xxa, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="X")
        xyp = self.wrk.multipurpose_multiply(xya, xb, xn, xn_coeff, num_words, dump=d, dump_crt="X", dump_ladder="Y")

        yxp = self.wrk.multipurpose_multiply(yxa, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="X")
        yyp = self.wrk.multipurpose_multiply(yya, yb, yn, yn_coeff, num_words, dump=d, dump_crt="Y", dump_ladder="Y")

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xxp)
        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xyp)
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yxp)
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yyp)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxp)
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyp)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxp)
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyp)

    #
    # modular subtract values in sel_narrow_in (X-Y)
    # stores two copies of the result in sel_*_out
    #
    # operands are LADDER_X.NARROW (minuend) and LADDER_Y.NARROW (subtrahend),
    # the result is only written to LADDER_X
    #
    def modular_subtract(self, sel_narrow_in, sel_narrow_out, sel_wide_out, num_words):

        xa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        xb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_in)
        xn = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        ya = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)
        yb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_in)
        yn = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xd = self.wrk.serial_subtract_modular(xa, xb, xn, num_words)
        yd = self.wrk.serial_subtract_modular(ya, yb, yn, num_words)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xd)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yd)

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xd)
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yd)

    #
    # modular reduce sel_narrow_in
    # stores two copies of the result in sel_*_out
    #
    # the input is taken from LADDER_X.NARROW only, the result is written
    # to both ladders
    #
    def modular_reduce(self, sel_narrow_in, sel_wide_out, sel_narrow_out, num_words):

        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)

        xb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        yb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)

        xp = self.wrk.multipurpose_multiply(None, xb, xn, xn_coeff, num_words, reduce_only=True)
        yp = self.wrk.multipurpose_multiply(None, yb, yn, yn_coeff, num_words, reduce_only=True)

        self.bnk.crt_x.ladder_x._set_wide(sel_wide_out, xp)
        self.bnk.crt_x.ladder_y._set_wide(sel_wide_out, xp)
        self.bnk.crt_y.ladder_x._set_wide(sel_wide_out, yp)
        self.bnk.crt_y.ladder_y._set_wide(sel_wide_out, yp)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xp)
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xp)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yp)
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yp)

    #
    # propagate carries (convert to non-redundant representation) content in sel_narrow
    # overwrites input value
    #
    def propagate_carries(self, sel_narrow, num_words):
        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_x._get_narrow(sel_narrow), num_words)
        self.wrk.convert_nonredundant(self.bnk.crt_x.ladder_y._get_narrow(sel_narrow), num_words)
        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_x._get_narrow(sel_narrow), num_words)
        self.wrk.convert_nonredundant(self.bnk.crt_y.ladder_y._get_narrow(sel_narrow), num_words)

    #
    # copy from CRT_{X,Y}.LADDER_{X,Y}.WIDE.{H,L} to CRT_{X,Y}.LADDER_{X,Y}.NARROW
    #
    # the words of L come first, followed by the words of H
    #
    def merge_lha(self, sel_narrow, num_words):
        xx_lsb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
        xy_lsb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.L)
        yx_lsb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.L)
        yy_lsb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.L)

        xx_msb = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
        xy_msb = self.bnk.crt_x.ladder_y._get_wide(ModExpNG_WideBankEnum.H)
        yx_msb = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.H)
        yy_msb = self.bnk.crt_y.ladder_y._get_wide(ModExpNG_WideBankEnum.H)

        xx = xx_lsb.words + xx_msb.words
        xy = xy_lsb.words + xy_msb.words
        yx = yx_lsb.words + yx_msb.words
        yy = yy_lsb.words + yy_msb.words

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xx))
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, xy))
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yx))
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow, ModExpNG_Operand(None, 2*num_words, yy))

    #
    # multiply sel_wide_in by sel_narrow_in
    # stores twice larger product in WIDE.L and WIDE.H
    #
    # the narrow operand is taken from LADDER_X for both ladders
    #
    def regular_multiply(self, sel_wide_in, sel_narrow_in, num_words):

        # NOTE: the moduli and coefficients are fetched but not passed on,
        # since multiply_only mode does not use them
        xn       = self.bnk.crt_x.ladder_x._get_wide(ModExpNG_WideBankEnum.N)
        yn       = self.bnk.crt_y.ladder_x._get_wide(ModExpNG_WideBankEnum.N)

        xn_coeff = self.bnk.crt_x.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)
        yn_coeff = self.bnk.crt_y.ladder_x._get_narrow(ModExpNG_NarrowBankEnum.N_COEFF)

        xxa       = self.bnk.crt_x.ladder_x._get_wide(sel_wide_in)
        xya       = self.bnk.crt_x.ladder_y._get_wide(sel_wide_in)

        yxa       = self.bnk.crt_y.ladder_x._get_wide(sel_wide_in)
        yya       = self.bnk.crt_y.ladder_y._get_wide(sel_wide_in)

        xb       = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_in)
        yb       = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_in)

        xxp = self.wrk.multipurpose_multiply(xxa, xb, None, None, num_words, multiply_only=True)
        xyp = self.wrk.multipurpose_multiply(xya, xb, None, None, num_words, multiply_only=True)

        yxp = self.wrk.multipurpose_multiply(yxa, yb, None, None, num_words, multiply_only=True)
        yyp = self.wrk.multipurpose_multiply(yya, yb, None, None, num_words, multiply_only=True)

        xxp_lsb = xxp.lower_half()
        xxp_msb = xxp.upper_half()

        xyp_lsb = xyp.lower_half()
        xyp_msb = xyp.upper_half()

        yxp_lsb = yxp.lower_half()
        yxp_msb = yxp.upper_half()

        yyp_lsb = yyp.lower_half()
        yyp_msb = yyp.upper_half()

        self.bnk.crt_x.ladder_x._set_wide(ModExpNG_WideBankEnum.L, xxp_lsb)
        self.bnk.crt_x.ladder_y._set_wide(ModExpNG_WideBankEnum.L, xyp_lsb)
        self.bnk.crt_y.ladder_x._set_wide(ModExpNG_WideBankEnum.L, yxp_lsb)
        self.bnk.crt_y.ladder_y._set_wide(ModExpNG_WideBankEnum.L, yyp_lsb)

        self.bnk.crt_x.ladder_x._set_wide(ModExpNG_WideBankEnum.H, xxp_msb)
        self.bnk.crt_x.ladder_y._set_wide(ModExpNG_WideBankEnum.H, xyp_msb)
        self.bnk.crt_y.ladder_x._set_wide(ModExpNG_WideBankEnum.H, yxp_msb)
        self.bnk.crt_y.ladder_y._set_wide(ModExpNG_WideBankEnum.H, yyp_msb)

    #
    # adds sel_narrow_a_in to sel_narrow_b_in
    # stores result in sel_narrow_out
    #
    # the a-operand is num_words long, the b-operand and the result are
    # 2*num_words long
    #
    def regular_add(self, sel_narrow_a_in, sel_narrow_b_in, sel_narrow_out, num_words):
        xxa = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_a_in)
        xya = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_a_in)
        yxa = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_a_in)
        yya = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_a_in)

        xxb = self.bnk.crt_x.ladder_x._get_narrow(sel_narrow_b_in)
        xyb = self.bnk.crt_x.ladder_y._get_narrow(sel_narrow_b_in)
        yxb = self.bnk.crt_y.ladder_x._get_narrow(sel_narrow_b_in)
        yyb = self.bnk.crt_y.ladder_y._get_narrow(sel_narrow_b_in)

        xxc = self.wrk.serial_add_uneven(xxa, xxb, num_words)
        xyc = self.wrk.serial_add_uneven(xya, xyb, num_words)
        yxc = self.wrk.serial_add_uneven(yxa, yxb, num_words)
        yyc = self.wrk.serial_add_uneven(yya, yyb, num_words)

        self.bnk.crt_x.ladder_x._set_narrow(sel_narrow_out, xxc)
        self.bnk.crt_x.ladder_y._set_narrow(sel_narrow_out, xyc)
        self.bnk.crt_y.ladder_x._set_narrow(sel_narrow_out, yxc)
        self.bnk.crt_y.ladder_y._set_narrow(sel_narrow_out, yyc)

    #
    # dump working variables before ladder step
    #
    def dump_before_step_using_crt(self, pq, m):
        print("num_words = %d" % pq)
        print("\rladder_mode_x = %d" % m[0])
        print("\rladder_mode_y = %d" % m[1])
        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")
        # the Y_* modulus and coefficient live in CRT_Y (the same place
        # modular_multiply() reads them from), not in CRT_X
        self.bnk.crt_x.ladder_x._get_wide(W.N).format_verilog_concat("X_N")
        self.bnk.crt_y.ladder_x._get_wide(W.N).format_verilog_concat("Y_N")
        self.bnk.crt_x.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("X_N_COEFF")
        self.bnk.crt_y.ladder_x._get_narrow(N.N_COEFF).format_verilog_concat("Y_N_COEFF")

    #
    # dump working variables after ladder step
    #
    def dump_after_step_using_crt(self):
        self.bnk.crt_x.ladder_x._get_narrow(N.C).format_verilog_concat("X_X")
        self.bnk.crt_x.ladder_y._get_narrow(N.C).format_verilog_concat("X_Y")
        self.bnk.crt_y.ladder_x._get_narrow(N.C).format_verilog_concat("Y_X")
        self.bnk.crt_y.ladder_y._get_narrow(N.C).format_verilog_concat("Y_Y")

    #
    # this deliberately converts narrow operand into redundant representation
    #
    # only LADDER_X of the given CRT bank is touched
    #
    def _force_overflow(self, bank_crt, sel_narrow):

        # original words
        T = bank_crt.ladder_x._get_narrow(sel_narrow).words

        # loop through upper N-1 words
        for i in range(1, len(T)):

            # get msbs of the previous word
            upper_bits = T[i-1] & _CARRY_MASK

            # if the previous msbs are empty, force lsbs of the current word
            # into them and then wipe the current lsbs
            if upper_bits == 0:
                lower_bits = T[i] & (_CARRY_MASK >> _WORD_WIDTH)
                T[i] ^= lower_bits
                T[i-1] |= (lower_bits << _WORD_WIDTH)

        # overwrite original words
        bank_crt.ladder_x._set_narrow(sel_narrow, ModExpNG_Operand(None, len(T), T))

        print("Forced overflow.")

#
# read content of core's output bank and compare it against known good values
#
def compare_signature():
    """Check the core's output bank against the known good values.

    Reads the module-level `core`, `s_known`, `xm_known` and `ym_known`,
    fetches outputs O.S, O.XM and O.YM from the core and prints one line
    per quantity: either "... is OK" or an "ERROR: ..." message.  Nothing
    is returned.
    """

    dut = core
    expected_s, expected_xm, expected_ym = s_known, xm_known, ym_known

    # read all three outputs before reporting anything
    actual_s  = dut.out.get_value(O.S)
    actual_xm = dut.out.get_value(O.XM)
    actual_ym = dut.out.get_value(O.YM)

    # (core value, reference value, message on match, message on mismatch)
    report = (
        (actual_s,  expected_s,  "s is OK",         "ERROR: core_s != s!"),
        (actual_xm, expected_xm, "x_mutated is OK", "ERROR: core_xm != xm!"),
        (actual_ym, expected_ym, "y_mutated is OK", "ERROR: core_ym != ym!"),
    )

    for actual, expected, msg_ok, msg_error in report:
        if actual.number() != expected:
            print(msg_error)
        else:
            print(msg_ok)

#
# get current ladder mode based on two exponents' bits
#
def get_ladder_mode_using_crt(v, bit):
    """Return the ladder mode for exponent bit `bit` in the CRT scheme.

    v   -- object providing the exponents as v.dp and v.dq; each must
           offer a number() method returning an integer
    bit -- index of the bit to inspect (0 is the lsb)

    Returns a tuple of two booleans (p_bit_set, q_bit_set) telling whether
    the bit is set in v.dp and in v.dq respectively.
    """

    # shift the wanted bit down to position 0, then isolate it
    p_bit_set = ((v.dp.number() >> bit) & 1) == 1
    q_bit_set = ((v.dq.number() >> bit) & 1) == 1

    return (p_bit_set, q_bit_set)

#
# get current ladder mode based on private exponent's bit
#
def get_ladder_mode_without_crt(v, bit):
    """Return the ladder mode for private exponent bit `bit` (no CRT).

    v   -- object providing the private exponent as v.d, which must offer
           a number() method returning an integer
    bit -- index of the bit to inspect (0 is the lsb)

    Returns a tuple of two booleans (bit_clear, bit_set): the second
    element tells whether the bit is set in v.d, the first is its
    negation.
    """

    # shift the wanted bit down to position 0, then isolate it
    d_bit_set = ((v.d.number() >> bit) & 1) == 1

    return (not d_bit_set, d_bit_set)

#
# print current exponentiation progress
#
def print_ladder_progress(current, total):
    """Print the exponentiation progress as a percentage on one line.

    current -- index of the exponent bit being processed; bits are counted
               from msb down to lsb, so 0 is the very last iteration
    total   -- operand size in words (the ladder covers _WORD_WIDTH * total
               bits)

    A percentage is printed only when `current` is a multiple of
    DUMP_PROGRESS_FACTOR.  Index zero is such a multiple, so the final
    iteration always reports "100.0%" and then terminates the line.
    """

    if current % DUMP_PROGRESS_FACTOR == 0:
        total_bits = _WORD_WIDTH * total
        percent = (total_bits - current) / total_bits * 100.0

        # carriage return without newline: keep overwriting the same line
        print("\rdone: %5.1f%%" % percent, end='')

    # move to next line after the very last iteration
    if current == 0:
        print("")

#
# try to exponentiate using the quad-multiplier (dual-core, dual-ladder) scheme
#
def sign_using_crt():
    """Sign with the quad-multiplier (dual-core, dual-ladder) CRT scheme.

    Works on the module-level `core` and `vector`, using `n_num_words` and
    `pq_num_words` as operand sizes.  The mutated blinding values are
    stored into outputs O.XM and O.YM and the signature into O.S; nothing
    is returned.  Ladder progress is printed while the loop runs.

    Every operand taken from the input bank is loaded into both CRT cores,
    crt_x first and crt_y second; the bank map in the right-hand comments
    shows the half-filled state ("..") after the first load of each pair
    and the filled state after the second.
    """

    c  = core
    v  = vector
    n  = n_num_words
    pq = pq_num_words

    ff = (False, False)
                                                                   #
                                                                   # A / B => different content in banks (A in WIDE, B in NARROW)
                                                                   # [XY]Z => different content in ladders (XZ in X, YZ in Y)
                                                                   # ..    => temporarily half-filled bank (omitted to save space)
                                                                   # *     => "crossed" content (X.Y == Y.X and Y.Y == X.X)
                                                                   #
                                                                   # +------------------------+-------+------------------+---------+-----------+
                                                                   # |  A                     |  B    |  C               |  D      |  E        |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input   (c.bnk.crt_x, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_y, W.N,       I.N)          # |  ?                     |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_x, W.A,       I.X)          # |  ..                    |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_y, W.A,       I.Y)          # | [XY] / ?               |  ?    |  ?               |  ?      | ?         |
    c.set_wide_from_input   (c.bnk.crt_x, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | .. / ?    |
    c.set_wide_from_input   (c.bnk.crt_y, W.E,       I.M)          # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input (c.bnk.crt_x, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_y, N.N_COEFF, I.N_COEFF)    # | [XY] / ?               |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_x, N.A,       I.N_FACTOR)   # | [XY] / ..              |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_y, N.A,       I.N_FACTOR)   # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ?    |
    c.set_narrow_from_input (c.bnk.crt_x, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M  / ..   |
    c.set_narrow_from_input (c.bnk.crt_y, N.E,       I.M)          # | [XY] / N_FACTOR        |  ?    |  ?               |  ?      | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.A, N.A, W.B, N.B, n)                      # | [XY] / N_FACTOR        | [XY]F |  ?               |  ?      | M         | [XY]F = [XY] * N_FACTOR
    c.modular_multiply(W.B, N.B, W.C, N.C, n, mode=ff)             # | [XY] / N_FACTOR        | [XY]F | [XY]MF           |  ?      | M         | [XY]MF = [XY]F * [XY]F
    c.modular_multiply(W.C, N.I, W.D, N.D, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         | [XY]M = [XY]MF * 1
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.D, n)                                    # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_output_from_narrow(O.XM, c.bnk.crt_x, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
    c.set_output_from_narrow(O.YM, c.bnk.crt_y, N.D)               # | [XY] / N_FACTOR        | [XY]F | [XY]MF           | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.E, N.B, W.C, N.C, n)                      # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         | [XY]MB = M * [XY]F
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.C, n)                                    # | [XY] / N_FACTOR        | [XY]F | [XY]MB           | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.copy_crt_y2x(W.C, N.C)                                       # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input  (c.bnk.crt_x, W.N,       I.P)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_y, W.N,       I.Q)           # | [XY] / N_FACTOR        | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_x, W.A,       I.P_FACTOR)    # | ...         / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_y, W.A,       I.Q_FACTOR)    # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | M         |
    c.set_wide_from_input  (c.bnk.crt_x, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | ..        |
    c.set_wide_from_input  (c.bnk.crt_y, W.E,       I.QINV)        # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.P_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.Q_COEFF)     # | [PQ]_FACTOR / N_FACTOR | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_x, N.A,       I.P_FACTOR)    # | [PQ]_FACTOR / ...      | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_y, N.A,       I.Q_FACTOR)    # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / M  |
    c.set_narrow_from_input(c.bnk.crt_x, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV / .. |
    c.set_narrow_from_input(c.bnk.crt_y, N.E,       I.QINV)        # | [PQ]_FACTOR            | [XY]F |  YMB             | [XY]M   | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_reduce(N.C, W.D, N.D, pq)                            # | [PQ]_FACTOR            | [XY]F |  YMB             | [PQ]MBZ | QINV      | [PQ]MBZ = YMB mod [PQ]
    c.modular_multiply(W.D, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBZ | QINV      | [PQ]MB = [PQ]MBZ * [PQ]_FACTOR
    c.modular_multiply(W.C, N.A, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]MB           | [PQ]MBF | QINV      | [PQ]MBF = [PQ]MB * [PQ]_FACTOR
    c.modular_multiply(W.A, N.I, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]IF           | [PQ]MBF | QINV      | [PQ]IF = 1 * [PQ]_FACTOR
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.copy_ladders_x2y(W.D, N.D, W.C, N.C)                         # | [PQ]_FACTOR            | [XY]F | [PQ]IF / [PQ]MBF | [PQ]MBF | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    ###########################                                    # |                        |       |                  |         |           |
    # Begin Montgomery Ladder #                                    # |                        |       |                  |         |           |
    ###########################                                    # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
    for bit in range(_WORD_WIDTH * pq - 1, -1, -1):                # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
        m  = get_ladder_mode_using_crt(v, bit)                     # |                        |       |                  |         |           |
        dbg = bit == DUMP_LADDER_INDEX                             # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
        if dbg:                                                    # |                        |       |                  |         |           |
            if FORCE_OVERFLOW: c._force_overflow(c.bnk.crt_x, N.C) # |                        |       |                  |         |           |
            if DUMP_VECTORS: c.dump_before_step_using_crt(pq, m)   # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
        c.modular_multiply(W.C, N.C, W.C, N.C, pq, mode=m, d=dbg)  # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]MBF | QINV      | <LADDER>
                                                                   # +------------------------+-------+------------------+---------+-----------+
        if dbg and DUMP_VECTORS: c.dump_after_step_using_crt()     # |                        |       |                  |         |           |
        print_ladder_progress(bit, pq)                             # |                        |       |                  |         |           |
                                                                   # |                        |       |                  |         |           |
    #########################                                      # |                        |       |                  |         |           |
    # End Montgomery Ladder #                                      # |                        |       |                  |         |           |
    #########################                                      # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.C, N.I, W.D, N.D, pq)                     # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      | [PQ]SB = [PQ]SBF * 1
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.D, pq)                                   # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB  | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.cross_ladders_x2y(W.D, N.D, W.D, N.D)                        # | [PQ]_FACTOR            | [XY]F | [PQ]SBF          | [PQ]SB* | QINV      |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_subtract(N.D, N.C, W.C, pq)                          # | [PQ]_FACTOR            | [XY]F |  RSB             | [PQ]SB* | QINV      | RSB = PSB - QSB
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.C, N.E, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBIZ           | [PQ]SB* | QINV      | RSBIZ = RSB * QINV
    c.modular_multiply(W.C, N.A, W.C, N.C, pq)                     # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | QINV      | RSBI = RSBIZ * P_FACTOR
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input  (c.bnk.crt_x, W.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | ..        |
    c.set_wide_from_input  (c.bnk.crt_y, W.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / QINV  |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input(c.bnk.crt_x, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q / ..    |
    c.set_narrow_from_input(c.bnk.crt_y, N.E, I.Q)                 # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.regular_multiply(W.E, N.C, pq)                               # | [PQ]_FACTOR            | [XY]F |  RSBI            | [PQ]SB* | Q         | = RSBI * Q
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.merge_lha(N.A, pq)                                           # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.A, n)                                    # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            | [PQ]SB* | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.copy_crt_y2x(W.D, N.D)                                       # | [PQ]_FACTOR / QRSBI    | [XY]F |  RSBI            |  QSB*   | Q         |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.regular_add(N.D, N.A, N.C, pq)                               # | [PQ]_FACTOR / QRSBI    | [XY]F |  SB              |  QSB*   | Q         | SB = QSB + RSBI
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_wide_from_input  (c.bnk.crt_x, W.N, I.N)                 # |                        |       |                  |         |           |
    c.set_wide_from_input  (c.bnk.crt_y, W.N, I.N)                 # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_narrow_from_input(c.bnk.crt_x, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
    c.set_narrow_from_input(c.bnk.crt_y, N.N_COEFF, I.N_COEFF)     # |                        |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.modular_multiply(W.B, N.C, W.A, N.A, n, ff)                  # |  S                     |       |                  |         |           | S = XF * SB
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.propagate_carries(N.A, n)                                    # |  S                     |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+
    c.set_output_from_narrow(O.S, c.bnk.crt_x, N.A)                # |  S                     |       |                  |         |           |
                                                                   # +------------------------+-------+------------------+---------+-----------+

#
# try to exponentiate using only half of the quad-multiplier (one dual-ladder core)
#
def sign_without_crt():
    """Sign using only half of the quad-multiplier (one dual-ladder core).

    Works on the module-level `core` and `vector`, with `n_num_words` as
    the operand size for every operation.  The mutated blinding values are
    stored into outputs O.XM and O.YM and the signature into O.S; nothing
    is returned.  Ladder progress is printed while the loop runs.
    """

    dut       = core
    vec       = vector
    num_words = n_num_words

    ff_mode = (False, False)

    # wide banks: (bank selector, input for crt_x, input for crt_y)
    wide_loads = (
        (W.N, I.N, I.N),
        (W.A, I.X, I.Y),
        (W.E, I.M, I.M),
    )
    for wide_sel, input_x, input_y in wide_loads:
        dut.set_wide_from_input(dut.bnk.crt_x, wide_sel, input_x)
        dut.set_wide_from_input(dut.bnk.crt_y, wide_sel, input_y)

    # narrow banks: both cores receive the same input
    narrow_loads = (
        (N.N_COEFF, I.N_COEFF),
        (N.A,       I.N_FACTOR),
        (N.E,       I.M),
    )
    for narrow_sel, input_sel in narrow_loads:
        dut.set_narrow_from_input(dut.bnk.crt_x, narrow_sel, input_sel)
        dut.set_narrow_from_input(dut.bnk.crt_y, narrow_sel, input_sel)

    # [XY]F  = [XY] * N_FACTOR
    dut.modular_multiply(W.A, N.A, W.B, N.B, num_words)
    # [XY]MF = [XY]F * [XY]F
    dut.modular_multiply(W.B, N.B, W.C, N.C, num_words, mode=ff_mode)
    # [XY]M  = [XY]MF * 1
    dut.modular_multiply(W.C, N.I, W.D, N.D, num_words)

    dut.propagate_carries(N.D, num_words)

    # publish the mutated blinding values
    dut.set_output_from_narrow(O.XM, dut.bnk.crt_x, N.D)
    dut.set_output_from_narrow(O.YM, dut.bnk.crt_y, N.D)

    # [XY]MB = M * [XY]F
    dut.modular_multiply(W.E, N.B, W.C, N.C, num_words)

    # NOTE(review): the fetched operand is not used anywhere below
    xf_operand = dut.bnk.crt_x.ladder_x._get_narrow(N.B)

    for crt_bank_name in ("crt_x", "crt_y"):
        dut.set_wide_from_input(getattr(dut.bnk, crt_bank_name), W.A, I.N_FACTOR)

    # MBF = MB * N_FACTOR
    dut.modular_multiply(W.C, N.A, W.D, N.D, num_words)
    # IF = 1 * N_FACTOR
    dut.modular_multiply(W.A, N.I, W.C, N.C, num_words)

    dut.copy_ladders_x2y(W.D, N.D, W.C, N.C)

    # ----- Montgomery ladder: walk the exponent bits from msb down to lsb -----

    for bit in reversed(range(_WORD_WIDTH * num_words)):

        ladder_mode   = get_ladder_mode_without_crt(vec, bit)
        at_dump_index = (bit == DUMP_LADDER_INDEX)

        # optional debug actions right before the selected ladder step
        if at_dump_index and FORCE_OVERFLOW:
            dut._force_overflow(dut.bnk.crt_x, N.C)
        if at_dump_index and DUMP_VECTORS:
            dut.dump_before_step_without_crt(num_words, ladder_mode)

        dut.modular_multiply(W.C, N.C, W.C, N.C, num_words, mode=ladder_mode, d=at_dump_index)

        # optional debug dump right after the selected ladder step
        if at_dump_index and DUMP_VECTORS:
            dut.dump_after_step_without_crt()

        print_ladder_progress(bit, num_words)

    # ----- end of Montgomery ladder -----

    dut.cross_ladders_x2y(W.B, N.B, W.B, N.B)

    # SB = SBF * 1
    dut.modular_multiply(W.C, N.I, W.D, N.D, num_words)
    # S = XF * SB
    dut.modular_multiply(W.B, N.D, W.A, N.A, num_words, mode=ff_mode)

    dut.copy_ladders_y2x(W.A, N.A, W.B, N.B)

    dut.propagate_carries(N.B, num_words)

    # publish the signature
    dut.set_output_from_narrow(O.S, dut.bnk.crt_y, N.B)


#
# main()
#
if __name__ == "__main__":

    # Script entry point: runs both signing schemes on the test vector and
    # checks each result. The names bound below are module-level globals
    # that the signing and comparison functions above read directly.

    # handy shortcuts
    W = ModExpNG_WideBankEnum
    N = ModExpNG_NarrowBankEnum
    I = ModExpNG_CoreInputEnum
    O = ModExpNG_CoreOutputEnum

    # set helper quantity (operand built from the value 1, KEY_LENGTH wide)
    i = ModExpNG_Operand(1, KEY_LENGTH)

    # instantiate core (it receives the helper quantity) and load test vector
    core   = ModExpNG_Core(i)
    vector = ModExpNG_TestVector()

    # transfer numbers from vector to core: message
    core.inp.set_value(I.M,        vector.m)

    # moduli
    core.inp.set_value(I.N,        vector.n)
    core.inp.set_value(I.P,        vector.p)
    core.inp.set_value(I.Q,        vector.q)

    # per-modulus coefficients
    core.inp.set_value(I.N_COEFF,  vector.n_coeff)
    core.inp.set_value(I.P_COEFF,  vector.p_coeff)
    core.inp.set_value(I.Q_COEFF,  vector.q_coeff)

    # per-modulus factors
    core.inp.set_value(I.N_FACTOR, vector.n_factor)
    core.inp.set_value(I.P_FACTOR, vector.p_factor)
    core.inp.set_value(I.Q_FACTOR, vector.q_factor)

    # blinding quantities
    core.inp.set_value(I.X,        vector.x)
    core.inp.set_value(I.Y,        vector.y)

    # QINV input
    core.inp.set_value(I.QINV,     vector.qinv)

    # set numbers of words: full-size operands and half-size (P/Q) operands
    n_num_words  = KEY_LENGTH  // _WORD_WIDTH
    pq_num_words = n_num_words // 2

    # obtain known good reference value with built-in math: m ** d mod n
    s_known = pow(vector.m.number(), vector.d.number(), vector.n.number())

    # mutate blinding quantities with built-in math: x ** 2 mod n, y ** 2 mod n
    xm_known = pow(vector.x.number(), 2, vector.n.number())
    ym_known = pow(vector.y.number(), 2, vector.n.number())

    # sign using CRT and check
    print("Signing using CRT...")
    sign_using_crt()
    compare_signature()

    # sign without CRT and check
    print("Signing without CRT...")
    sign_without_crt()
    compare_signature()


#
# End-of-File
#