diff options
Diffstat (limited to 'src/rtl')
-rw-r--r-- | src/rtl/modexpa7_top.v | 120 | ||||
-rw-r--r-- | src/rtl/modexpa7_wrapper.v | 30 |
2 files changed, 100 insertions, 50 deletions
diff --git a/src/rtl/modexpa7_top.v b/src/rtl/modexpa7_top.v index ad101dd..ea3d2c2 100644 --- a/src/rtl/modexpa7_top.v +++ b/src/rtl/modexpa7_top.v @@ -54,7 +54,7 @@ module modexpa7_top # input bus_cs, input bus_we, - input [OPERAND_ADDR_WIDTH+1:0] bus_addr, + input [OPERAND_ADDR_WIDTH+2:0] bus_addr, input [ 32-1:0] bus_data_wr, output [ 32-1:0] bus_data_rd ); @@ -109,24 +109,38 @@ module modexpa7_top # reg valid_reg = 1'b0; assign ready = ready_reg; - assign valid = valid_reg; + assign valid = valid_reg;
+
+ reg init_trig_latch;
+ reg next_trig_latch;
+
+ always @(posedge clk)
+ // latch the trigger pair, but only while the FSM is idle (the latches hold their value otherwise)
+ if (fsm_state == FSM_STATE_IDLE)
+ // at most one latch is set: when both triggers are asserted only 'init' is remembered
+ case ({next_trig, init_trig})
+ 2'b00: {next_trig_latch, init_trig_latch} <= 2'b00; // no trigger: clear both latches
+ 2'b01: {next_trig_latch, init_trig_latch} <= 2'b01; // precalculate
+ 2'b10: {next_trig_latch, init_trig_latch} <= 2'b10; // exponentiate
+ 2'b11: {next_trig_latch, init_trig_latch} <= 2'b01; // 'init' has priority over 'next'
+ endcase // ready flag logic always @(posedge clk or negedge rst_n) // - if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state + if (rst_n == 1'b0) ready_reg <= 1'b0; // reset flag to default state else case (fsm_state) - FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when operation is started - FSM_STATE_STOP: if (!ready_reg) ready_reg <= 1'b1; // set flag after operation is finished + FSM_STATE_IDLE: if (init_trig) ready_reg <= 1'b0; // clear flag when operation is started + FSM_STATE_STOP: if (init_trig_latch) ready_reg <= 1'b1; // set flag after operation is finished endcase // valid flag logic always @(posedge clk or negedge rst_n) // - if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state + if (rst_n == 1'b0) valid_reg <= 1'b0; // reset flag to default state else case (fsm_state) - FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when operation is started - FSM_STATE_STOP: if (!valid_reg) valid_reg <= 1'b1; // set flag after operation is finished + FSM_STATE_IDLE: if (next_trig) valid_reg <= 1'b0; // clear flag when operation is started + FSM_STATE_STOP: if (next_trig_latch) valid_reg <= 1'b1; // set flag after operation is finished endcase @@ -137,14 +151,20 @@ module modexpa7_top # reg [OPERAND_ADDR_WIDTH+4:0] exponent_num_bits_latch; // save number of words in modulus when pre-calculation has been triggered, - // i.e. user has apparently loaded a new modulus into the core + // i.e. user has apparently loaded a new modulus into the core
+ //
+ // we also need to update modulus length when the user wants to exponentiate,
+ // because they could have done precomputation for some modulus, then used
+ // a modulus of a different length and then reverted back to the original modulus
+ // without doing precomputation (this bug took a whole day to track down). always @(posedge clk) // - if (fsm_next_state == FSM_STATE_PRECALC_START) + if ((fsm_next_state == FSM_STATE_PRECALC_START) ||
+ (fsm_next_state == FSM_STATE_EXPONENT_START)) modulus_num_words_latch <= modulus_num_words; // save number of bits in exponent when exponentiation has been triggered, - // i.e. user has loaded a new message into the core and wants exponentiate + // i.e. user has loaded a new message into the core and wants to exponentiate always @(posedge clk) // if (fsm_next_state == FSM_STATE_EXPONENT_START) @@ -154,17 +174,21 @@ module modexpa7_top # /* * Split bus address into bank/word parts. */ - wire [ 2 - 1 : 0] bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+1:OPERAND_ADDR_WIDTH]; + wire [ 3 - 1 : 0] bus_addr_bank = bus_addr[OPERAND_ADDR_WIDTH+2:OPERAND_ADDR_WIDTH]; wire [OPERAND_ADDR_WIDTH - 1 : 0] bus_addr_word = bus_addr[OPERAND_ADDR_WIDTH-1:0]; /* * Define bank offsets. */ - localparam [ 1: 0] BANK_MODULUS = 2'b00; // 0 - localparam [ 1: 0] BANK_MESSAGE = 2'b01; // 1 - localparam [ 1: 0] BANK_EXPONENT = 2'b10; // 2 - localparam [ 1: 0] BANK_RESULT = 2'b11; // 3 + localparam [ 2: 0] BANK_MODULUS = 3'b000; // 0 + localparam [ 2: 0] BANK_MESSAGE = 3'b001; // 1 + localparam [ 2: 0] BANK_EXPONENT = 3'b010; // 2 + localparam [ 2: 0] BANK_RESULT = 3'b011; // 3
+ localparam [ 2: 0] BANK_MODULUS_COEFF_OUT = 3'b100; // 4
+ localparam [ 2: 0] BANK_MODULUS_COEFF_IN = 3'b101; // 5
+ localparam [ 2: 0] BANK_MONTGOMERY_FACTOR_OUT = 3'b110; // 6 + localparam [ 2: 0] BANK_MONTGOMERY_FACTOR_IN = 3'b111; // 7
/* @@ -176,7 +200,7 @@ module modexpa7_top # * * Note, that the core does squaring and multiplication simultaneously, so * there are two identical systolic multipliers inside. It's better to have two - * copies of modulus to give router some freeding in placing the multipliers, + * copies of modulus to give router some freedom in placing the multipliers, * that's why there are actually two identical block memories N1 and N2 instead of N. * User reads from the first one, but writes to both of them. Note that the synthesis * tool might get too clever and find out that N1 and N2 are identical and decide @@ -250,14 +274,18 @@ module modexpa7_top # /* - * Instantiate internal memories. + * Instantiate more block memories.
+ *
+ * Fast modular exponentiation requires two pre-calculated helper quantities: Montgomery
+ * factor F and modulus-dependent speed-up coefficient N_COEFF. This core has two separate
+ * buffers for each of those quantities. During pre-computation, F and N_COEFF are written to
+ * the "output" buffers, so that the user can retrieve them and store them along with the key for
+ * future use. During exponentiation, F and N_COEFF are read from the "input" buffers and
+ * must be supplied by user along with the modulus. * - * We have two block memories: F for Montgomery factor and N_COEFF for modulus-dependent - * coefficient, they are written to during pre-calculation and read from during exponentiation. - * - * Note, that there are actually two identical block memories N_COEFF1 and N_COEFF2 instead of - * just one N_COEFF, read the explanation above. F is only used by one of the multipliers, so - * we don't need F1 and F2. + * Note, that there are actually two identical input block memories N_COEFF1 and N_COEFF2
+ * instead of just one N_COEFF, read the explanation above. F is only used by one of
+ * the multipliers, so we don't need F1 and F2. */ wire [OPERAND_ADDR_WIDTH-1:0] core_f_addr_wr; @@ -274,20 +302,38 @@ module modexpa7_top # wire core_f_wren; wire core_n_coeff_wren; +
+ wire [ 32-1:0] user_f_out_data; + wire [ 32-1:0] user_f_in_data; + wire [ 32-1:0] user_n_coeff_out_data; + wire [ 32-1:0] user_n_coeff_in_data; +
+ wire user_f_in_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MONTGOMERY_FACTOR_IN); + wire user_n_coeff_in_wren = bus_cs && bus_we && (bus_addr_bank == BANK_MODULUS_COEFF_IN);
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_f (.clk(clk), + bram_f_out (.clk(clk), .a_addr(core_f_addr_wr), .a_out(), .a_wr(core_f_wren), .a_in(core_f_data_wr), + .b_addr(bus_addr_word), .b_out(user_f_out_data)); +
+ bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_f_in (.clk(clk), + .a_addr(bus_addr_word), .a_out(user_f_in_data), .a_wr(user_f_in_wren), .a_in(bus_data_wr), .b_addr(core_f_addr_rd), .b_out(core_f_data_rd)); - +
bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_n_coeff1 (.clk(clk), + bram_n_coeff_out (.clk(clk), .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr), + .b_addr(bus_addr_word), .b_out(user_n_coeff_out_data));
+ + bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) + bram_n_coeff_in1 (.clk(clk), + .a_addr(bus_addr_word), .a_out(user_n_coeff_in_data), .a_wr(user_n_coeff_in_wren), .a_in(bus_data_wr), .b_addr(core_n_coeff1_addr_rd), .b_out(core_n_coeff1_data_rd)); bram_1rw_1ro_readfirst #(.MEM_WIDTH(32), .MEM_ADDR_BITS(OPERAND_ADDR_WIDTH)) - bram_n_coeff2 (.clk(clk), - .a_addr(core_n_coeff_addr_wr), .a_out(), .a_wr(core_n_coeff_wren), .a_in(core_n_coeff_data_wr), + bram_n_coeff_in2 (.clk(clk), + .a_addr(bus_addr_word), .a_out(), .a_wr(user_n_coeff_in_wren), .a_in(bus_data_wr), .b_addr(core_n_coeff2_addr_rd), .b_out(core_n_coeff2_data_rd)); @@ -461,7 +507,7 @@ module modexpa7_top # */ // delay bus_addr_bank by 1 clock cycle to remember from where we've just been reading - reg [1: 0] bus_addr_bank_dly; + reg [2: 0] bus_addr_bank_dly; always @(posedge clk) if (bus_cs) bus_addr_bank_dly <= bus_addr_bank; @@ -474,12 +520,16 @@ module modexpa7_top # // case (bus_addr_bank_dly) // - BANK_MODULUS: bus_data_rd_mux = user_n_data; - BANK_MESSAGE: bus_data_rd_mux = user_m_data; - BANK_EXPONENT: bus_data_rd_mux = user_d_data; - BANK_RESULT: bus_data_rd_mux = user_r_data; + BANK_MODULUS: bus_data_rd_mux = user_n_data; + BANK_MESSAGE: bus_data_rd_mux = user_m_data; + BANK_EXPONENT: bus_data_rd_mux = user_d_data; + BANK_RESULT: bus_data_rd_mux = user_r_data;
+ //
+ BANK_MODULUS_COEFF_OUT: bus_data_rd_mux = user_n_coeff_out_data;
+ BANK_MODULUS_COEFF_IN: bus_data_rd_mux = user_n_coeff_in_data;
+ BANK_MONTGOMERY_FACTOR_OUT: bus_data_rd_mux = user_f_out_data; + BANK_MONTGOMERY_FACTOR_IN: bus_data_rd_mux = user_f_in_data; // endcase - endmodule diff --git a/src/rtl/modexpa7_wrapper.v b/src/rtl/modexpa7_wrapper.v index a4e2319..8ebc22a 100644 --- a/src/rtl/modexpa7_wrapper.v +++ b/src/rtl/modexpa7_wrapper.v @@ -42,7 +42,7 @@ module modexpa7_wrapper # input cs, input we, - input [OPERAND_ADDR_WIDTH+2:0] address, + input [OPERAND_ADDR_WIDTH+3:0] address, input [ 32-1:0] write_data, output [ 32-1:0] read_data ); @@ -54,8 +54,8 @@ module modexpa7_wrapper # localparam ADDR_MSB_REGS = 1'b0; localparam ADDR_MSB_CORE = 1'b1; - wire address_msb = address[OPERAND_ADDR_WIDTH+2]; - wire [OPERAND_ADDR_WIDTH+1:0] address_lsb = address[OPERAND_ADDR_WIDTH+1:0]; + wire address_msb = address[OPERAND_ADDR_WIDTH+3]; + wire [OPERAND_ADDR_WIDTH+2:0] address_lsb = address[OPERAND_ADDR_WIDTH+2:0]; /* @@ -68,17 +68,17 @@ module modexpa7_wrapper # /* * Registers */ - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_NAME0 = 'h00; // - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_NAME1 = 'h01; // - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_VERSION = 'h02; // - - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_CONTROL = 'h08; // {next, init} - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_STATUS = 'h09; // {valid, ready} - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODE = 'h10; // {crt, dummy} - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_MODULUS_BITS = 'h11; // number of bits in modulus - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_EXPONENT_BITS = 'h12; // number of bits in exponent - localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_BUFFER_BITS = 'h13; // largest supported number of bits
- localparam [OPERAND_ADDR_WIDTH+1:0] ADDR_ARRAY_BITS = 'h14; // number of bits in systolic array + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_NAME0 = 'h00; // + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_NAME1 = 'h01; // + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_VERSION = 'h02; // + + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_CONTROL = 'h08; // {next, init} + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_STATUS = 'h09; // {valid, ready} + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_MODE = 'h10; // {crt, dummy} + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_MODULUS_BITS = 'h11; // number of bits in modulus + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_EXPONENT_BITS = 'h12; // number of bits in exponent + localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_BUFFER_BITS = 'h13; // largest supported number of bits
+ localparam [OPERAND_ADDR_WIDTH+2:0] ADDR_ARRAY_BITS = 'h14; // number of bits in systolic array localparam CONTROL_INIT_BIT = 0; localparam CONTROL_NEXT_BIT = 1; @@ -91,7 +91,7 @@ module modexpa7_wrapper # localparam CORE_NAME0 = 32'h6D6F6465; // "mode" localparam CORE_NAME1 = 32'h78706137; // "xpa7" - localparam CORE_VERSION = 32'h302E3230; // "0.20" + localparam CORE_VERSION = 32'h302E3235; // "0.25" /* |