1 files changed, 20 insertions, 13 deletions
diff --git a/fpga_modular.cpp b/fpga_modular.cpp
index af485a0..9b01df0 100644
--- a/fpga_modular.cpp
+++ b/fpga_modular.cpp
@@ -605,7 +605,8 @@ void fpga_modular_mul_helper_reduce_p256(FPGA_WORD *c, FPGA_BUFFER *p)
 // on the other hand is the output of the parallelized Comba multiplier, so it
 // is 2*OPERAND_WIDTH wide and has twice as many words (2*OPERAND_NUM_WORDS).
 //
-// ...
+// To save FPGA resources, the calculation is done using only two adders and
+// one subtractor. The algorithm is split into five steps.
 //
 //------------------------------------------------------------------------------
 #if USE_CURVE == 2
@@ -626,27 +627,33 @@ void fpga_modular_mul_helper_reduce_p384(FPGA_WORD *c, FPGA_BUFFER *p)
 	 s9.words[11] = 0,       s9.words[10] = 0,       s9.words[ 9] = 0,       s9.words[ 8] = 0,       s9.words[ 7] = 0,       s9.words[ 6] = 0,       s9.words[ 5] = 0,       s9.words[ 4] = c[23],   s9.words[ 3] = c[22],   s9.words[ 2] = c[21],   s9.words[ 1] = c[20],   s9.words[ 0] = 0;
 	s10.words[11] = 0,      s10.words[10] = 0,      s10.words[ 9] = 0,      s10.words[ 8] = 0,      s10.words[ 7] = 0,      s10.words[ 6] = 0,      s10.words[ 5] = 0,      s10.words[ 4] = c[23],  s10.words[ 3] = c[23],  s10.words[ 2] = 0,      s10.words[ 1] = 0,      s10.words[ 0] = 0;
 
-
 		// intermediate results
-	FPGA_BUFFER t1, t2, t3, t4;
+	FPGA_BUFFER sum0, sum1, difference;
 
 		/* Step 1. */
-	fpga_modular_add(&s1,  &s3, &t1);	// t1 = s1 + s3
-	fpga_modular_add(&s2,  &s2, &t2);	// t2 = 2*s2
-	fpga_modular_add(&s4,  &s5, &t3);	// t3 = s4 + s5
-	fpga_modular_add(&s6,  &s7, &t4);	// t4 = s6 + s7
+	fpga_modular_add(&s1,         &s3,         &sum0);			// sum0 = s1 + s3
+	fpga_modular_add(&s2,         &s2,         &sum1);			// sum1 = 2*s2
+	fpga_modular_sub(&ecdsa_zero, &s8,         &difference);	// difference = -s8
 
 		/* Step 2. */
-	fpga_modular_add(&t1,  &t2, &t1);	// t1 = t1 + t2 = s1 + 2*s2 + 2*s3
-	fpga_modular_add(&t3,  &t4, &t2);	// t2 = t3 + t4 = s4 + s5 + s6 + s7
-	fpga_modular_add(&s8,  &s9, &t3);	// t3 = s8 + s9
+	fpga_modular_add(&sum0,       &s4,         &sum0);			// sum0 = s1 + s3 + s4
+	fpga_modular_add(&sum1,       &s5,         &sum1);			// sum1 = 2*s2 + s5
+	fpga_modular_sub(&difference, &s9,         &difference);	// difference = -(s8 + s9)
 
 		/* Step 3. */
-	fpga_modular_add(&t1,  &t2, &t1);	// t1 = t1 + t2 = s1 + 2*s2 + 2*s3 + s4 + s5 + s6 + s7
-	fpga_modular_add(&s10, &t3, &t2);	// t2 = s10 + t3 = s8 + s9 + s10
+	fpga_modular_add(&sum0,       &s6,         &sum0);			// sum0 = s1 + s3 + s4 + s6
+	fpga_modular_add(&sum1,       &s7,         &sum1);			// sum1 = 2*s2 + s5 + s7
+	fpga_modular_sub(&difference, &s10,        &difference);	// difference = -(s8 + s9 + s10)
 
 		/* Step 4. */
-	fpga_modular_sub(&t1,  &t2, p);		// p = t1 - t2 = s1 + 2*s2 + 2*s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
+	fpga_modular_add(&sum0,       &sum1,       &sum0);			// sum0 = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7
+//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
+	fpga_modular_sub(&difference, &ecdsa_zero, &difference);	// compulsory cycle to keep difference constant for next stage
+
+		/* Step 5. */
+	fpga_modular_add(&sum0,       &difference, p);				// p = s1 + 2*s2 + s3 + s4 + s5 + s6 + s7 - s8 - s9 - s10
+//	fpga_modular_add(<dummy>,     <dummy>,     &sum1);			// dummy cycle, result ignored
+//	fpga_modular_add(<dummy>,     <dummy>,     &difference);	// dummy cycle, result ignored
 }
 #endif