Initial import of libtommath 0.35

--HG-- branch : libtommath-orig extra : convert_revision : b14c94b945e5c61de5bfa1a6816e11ed55cb2911

Initial import of libtommath 0.35
180935d8 · Matt Johnston · fc43a488 · 180935d8 · 180935d8 · 180935d8
Commit 180935d8 authored 19 years ago by Matt Johnston
--- a/TODO
+++ b/TODO
+things for book in order of importance...
+
+- Fix up pseudo-code [only] for combas that are not consistent with source
+- Start in chapter 3 [basics] and work up...
+   - re-write to prose [less abrupt]
+   - clean up pseudo code [spacing]
+   - more examples where appropriate and figures
+
+Goal:
+   - Get sync done by mid January [roughly 8-12 hours work]
+   - Finish ch3-6 by end of January [roughly 12-16 hours of work]
+   - Finish ch7-end by mid Feb [roughly 20-24 hours of work].
+
+Goal isn't "first edition" but merely cleaner to read.
+
+
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.32}
+\title{LibTomMath User Manual \\ v0.35}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
@@ -263,12 +263,12 @@ are the pros and cons of LibTomMath by comparing it to the math routines from Gn
 \begin{center}
 \begin{tabular}{|l|c|c|l|}
 \hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\
-\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 76.04$ \\
+\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath  $ = 71.97$ \\
 \hline Commented function prototypes & X && GnuPG function names are cryptic. \\
 \hline Speed && X & LibTomMath is slower.  \\
 \hline Totally free & X & & GPL has unfavourable restrictions.\\
 \hline Large function base & X & & GnuPG is barebones. \\
-\hline Four modular reduction algorithms & X & & Faster modular exponentiation. \\
+\hline Five modular reduction algorithms & X & & Faster modular exponentiation for a variety of moduli. \\
 \hline Portable & X & & GnuPG requires configuration to build. \\
 \hline
 \end{tabular}
@@ -284,9 +284,12 @@ would require when working with large integers.
 So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your
 own application but I think there are reasons not to.  While LibTomMath is slower than libraries such as GnuMP it is
 not normally significantly slower.  On x86 machines the difference is normally a factor of two when performing modular
-exponentiations.
+exponentiations.  It depends largely on the processor, compiler and the moduli being used.
  
-Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.
+Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern.  However,
+on the other side of the coin LibTomMath offers you a totally free (public domain) well structured math library
+that is very flexible, complete and performs well in resource contrained environments.  Fast RSA for example can
+be performed with as little as 8KB of ram for data (again depending on build options).  
  
 \chapter{Getting Started with LibTomMath}
 \section{Building Programs}
@@ -809,7 +812,7 @@ mp\_int variables based on their digits only.
  
 \index{mp\_cmp\_mag}
 \begin{alltt}
-int mp_cmp(mp_int * a, mp_int * b);
+int mp_cmp_mag(mp_int * a, mp_int * b);
 \end{alltt}
 This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
 three compare codes listed in figure \ref{fig:CMP}.
@@ -1220,12 +1223,13 @@ int mp_sqr (mp_int * a, mp_int * b);
 \end{alltt}
  
 Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
-algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms.
+algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms because
+of the speed difference.  
  
 \section{Tuning Polynomial Basis Routines}
  
 Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
-the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectfully they require 
+the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectively they require 
 considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
 multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
 of 138).
@@ -1297,14 +1301,14 @@ of $b$.  This algorithm accepts an input $a$ of any range and is not limited by
 \section{Barrett Reduction}
  
 Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
-a decent speedup over straight division.  First a $mu$ value must be precomputed with the following function.
+a decent speedup over straight division.  First a $\mu$ value must be precomputed with the following function.
  
 \index{mp\_reduce\_setup}
 \begin{alltt}
 int mp_reduce_setup(mp_int *a, mp_int *b);
 \end{alltt}
  
-Given a modulus in $b$ this produces the required $mu$ value in $a$.  For any given modulus this only has to
+Given a modulus in $b$ this produces the required $\mu$ value in $a$.  For any given modulus this only has to
 be computed once.  Modular reduction can now be performed with the following.
  
 \index{mp\_reduce}
@@ -1312,7 +1316,7 @@ be computed once.  Modular reduction can now be performed with the following.
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 \end{alltt}
  
-This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$.  $a$ must be in the range
+This will reduce $a$ in place modulo $b$ with the precomputed $\mu$ value in $c$.  $a$ must be in the range
 $0 \le a < b^2$.
  
 \begin{alltt}
@@ -1578,7 +1582,8 @@ will return $-2$.
 This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
 the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
 values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
-$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$.
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$ or simply 
+$\left ( \left ( \left ( a^{1/2} \right )^{1/2} \right )^{1/2} \right )^{1/2}$
  
 \chapter{Prime Numbers}
 \section{Trial Division}

--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -21,8 +21,7 @@
 * Based on slow invmod except this is optimized for the case where b is 
 * odd as per HAC Note 14.64 on pp. 610
 */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
  mp_int  x, y, u, v, B, D;
  int     res, neg;
@@ -39,20 +38,20 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
  
  /* x == modulus, y == value to invert */
  if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
  }
  
  /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
+    goto LBL_ERR;
  }
  
  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
  }
  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
  }
  mp_set (&D, 1);
  
@@ -61,17 +60,17 @@ top:
  while (mp_iseven (&u) == 1) {
    /* 4.1 u = u/2 */
    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
    /* 4.2 if B is odd then */
    if (mp_isodd (&B) == 1) {
      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
      }
    }
    /* B = B/2 */
    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  
@@ -79,18 +78,18 @@ top:
  while (mp_iseven (&v) == 1) {
    /* 5.1 v = v/2 */
    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
    /* 5.2 if D is odd then */
    if (mp_isodd (&D) == 1) {
      /* D = (D-x)/2 */
      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-        goto __ERR;
+        goto LBL_ERR;
      }
    }
    /* D = D/2 */
    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  
@@ -98,20 +97,20 @@ top:
  if (mp_cmp (&u, &v) != MP_LT) {
    /* u = u - v, B = B - D */
    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  
    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  } else {
    /* v - v - u, D = D - B */
    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  
    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  
@@ -125,21 +124,21 @@ top:
  /* if v != 1 then there is no inverse */
  if (mp_cmp_d (&v, 1) != MP_EQ) {
    res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
  }
  
  /* b is now the inverse */
  neg = a->sign;
  while (D.sign == MP_NEG) {
    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  mp_exch (&D, c);
  c->sign = neg;
  res = MP_OKAY;
  
-__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
  return res;
 }
 #endif
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -23,8 +23,7 @@
 *
 * Based on Algorithm 14.32 on pp.601 of HAC.
 */
-int
-fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+int fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
  int     ix, res, olduse;
  mp_word W[MP_WARRAY];

--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@@ -31,8 +31,7 @@
 * Based on Algorithm 14.12 on pp.595 of HAC.
 *
 */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
  int     olduse, res, pa, ix, iz;
  mp_digit W[MP_WARRAY];
@@ -50,7 +49,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  
  /* clear the carry */
  _W = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
      int      tx, ty;
      int      iy;
      mp_digit *tmpx, *tmpy;
@@ -63,7 +62,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      tmpx = a->dp + tx;
      tmpy = b->dp + ty;
  
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially 
         while (tx++ < a->used && ty-- >= 0) { ... }
       */
      iy = MIN(a->used-tx, ty+1);
@@ -80,14 +79,17 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      _W = _W >> ((mp_word)DIGIT_BIT);
  }
  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
+
  /* setup dest */
  olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
  
  {
    register mp_digit *tmpc;
    tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
      /* now extract the previous digit [below the carry] */
      *tmpc++ = W[ix];
    }

--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -24,8 +24,7 @@
 *
 * Based on Algorithm 14.12 on pp.595 of HAC.
 */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
  int     olduse, res, pa, ix, iz;
  mp_digit W[MP_WARRAY];
@@ -42,7 +41,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  /* number of output digits to produce */
  pa = a->used + b->used;
  _W = 0;
-  for (ix = digs; ix <= pa; ix++) { 
+  for (ix = digs; ix < pa; ix++) { 
      int      tx, ty, iy;
      mp_digit *tmpx, *tmpy;
  
@@ -70,6 +69,9 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      /* make next carry */
      _W = _W >> ((mp_word)DIGIT_BIT);
  }
+  
+  /* store final carry */
+  W[ix] = (mp_digit)(_W & MP_MASK);
  
  /* setup dest */
  olduse  = c->used;

--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@@ -15,33 +15,14 @@
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
  
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
  
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
  
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -60,7 +41,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
  
  /* number of output digits to produce */
  W1 = 0;
-  for (ix = 0; ix <= pa; ix++) { 
+  for (ix = 0; ix < pa; ix++) { 
      int      tx, ty, iy;
      mp_word  _W;
      mp_digit *tmpy;
@@ -76,7 +57,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
      tmpx = a->dp + tx;
      tmpy = a->dp + ty;
  
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially
         while (tx++ < a->used && ty-- >= 0) { ... }
       */
      iy = MIN(a->used-tx, ty+1);
@@ -101,7 +82,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
      }
  
      /* store it */
-      W[ix] = _W;
+      W[ix] = (mp_digit)(_W & MP_MASK);
  
      /* make next carry */
      W1 = _W >> ((mp_word)DIGIT_BIT);

--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@@ -49,23 +49,23 @@ int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  
  mp_set(&tq, 1);
  n = mp_count_bits(a) - mp_count_bits(b);
-  if (((res = mp_copy(a, &ta)) != MP_OKAY) ||
-      ((res = mp_copy(b, &tb)) != MP_OKAY) || 
+  if (((res = mp_abs(a, &ta)) != MP_OKAY) ||
+      ((res = mp_abs(b, &tb)) != MP_OKAY) || 
      ((res = mp_mul_2d(&tb, n, &tb)) != MP_OKAY) ||
      ((res = mp_mul_2d(&tq, n, &tq)) != MP_OKAY)) {
-      goto __ERR;
+      goto LBL_ERR;
  }
  
  while (n-- >= 0) {
     if (mp_cmp(&tb, &ta) != MP_GT) {
        if (((res = mp_sub(&ta, &tb, &ta)) != MP_OKAY) ||
            ((res = mp_add(&q, &tq, &q)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
        }
     }
     if (((res = mp_div_2d(&tb, 1, &tb, NULL)) != MP_OKAY) ||
         ((res = mp_div_2d(&tq, 1, &tq, NULL)) != MP_OKAY)) {
-           goto __ERR;
+           goto LBL_ERR;
     }
  }
  
@@ -74,13 +74,13 @@ int mp_div(mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  n2 = (a->sign == b->sign ? MP_ZPOS : MP_NEG);
  if (c != NULL) {
     mp_exch(c, &q);
-     c->sign  = n2;
+     c->sign  = (mp_iszero(c) == MP_YES) ? MP_ZPOS : n2;
  }
  if (d != NULL) {
     mp_exch(d, &ta);
-     d->sign = n;
+     d->sign = (mp_iszero(d) == MP_YES) ? MP_ZPOS : n;
  }
-__ERR:
+LBL_ERR:
   mp_clear_multi(&ta, &tb, &tq, &q, NULL);
   return res;
 }
@@ -129,19 +129,19 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  q.used = a->used + 2;
  
  if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
+    goto LBL_Q;
  }
  
  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
+    goto LBL_T1;
  }
  
  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
+    goto LBL_T2;
  }
  
  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
+    goto LBL_X;
  }
  
  /* fix the sign */
@@ -153,10 +153,10 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  if (norm < (int)(DIGIT_BIT-1)) {
     norm = (DIGIT_BIT-1) - norm;
     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
     }
     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
+       goto LBL_Y;
     }
  } else {
     norm = 0;
@@ -168,13 +168,13 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  
  /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
-    goto __Y;
+    goto LBL_Y;
  }
  
  while (mp_cmp (&x, &y) != MP_LT) {
    ++(q.dp[n - t]);
    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
    }
  }
  
@@ -216,7 +216,7 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
      t1.dp[1] = y.dp[t];
      t1.used = 2;
      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
      }
  
      /* find right hand */
@@ -228,27 +228,27 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  
    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
    }
  
    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
    }
  
    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
+      goto LBL_Y;
    }
  
    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
    if (x.sign == MP_NEG) {
      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
      }
      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
      }
      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-        goto __Y;
+        goto LBL_Y;
      }
  
      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
@@ -275,11 +275,11 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  
  res = MP_OKAY;
  
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
+LBL_Y:mp_clear (&y);
+LBL_X:mp_clear (&x);
+LBL_T2:mp_clear (&t2);
+LBL_T1:mp_clear (&t1);
+LBL_Q:mp_clear (&q);
  return res;
 }
  

--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@@ -20,7 +20,7 @@
 * Based on algorithm from the paper
 *
 * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
+ *                 Chae Hoon Lim, Pil Joong Lee,
 *          POSTECH Information Research Laboratories
 *
 * The modulus must be of a special format [see manual]

--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@@ -61,25 +61,33 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     return err;
 #else 
     /* no invmod */
-     return MP_VAL
+     return MP_VAL;
 #endif
  }
  
+/* modified diminished radix reduction */
+#if defined(BN_MP_REDUCE_IS_2K_L_C) && defined(BN_MP_REDUCE_2K_L_C)
+  if (mp_reduce_is_2k_l(P) == MP_YES) {
+     return s_mp_exptmod(G, X, P, Y, 1);
+  }
+#endif
+
 #ifdef BN_MP_DR_IS_MODULUS_C
  /* is it a DR modulus? */
  dr = mp_dr_is_modulus(P);
 #else
+  /* default to no */
  dr = 0;
 #endif
  
 #ifdef BN_MP_REDUCE_IS_2K_C
-  /* if not, is it a uDR modulus? */
+  /* if not, is it a unrestricted DR modulus? */
  if (dr == 0) {
     dr = mp_reduce_is_2k(P) << 1;
  }
 #endif
    
-  /* if the modulus is odd or dr != 0 use the fast method */
+  /* if the modulus is odd or dr != 0 use the montgomery method */
 #ifdef BN_MP_EXPTMOD_FAST_C
  if (mp_isodd (P) == 1 || dr !=  0) {
    return mp_exptmod_fast (G, X, P, Y, dr);
@@ -87,7 +95,7 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 #endif
 #ifdef BN_S_MP_EXPTMOD_C
    /* otherwise use the generic Barrett reduction technique */
-    return s_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y, 0);
 #else
    /* no exptmod for evens */
    return MP_VAL;

--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -29,8 +29,7 @@
   #define TAB_SIZE 256
 #endif
  
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
  mp_int  M[TAB_SIZE], res;
  mp_digit buf, mp;
@@ -88,11 +87,11 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 #ifdef BN_MP_MONTGOMERY_SETUP_C     
     /* now setup montgomery  */
     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
     }
 #else
     err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
  
     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
@@ -108,7 +107,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
        redux = mp_montgomery_reduce;
 #else
        err = MP_VAL;
-        goto __M;
+        goto LBL_M;
 #endif
     }
  } else if (redmode == 1) {
@@ -118,24 +117,24 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     redux = mp_dr_reduce;
 #else
     err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
  } else {
 #if defined(BN_MP_REDUCE_2K_SETUP_C) && defined(BN_MP_REDUCE_2K_C)
     /* setup DR reduction for moduli of the form 2**k - b */
     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
-        goto __M;
+        goto LBL_M;
     }
     redux = mp_reduce_2k;
 #else
     err = MP_VAL;
-     goto __M;
+     goto LBL_M;
 #endif
  }
  
  /* setup result */
  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __M;
+    goto LBL_M;
  }
  
  /* create M table
@@ -149,45 +148,45 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 #ifdef BN_MP_MONTGOMERY_CALC_NORMALIZATION_C
     /* now we need R mod m */
     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
     }
 #else 
     err = MP_VAL;
-     goto __RES;
+     goto LBL_RES;
 #endif
  
     /* now set M[1] to G * R mod m */
     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
     }
  } else {
     mp_set(&res, 1);
     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
     }
  }
  
  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
+    goto LBL_RES;
  }
  
  for (x = 0; x < (winsize - 1); x++) {
    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
    }
    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
    }
  }
  
  /* create upper table */
  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
    }
    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
+      goto LBL_RES;
    }
  }
  
@@ -227,10 +226,10 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
    /* if the bit is zero and mode == 1 then we square */
    if (mode == 1 && y == 0) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
      continue;
    }
@@ -244,19 +243,19 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      /* square first */
      for (x = 0; x < winsize; x++) {
        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
        }
        if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
        }
      }
  
      /* then multiply */
      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
  
      /* empty window and reset */
@@ -271,10 +270,10 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
    /* square then multiply if the bit is set */
    for (x = 0; x < bitcpy; x++) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-        goto __RES;
+        goto LBL_RES;
      }
  
      /* get next bit of the window */
@@ -282,10 +281,10 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      if ((bitbuf & (1 << winsize)) != 0) {
        /* then multiply */
        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
        }
        if ((err = redux (&res, P, mp)) != MP_OKAY) {
-          goto __RES;
+          goto LBL_RES;
        }
      }
    }
@@ -299,15 +298,15 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      * of R.
      */
     if ((err = redux(&res, P, mp)) != MP_OKAY) {
-       goto __RES;
+       goto LBL_RES;
     }
  }
  
  /* swap res with Y */
  mp_exch (&res, Y);
  err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
+LBL_RES:mp_clear (&res);
+LBL_M:
  mp_clear(&M[1]);
  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
    mp_clear (&M[x]);

--- a/bn_mp_exteuclid.c
+++ b/bn_mp_exteuclid.c
@@ -59,6 +59,13 @@ int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
   }
  
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
   /* copy result out */
   if (U1 != NULL) { mp_exch(U1, &u1); }
   if (U2 != NULL) { mp_exch(U2, &u2); }

--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -43,7 +43,7 @@ int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
  }
  
  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
+    goto LBL_U;
  }
  
  /* must be positive for the remainder of the algorithm */
@@ -57,24 +57,24 @@ int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
  if (k > 0) {
     /* divide the power of two out */
     if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
     }
  
     if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
     }
  }
  
  /* divide any remaining factors of two out */
  if (u_lsb != k) {
     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
     }
  }
  
  if (v_lsb != k) {
     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
     }
  }
  
@@ -87,23 +87,23 @@ int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     
     /* subtract smallest from largest */
     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
     }
     
     /* Divide out all factors of two */
     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
-        goto __V;
+        goto LBL_V;
     } 
  } 
  
  /* multiply by 2**k which we divided out at the beginning */
  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
-     goto __V;
+     goto LBL_V;
  }
  c->sign = MP_ZPOS;
  res = MP_OKAY;
-__V:mp_clear (&u);
-__U:mp_clear (&v);
+LBL_V:mp_clear (&u);
+LBL_U:mp_clear (&v);
  return res;
 }
 #endif
--- a/bn_mp_invmod_slow.c
+++ b/bn_mp_invmod_slow.c
@@ -33,25 +33,25 @@ int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
  }
  
  /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
  }
  if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
  }
  
  /* 2. [modified] if x,y are both even then return an error! */
  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
    res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
  }
  
  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
  }
  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __ERR;
+    goto LBL_ERR;
  }
  mp_set (&A, 1);
  mp_set (&D, 1);
@@ -61,24 +61,24 @@ top:
  while (mp_iseven (&u) == 1) {
    /* 4.1 u = u/2 */
    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
    /* 4.2 if A or B is odd then */
    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
      /* A = (A+y)/2, B = (B-x)/2 */
      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
      }
      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
      }
    }
    /* A = A/2, B = B/2 */
    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  
@@ -86,24 +86,24 @@ top:
  while (mp_iseven (&v) == 1) {
    /* 5.1 v = v/2 */
    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
    /* 5.2 if C or D is odd then */
    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
      /* C = (C+y)/2, D = (D-x)/2 */
      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
      }
      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
      }
    }
    /* C = C/2, D = D/2 */
    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  
@@ -111,28 +111,28 @@ top:
  if (mp_cmp (&u, &v) != MP_LT) {
    /* u = u - v, A = A - C, B = B - D */
    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  
    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  
    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  } else {
    /* v - v - u, C = C - A, D = D - B */
    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  
    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  
    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __ERR;
+      goto LBL_ERR;
    }
  }
  
@@ -145,27 +145,27 @@ top:
  /* if v != 1 then there is no inverse */
  if (mp_cmp_d (&v, 1) != MP_EQ) {
    res = MP_VAL;
-    goto __ERR;
+    goto LBL_ERR;
  }
  
  /* if its too low */
  while (mp_cmp_d(&C, 0) == MP_LT) {
      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
      }
  }
  
  /* too big */
  while (mp_cmp_mag(&C, b) != MP_LT) {
      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
-         goto __ERR;
+         goto LBL_ERR;
      }
  }
  
  /* C is now the inverse */
  mp_exch (&C, c);
  res = MP_OKAY;
-__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+LBL_ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
  return res;
 }
 #endif
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@@ -50,13 +50,13 @@ int mp_jacobi (mp_int * a, mp_int * p, int *c)
  }
  
  if ((res = mp_init (&p1)) != MP_OKAY) {
-    goto __A1;
+    goto LBL_A1;
  }
  
  /* divide out larger power of two */
  k = mp_cnt_lsb(&a1);
  if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) {
-     goto __P1;
+     goto LBL_P1;
  }
  
  /* step 4.  if e is even set s=1 */
@@ -84,18 +84,18 @@ int mp_jacobi (mp_int * a, mp_int * p, int *c)
  } else {
    /* n1 = n mod a1 */
    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
    }
    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
-      goto __P1;
+      goto LBL_P1;
    }
    *c = s * r;
  }
  
  /* done */
  res = MP_OKAY;
-__P1:mp_clear (&p1);
-__A1:mp_clear (&a1);
+LBL_P1:mp_clear (&p1);
+LBL_A1:mp_clear (&a1);
  return res;
 }
 #endif
--- a/bn_mp_lcm.c
+++ b/bn_mp_lcm.c
@@ -28,20 +28,20 @@ int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
  
  /* t1 = get the GCD of the two inputs */
  if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
-    goto __T;
+    goto LBL_T;
  }
  
  /* divide the smallest by the GCD */
  if (mp_cmp_mag(a, b) == MP_LT) {
     /* store quotient in t2 such that t2 * b is the LCM */
     if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
     }
     res = mp_mul(b, &t2, c);
  } else {
     /* store quotient in t2 such that t2 * a is the LCM */
     if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
-        goto __T;
+        goto LBL_T;
     }
     res = mp_mul(a, &t2, c);
  }
@@ -49,7 +49,7 @@ int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
  /* fix the sign to positive */
  c->sign = MP_ZPOS;
  
-__T:
+LBL_T:
  mp_clear_multi (&t1, &t2, NULL);
  return res;
 }

--- a/bn_mp_mod_2d.c
+++ b/bn_mp_mod_2d.c
@@ -28,7 +28,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
  }
  
  /* if the modulus is larger than the value than return */
-  if (b > (int) (a->used * DIGIT_BIT)) {
+  if (b >= (int) (a->used * DIGIT_BIT)) {
    res = mp_copy (a, c);
    return res;
  }

--- a/bn_mp_montgomery_calc_normalization.c
+++ b/bn_mp_montgomery_calc_normalization.c
@@ -28,7 +28,6 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
  /* how many bits of last digit does b use */
  bits = mp_count_bits (b) % DIGIT_BIT;
  
-
  if (b->used > 1) {
     if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
        return res;

--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@@ -57,8 +57,9 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
    u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
  }
  
-  /* store final carry [if any] */
+  /* store final carry [if any] and increment ix offset  */
  *tmpc++ = u;
+  ++ix;
  
  /* now zero digits above the top */
  while (ix++ < olduse) {