diff --git a/CHANGES b/CHANGES
index 90b0dd12553236f4b64eaca8a4b5c8be6bd6186e..2437b197eb4c9ef528ae7cf4b928d5d968407d2f 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,13 @@
 
  Changes between 0.9.6 and 0.9.7  [xx XXX 2000]
 
+  *) Implement binary inversion algorithm for BN_mod_inverse in addition
+     to the algorithm using long divison.  The binary algorithm can be
+     used only if the modulus is odd.  It is faster only for relatively
+     small moduli (roughly 20% for 128-bit moduli, roughly 5% for 256-bit
+     moduli), so we use it only for moduli up to 400 bits.
+     [Bodo Moeller]
+
   *) Change bctest again: '-x' expressions are not available in all
      versions of 'test'.
      [Bodo Moeller]
diff --git a/crypto/bn/bn_gcd.c b/crypto/bn/bn_gcd.c
index d5caf5136f1248007dc6bb6f215665351cc055a9..2c6fb2379a99328add8f52434e3c50efcd2d39d9 100644
--- a/crypto/bn/bn_gcd.c
+++ b/crypto/bn/bn_gcd.c
@@ -56,7 +56,7 @@
  * [including the GNU Public Licence.]
  */
 /* ====================================================================
- * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -240,131 +240,216 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
 	/* From  B = a mod |n|,  A = |n|  it follows that
 	 *
 	 *      0 <= B < A,
-	 *      sign*X*a  ==  B   (mod |n|),
-	 *     -sign*Y*a  ==  A   (mod |n|).
+	 *     -sign*X*a  ==  B   (mod |n|),
+	 *      sign*Y*a  ==  A   (mod |n|).
 	 */
 
-	while (!BN_is_zero(B))
+	if (BN_is_odd(n) && (BN_num_bits(n) <= 400))
 		{
-		BIGNUM *tmp;
+		/* Binary inversion algorithm; requires odd modulus.
+		 * This is faster than the general algorithm if the modulus
+		 * is sufficiently small. */
+		int shift;
+		
+		while (!BN_is_zero(B))
+			{
+			/*
+			 *      0 < B < A <= |n|,
+			 * (1) -sign*X*a  ==  B   (mod |n|),
+			 * (2)  sign*Y*a  ==  A   (mod |n|)
+			 */
+
+			/* Now divide  B  by the maximum possible power of two in the integers,
+			 * and divide  X  by the same value mod |n|.
+			 * When we're done, (1) still holds. */
+			shift = 0;
+			while (!BN_is_bit_set(B, shift)) /* note that 0 < B */
+				{
+				shift++;
+				
+				if (BN_is_odd(X))
+					{
+					if (!BN_uadd(X, X, n)) goto err;
+					}
+				/* now X is even, so we can easily divide it by two */
+				if (!BN_rshift1(X, X)) goto err;
+				}
+			if (shift > 0)
+				{
+				if (!BN_rshift(B, B, shift)) goto err;
+				}
 
-		/*
-		 *      0 < B < A,
-		 * (*)  sign*X*a  ==  B   (mod |n|),
-		 *     -sign*Y*a  ==  A   (mod |n|)
-		 */
 
-		/* (D, M) := (A/B, A%B) ... */
-		if (BN_num_bits(A) == BN_num_bits(B))
-			{
-			if (!BN_one(D)) goto err;
-			if (!BN_sub(M,A,B)) goto err;
+			/* Same for  A  and  Y.  Afterwards, (2) still holds. */
+			shift = 0;
+			while (!BN_is_bit_set(A, shift)) /* note that 0 < A */
+				{
+				shift++;
+				
+				if (BN_is_odd(Y))
+					{
+					if (!BN_uadd(Y, Y, n)) goto err;
+					}
+				/* now Y is even */
+				if (!BN_rshift1(Y, Y)) goto err;
+				}
+			if (shift > 0)
+				{
+				if (!BN_rshift(A, A, shift)) goto err;
+				}
+
+			
+			/* We still have (1) and (2), but  A  may no longer be larger than  B.
+			 * Both  A  and  B  are odd.
+			 * The following computations ensure that
+			 *
+			 *      0 =< B < A = |n|,
+			 * (1) -sign*X*a  ==  B   (mod |n|),
+			 * (2)  sign*Y*a  ==  A   (mod |n|)
+			 */
+			if (BN_ucmp(B, A) >= 0)
+				{
+				/* -sign*(X + Y)*a == B - A  (mod |n|) */
+				if (!BN_uadd(X, X, Y)) goto err;
+				/* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
+				 * actually makes the algorithm slower */
+				if (!BN_usub(B, B, A)) goto err;
+				}
+			else
+				{
+				/*  sign*(X + Y)*a == A - B  (mod |n|) */
+				if (!BN_uadd(Y, Y, X)) goto err;
+				/* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down */
+				if (!BN_usub(A, A, B)) goto err;
+				}
 			}
-		else if (BN_num_bits(A) == BN_num_bits(B) + 1)
+		}
+	else
+		{
+		/* general inversion algorithm (less efficient than binary inversion) */
+
+		while (!BN_is_zero(B))
 			{
-			/* A/B is 1, 2, or 3 */
-			if (!BN_lshift1(T,B)) goto err;
-			if (BN_ucmp(A,T) < 0)
+			BIGNUM *tmp;
+			
+			/*
+			 *      0 < B < A,
+			 * (*) -sign*X*a  ==  B   (mod |n|),
+			 *      sign*Y*a  ==  A   (mod |n|)
+			 */
+			
+			/* (D, M) := (A/B, A%B) ... */
+			if (BN_num_bits(A) == BN_num_bits(B))
 				{
-				/* A < 2*B, so D=1 */
 				if (!BN_one(D)) goto err;
 				if (!BN_sub(M,A,B)) goto err;
 				}
-			else
+			else if (BN_num_bits(A) == BN_num_bits(B) + 1)
 				{
-				/* A >= 2*B, so D=2 or D=3 */
-				if (!BN_sub(M,A,T)) goto err;
-				if (!BN_add(D,T,B)) goto err; /* use D (:= 3*B) as temp */
-				if (BN_ucmp(A,D) < 0)
+				/* A/B is 1, 2, or 3 */
+				if (!BN_lshift1(T,B)) goto err;
+				if (BN_ucmp(A,T) < 0)
 					{
-					/* A < 3*B, so D=2 */
-					if (!BN_set_word(D,2)) goto err;
-					/* M (= A - 2*B) already has the correct value */
+					/* A < 2*B, so D=1 */
+					if (!BN_one(D)) goto err;
+					if (!BN_sub(M,A,B)) goto err;
 					}
 				else
 					{
-					/* only D=3 remains */
-					if (!BN_set_word(D,3)) goto err;
-					/* currently  M = A - 2*B,  but we need  M = A - 3*B */
-					if (!BN_sub(M,M,B)) goto err;
+					/* A >= 2*B, so D=2 or D=3 */
+					if (!BN_sub(M,A,T)) goto err;
+					if (!BN_add(D,T,B)) goto err; /* use D (:= 3*B) as temp */
+					if (BN_ucmp(A,D) < 0)
+						{
+						/* A < 3*B, so D=2 */
+						if (!BN_set_word(D,2)) goto err;
+						/* M (= A - 2*B) already has the correct value */
+						}
+					else
+						{
+						/* only D=3 remains */
+						if (!BN_set_word(D,3)) goto err;
+						/* currently  M = A - 2*B,  but we need  M = A - 3*B */
+						if (!BN_sub(M,M,B)) goto err;
+						}
 					}
 				}
-			}
-		else
-			{
-			if (!BN_div(D,M,A,B,ctx)) goto err;
-			}
-		
-		/* Now
-		 *      A = D*B + M;
-		 * thus we have
-		 * (**) -sign*Y*a  ==  D*B + M   (mod |n|).
-		 */
-		
-		tmp=A; /* keep the BIGNUM object, the value does not matter */
-
-		/* (A, B) := (B, A mod B) ... */
-		A=B;
-		B=M;
-		/* ... so we have  0 <= B < A  again */
-
-		/* Since the former  M  is now  B  and the former  B  is now  A,
-		 * (**) translates into
-		 *      -sign*Y*a  ==  D*A + B    (mod |n|),
-		 * i.e.
-		 *      -sign*Y*a - D*A  ==  B    (mod |n|).
-		 * Similarly, (*) translates into
-		 *       sign*X*a  ==  A          (mod |n|).
-		 *
-		 * Thus,
-		 *  -sign*Y*a - D*sign*X*a  ==  B  (mod |n|),
-		 * i.e.
-		 *       -sign*(Y + D*X)*a  ==  B  (mod |n|).
-		 *
-		 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
-		 *       sign*X*a  ==  B   (mod |n|),
-		 *      -sign*Y*a  ==  A   (mod |n|).
-		 * Note that  X  and  Y  stay non-negative all the time.
-		 */
-
-		/* most of the time D is very small, so we can optimize tmp := D*X+Y */
-		if (BN_is_one(D))
-			{
-			if (!BN_add(tmp,X,Y)) goto err;
-			}
-		else
-			{
-			if (BN_is_word(D,2))
-				{
-				if (!BN_lshift1(tmp,X)) goto err;
-				}
-			else if (BN_is_word(D,4))
+			else
 				{
-				if (!BN_lshift(tmp,X,2)) goto err;
+				if (!BN_div(D,M,A,B,ctx)) goto err;
 				}
-			else if (D->top == 1)
+			
+			/* Now
+			 *      A = D*B + M;
+			 * thus we have
+			 * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+			 */
+			
+			tmp=A; /* keep the BIGNUM object, the value does not matter */
+			
+			/* (A, B) := (B, A mod B) ... */
+			A=B;
+			B=M;
+			/* ... so we have  0 <= B < A  again */
+			
+			/* Since the former  M  is now  B  and the former  B  is now  A,
+			 * (**) translates into
+			 *       sign*Y*a  ==  D*A + B    (mod |n|),
+			 * i.e.
+			 *       sign*Y*a - D*A  ==  B    (mod |n|).
+			 * Similarly, (*) translates into
+			 *      -sign*X*a  ==  A          (mod |n|).
+			 *
+			 * Thus,
+			 *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+			 * i.e.
+			 *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+			 *
+			 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+			 *      -sign*X*a  ==  B   (mod |n|),
+			 *       sign*Y*a  ==  A   (mod |n|).
+			 * Note that  X  and  Y  stay non-negative all the time.
+			 */
+			
+			/* most of the time D is very small, so we can optimize tmp := D*X+Y */
+			if (BN_is_one(D))
 				{
-				if (!BN_copy(tmp,X)) goto err;
-				if (!BN_mul_word(tmp,D->d[0])) goto err;
+				if (!BN_add(tmp,X,Y)) goto err;
 				}
 			else
 				{
-				if (!BN_mul(tmp,D,X,ctx)) goto err;
+				if (BN_is_word(D,2))
+					{
+					if (!BN_lshift1(tmp,X)) goto err;
+					}
+				else if (BN_is_word(D,4))
+					{
+					if (!BN_lshift(tmp,X,2)) goto err;
+					}
+				else if (D->top == 1)
+					{
+					if (!BN_copy(tmp,X)) goto err;
+					if (!BN_mul_word(tmp,D->d[0])) goto err;
+					}
+				else
+					{
+					if (!BN_mul(tmp,D,X,ctx)) goto err;
+					}
+				if (!BN_add(tmp,tmp,Y)) goto err;
 				}
-			if (!BN_add(tmp,tmp,Y)) goto err;
+			
+			M=Y; /* keep the BIGNUM object, the value does not matter */
+			Y=X;
+			X=tmp;
+			sign = -sign;
 			}
-		
-		M=Y; /* keep the BIGNUM object, the value does not matter */
-		Y=X;
-		X=tmp;
-		sign = -sign;
 		}
-
+		
 	/*
 	 * The while loop (Euclid's algorithm) ends when
 	 *      A == gcd(a,n);
 	 * we have
-	 *      -sign*Y*a  ==  A  (mod |n|),
+	 *       sign*Y*a  ==  A  (mod |n|),
 	 * where  Y  is non-negative.
 	 */
 
@@ -378,7 +463,7 @@ BIGNUM *BN_mod_inverse(BIGNUM *in,
 	if (BN_is_one(A))
 		{
 		/* Y*a == 1  (mod |n|) */
-		if (BN_ucmp(Y,n) < 0)
+		if (!Y->neg && BN_ucmp(Y,n) < 0)
 			{
 			if (!BN_copy(R,Y)) goto err;
 			}
diff --git a/crypto/bn/bn_mod.c b/crypto/bn/bn_mod.c
index 92fe11684c8c914a7a006fc72f5693841c8fcb87..5cf82480d7bafb8e61b52ef62f44118e1acf9d30 100644
--- a/crypto/bn/bn_mod.c
+++ b/crypto/bn/bn_mod.c
@@ -150,8 +150,8 @@ int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_
 int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
 	{
 	if (!BN_add(r, a, b)) return 0;
-	if (BN_cmp(r, m) >= 0)
-		return BN_sub(r, r, m);
+	if (BN_ucmp(r, m) >= 0)
+		return BN_usub(r, r, m);
 	return 1;
 	}