Slightly faster DSA verification (BN_mod_exp2_mont),

marginally faster BN_mod_exp for 1024 bit exponents.

Slightly faster DSA verification (BN_mod_exp2_mont),
marginally faster BN_mod_exp for 1024 bit exponents.
dc434bbc · Bodo Möller · 947b3b8b · dc434bbc · dc434bbc · dc434bbc
8 changed files
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,17 @@

 Changes between 0.9.5a and 0.9.6  [xx XXX 2000]

+  *) Re-implement BN_mod_exp2_mont using independent (and larger) windows.
+     This makes DSA verification about 2 % faster.
+     [Bodo Moeller]
+
+  *) Increase maximum window size in BN_mod_exp_... to 6 bits instead of 5
+     (meaning that now 2^5 values will be precomputed, which is only 4 KB
+     plus overhead for 1024 bit moduli).
+     This makes exponentiations about 0.5 % faster for 1024 bit
+     exponents (as measured by "openssl speed rsa2048").
+     [Bodo Moeller]
+
  *) Rename memory handling macros to avoid conflicts with other
     software:
          Malloc         =>  OPENSSL_malloc
@@ -13,7 +24,7 @@
     [Richard Levitte]

  *) New function BN_mod_exp_mont_word for small bases (roughly 15-20%
-     faster than BN_mod_exp_mont).
+     faster than BN_mod_exp_mont, i.e. 7.5-10% for a full DH exchange).
     [Bodo Moeller]

  *) CygWin32 support.

--- a/TABLE
+++ b/TABLE
@@ -632,7 +632,7 @@ $dso_scheme   =

 *** debug-ben
 $cc           = gcc
-$cflags       = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -O2 -pedantic -Wall -Wshadow -Werror -pipe
+$cflags       = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -O2 -pedantic -Wall -Wshadow -Werror -pipe
 $unistd       = 
 $thread_cflag = (unknown)
 $lflags       = 
@@ -650,7 +650,7 @@ $dso_scheme   =

 *** debug-ben-debug
 $cc           = gcc
-$cflags       = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -g3 -O2 -pedantic -Wall -Wshadow -Werror -pipe
+$cflags       = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -g3 -O2 -pedantic -Wall -Wshadow -Werror -pipe
 $unistd       = 
 $thread_cflag = (unknown)
 $lflags       = 
@@ -1228,7 +1228,7 @@ $dso_scheme   =
 $cc           = cc
 $cflags       = -n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_SGI_MP_SOURCE
 $lflags       = 
 $bn_ops       = DES_PTR RC4_CHAR RC4_CHUNK_LL DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT
 $bn_obj       = asm/mips3.o
@@ -1246,7 +1246,7 @@ $dso_scheme   =
 $cc           = gcc
 $cflags       = -mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_SGI_MP_SOURCE
 $lflags       = 
 $bn_ops       = MD2_CHAR RC4_INDEX RC4_CHAR RC4_CHUNK_LL DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT
 $bn_obj       = asm/mips3.o
@@ -1264,7 +1264,7 @@ $dso_scheme   =
 $cc           = cc
 $cflags       = -64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_SGI_MP_SOURCE
 $lflags       = 
 $bn_ops       = RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG
 $bn_obj       = asm/mips3.o
@@ -1282,7 +1282,7 @@ $dso_scheme   =
 $cc           = gcc
 $cflags       = -mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_SGI_MP_SOURCE
 $lflags       = 
 $bn_ops       = RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG
 $bn_obj       = asm/mips3.o
@@ -1300,7 +1300,7 @@ $dso_scheme   =
 $cc           = ccc
 $cflags       = -fast -readonly_strings -DL_ENDIAN -DTERMIO
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_REENTRANT
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL
 $bn_obj       = asm/alpha.o
@@ -1318,7 +1318,7 @@ $dso_scheme   =
 $cc           = gcc
 $cflags       = -O3 -DL_ENDIAN -DTERMIO
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_REENTRANT
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
 $bn_obj       = asm/alpha.o
@@ -1336,7 +1336,7 @@ $dso_scheme   =
 $cc           = ccc
 $cflags       = -fast -readonly_strings -DL_ENDIAN -DTERMIO
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_REENTRANT
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL
 $bn_obj       = asm/alpha.o
@@ -1354,7 +1354,7 @@ $dso_scheme   =
 $cc           = gcc
 $cflags       = -O3 -DL_ENDIAN -DTERMIO
 $unistd       = 
-$thread_cflag = (unknown)
+$thread_cflag = -D_REENTRANT
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL
 $bn_obj       = asm/alpha.o
@@ -1859,7 +1859,7 @@ $cc           = gcc
 $cflags       = -O3 -mv8 -Dssize_t=int
 $unistd       = 
 $thread_cflag = (unknown)
-$lflags       = 
+$lflags       = -liberty
 $bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL DES_PTR DES_RISC1
 $bn_obj       = 
 $des_obj      = 

--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -485,6 +485,7 @@ BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num);
 #define BN_F_BN_CTX_NEW					 106
 #define BN_F_BN_DIV					 107
 #define BN_F_BN_EXPAND2					 108
+#define BN_F_BN_MOD_EXP2_MONT				 118
 #define BN_F_BN_MOD_EXP_MONT				 109
 #define BN_F_BN_MOD_EXP_MONT_WORD			 117
 #define BN_F_BN_MOD_INVERSE				 110

--- a/crypto/bn/bn_err.c
+++ b/crypto/bn/bn_err.c
@@ -76,8 +76,9 @@ static ERR_STRING_DATA BN_str_functs[]=
 {ERR_PACK(0,BN_F_BN_CTX_NEW,0),	"BN_CTX_new"},
 {ERR_PACK(0,BN_F_BN_DIV,0),	"BN_div"},
 {ERR_PACK(0,BN_F_BN_EXPAND2,0),	"bn_expand2"},
+{ERR_PACK(0,BN_F_BN_MOD_EXP2_MONT,0),	"BN_mod_exp2_mont"},
 {ERR_PACK(0,BN_F_BN_MOD_EXP_MONT,0),	"BN_mod_exp_mont"},
-{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0),	"BN_MOD_EXP_MONT_WORD"},
+{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0),	"BN_mod_exp_mont_word"},
 {ERR_PACK(0,BN_F_BN_MOD_INVERSE,0),	"BN_mod_inverse"},
 {ERR_PACK(0,BN_F_BN_MOD_MUL_RECIPROCAL,0),	"BN_mod_mul_reciprocal"},
 {ERR_PACK(0,BN_F_BN_MPI2BN,0),	"BN_mpi2bn"},

--- a/crypto/bn/bn_exp.c
+++ b/crypto/bn/bn_exp.c
@@ -121,7 +121,7 @@
 #endif


-#define TABLE_SIZE	16
+#define TABLE_SIZE	32

 /* slow but works */
 int BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
@@ -427,27 +427,22 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 	ts=1;

 	if (!BN_mod(&(val[0]),a,m,ctx)) goto err;		/* 1 */
-	if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx))
-		goto err;				/* 2 */
-
-	if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits >= 256)
-		window=5;	/* max size of window */
-	else if (bits >= 128)
-		window=4;
-	else
-		window=3;

-	j=1<<(window-1);
-	for (i=1; i<j; i++)
+	window = BN_window_bits_for_exponent_size(bits);
+	if (window > 1)
 		{
-		BN_init(&val[i]);
-		if (!BN_mod_mul_reciprocal(&(val[i]),&(val[i-1]),aa,&recp,ctx))
-			goto err;
+		if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx))
+			goto err;				/* 2 */
+		j=1<<(window-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&val[i]);
+			if (!BN_mod_mul_reciprocal(&(val[i]),&(val[i-1]),aa,&recp,ctx))
+				goto err;
+			}
+		ts=i;
 		}
-	ts=i;
-
+		
 	start=1;	/* This is used to avoid multiplication etc
 			 * when there is only the value '1' in the
 			 * buffer. */
@@ -574,25 +569,20 @@ int BN_mod_exp_mont(BIGNUM *rr, BIGNUM *a, const BIGNUM *p,
 	else
 		aa=a;
 	if (!BN_to_montgomery(&(val[0]),aa,mont,ctx)) goto err; /* 1 */
-	if (!BN_mod_mul_montgomery(d,&(val[0]),&(val[0]),mont,ctx)) goto err; /* 2 */
-
-	if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits >= 256)
-		window=5;	/* max size of window */
-	else if (bits >= 128)
-		window=4;
-	else
-		window=3;

-	j=1<<(window-1);
-	for (i=1; i<j; i++)
+	window = BN_window_bits_for_exponent_size(bits);
+	if (window > 1)
 		{
-		BN_init(&(val[i]));
-		if (!BN_mod_mul_montgomery(&(val[i]),&(val[i-1]),d,mont,ctx))
-			goto err;
+		if (!BN_mod_mul_montgomery(d,&(val[0]),&(val[0]),mont,ctx)) goto err; /* 2 */
+		j=1<<(window-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&(val[i]));
+			if (!BN_mod_mul_montgomery(&(val[i]),&(val[i-1]),d,mont,ctx))
+				goto err;
+			}
+		ts=i;
 		}
-	ts=i;

 	start=1;	/* This is used to avoid multiplication etc
 			 * when there is only the value '1' in the
@@ -787,26 +777,21 @@ int BN_mod_exp_simple(BIGNUM *r, BIGNUM *a, BIGNUM *p, BIGNUM *m,
 	BN_init(&(val[0]));
 	ts=1;
 	if (!BN_mod(&(val[0]),a,m,ctx)) goto err;		/* 1 */
-	if (!BN_mod_mul(d,&(val[0]),&(val[0]),m,ctx))
-		goto err;				/* 2 */
-
-	if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits >= 256)
-		window=5;	/* max size of window */
-	else if (bits >= 128)
-		window=4;
-	else
-		window=3;

-	j=1<<(window-1);
-	for (i=1; i<j; i++)
+	window = BN_window_bits_for_exponent_size(bits);
+	if (window > 1)
 		{
-		BN_init(&(val[i]));
-		if (!BN_mod_mul(&(val[i]),&(val[i-1]),d,m,ctx))
-			goto err;
+		if (!BN_mod_mul(d,&(val[0]),&(val[0]),m,ctx))
+			goto err;				/* 2 */
+		j=1<<(window-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&(val[i]));
+			if (!BN_mod_mul(&(val[i]),&(val[i-1]),d,m,ctx))
+				goto err;
+			}
+		ts=i;
 		}
-	ts=i;

 	start=1;	/* This is used to avoid multiplication etc
 			 * when there is only the value '1' in the

--- a/crypto/bn/bn_exp2.c
+++ b/crypto/bn/bn_exp2.c
+/* crypto/bn/bn_exp2.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
 #include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"

-/* I've done some timing with different table sizes.
- * The main hassle is that even with bits set at 3, this requires
- * 63 BIGNUMs to store the pre-calculated values.
- *          512   1024 
- * bits=1  75.4%  79.4%
- * bits=2  61.2%  62.4%
- * bits=3  61.3%  59.3%
- * The lack of speed improvement is also a function of the pre-calculation
- * which could be removed.
- */
-#define EXP2_TABLE_BITS	2 /* 1  2  3  4  5  */
-#define EXP2_TABLE_SIZE	4 /* 2  4  8 16 32  */
+#define TABLE_SIZE	32

 int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
 	     BIGNUM *p2, BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 	{
-	int i,j,k,bits,bits1,bits2,ret=0,wstart,wend,window,xvalue,yvalue;
-	int start=1,ts=0,x,y;
-	BIGNUM *d,*aa1,*aa2,*r;
-	BIGNUM val[EXP2_TABLE_SIZE][EXP2_TABLE_SIZE];
+	int i,j,bits,b,bits1,bits2,ret=0,wpos1,wpos2,window1,window2,wvalue1,wvalue2;
+	int r_is_one=1,ts1=0,ts2=0;
+	BIGNUM *d,*r;
+	BIGNUM *a_mod_m;
+	BIGNUM val1[TABLE_SIZE], val2[TABLE_SIZE];
 	BN_MONT_CTX *mont=NULL;

 	bn_check_top(a1);
@@ -32,7 +133,7 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,

 	if (!(m->d[0] & 1))
 		{
-		BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
+		BNerr(BN_F_BN_MOD_EXP2_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
 		return(0);
 		}
 	bits1=BN_num_bits(p1);
@@ -42,17 +143,13 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
 		BN_one(rr);
 		return(1);
 		}
+	bits=(bits1 > bits2)?bits1:bits2;

 	BN_CTX_start(ctx);
 	d = BN_CTX_get(ctx);
 	r = BN_CTX_get(ctx);
 	if (d == NULL || r == NULL) goto err;

-	bits=(bits1 > bits2)?bits1:bits2;
-
-	/* If this is not done, things will break in the montgomery
-	 * part */
-
 	if (in_mont != NULL)
 		mont=in_mont;
 	else
@@ -61,139 +158,143 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
 		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
 		}

-	BN_init(&(val[0][0]));
-	BN_init(&(val[1][1]));
-	BN_init(&(val[0][1]));
-	BN_init(&(val[1][0]));
-	ts=1;
+	window1 = BN_window_bits_for_exponent_size(bits1);
+	window2 = BN_window_bits_for_exponent_size(bits2);
+
+	/*
+	 * Build table for a1:   val1[i] := a1^(2*i + 1) mod m  for i = 0 .. 2^(window1-1) - 1
+	 */
+	BN_init(&val1[0]);
+	ts1=1;
 	if (BN_ucmp(a1,m) >= 0)
 		{
-		BN_mod(&(val[1][0]),a1,m,ctx);
-		aa1= &(val[1][0]);
+		if (!BN_mod(&(val1[0]),a1,m,ctx))
+			goto err;
+		a_mod_m = &(val1[0]);
 		}
 	else
-		aa1=a1;
+		a_mod_m = a1;
+	if (!BN_to_montgomery(&(val1[0]),a_mod_m,mont,ctx)) goto err;
+	if (window1 > 1)
+		{
+		if (!BN_mod_mul_montgomery(d,&(val1[0]),&(val1[0]),mont,ctx)) goto err;
+
+		j=1<<(window1-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&(val1[i]));
+			if (!BN_mod_mul_montgomery(&(val1[i]),&(val1[i-1]),d,mont,ctx))
+				goto err;
+			}
+		ts1=i;
+		}
+
+
+	/*
+	 * Build table for a2:   val2[i] := a2^(2*i + 1) mod m  for i = 0 .. 2^(window2-1) - 1
+	 */
+	BN_init(&val2[0]);
+	ts2=1;
 	if (BN_ucmp(a2,m) >= 0)
 		{
-		BN_mod(&(val[0][1]),a2,m,ctx);
-		aa2= &(val[0][1]);
+		if (!BN_mod(&(val2[0]),a2,m,ctx))
+			goto err;
+		a_mod_m = &(val2[0]);
 		}
 	else
-		aa2=a2;
-	if (!BN_to_montgomery(&(val[1][0]),aa1,mont,ctx)) goto err;
-	if (!BN_to_montgomery(&(val[0][1]),aa2,mont,ctx)) goto err;
-	if (!BN_mod_mul_montgomery(&(val[1][1]),
-		&(val[1][0]),&(val[0][1]),mont,ctx))
-		goto err;
-
-#if 0
-	if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits > 250)
-		window=5;	/* max size of window */
-	else if (bits >= 120)
-		window=4;
-	else
-		window=3;
-#else
-	window=EXP2_TABLE_BITS;
-#endif
-
-	k=1<<window;
-	for (x=0; x<k; x++)
+		a_mod_m = a2;
+	if (!BN_to_montgomery(&(val2[0]),a_mod_m,mont,ctx)) goto err;
+	if (window2 > 1)
 		{
-		if (x >= 2)
-			{
-			BN_init(&(val[x][0]));
-			BN_init(&(val[x][1]));
-			if (!BN_mod_mul_montgomery(&(val[x][0]),
-				&(val[1][0]),&(val[x-1][0]),mont,ctx)) goto err;
-			if (!BN_mod_mul_montgomery(&(val[x][1]),
-				&(val[1][0]),&(val[x-1][1]),mont,ctx)) goto err;
-			}
-		for (y=2; y<k; y++)
+		if (!BN_mod_mul_montgomery(d,&(val2[0]),&(val2[0]),mont,ctx)) goto err;
+
+		j=1<<(window2-1);
+		for (i=1; i<j; i++)
 			{
-			BN_init(&(val[x][y]));
-			if (!BN_mod_mul_montgomery(&(val[x][y]),
-				&(val[x][y-1]),&(val[0][1]),mont,ctx))
+			BN_init(&(val2[i]));
+			if (!BN_mod_mul_montgomery(&(val2[i]),&(val2[i-1]),d,mont,ctx))
 				goto err;
 			}
+		ts2=i;
 		}
-	ts=k;
-
-	start=1;	/* This is used to avoid multiplication etc
-			 * when there is only the value '1' in the
-			 * buffer. */
-	xvalue=0;	/* The 'x value' of the window */
-	yvalue=0;	/* The 'y value' of the window */
-	wstart=bits-1;	/* The top bit of the window */
-	wend=0;		/* The bottom bit of the window */
-
-        if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
-	for (;;)
+
+
+	/* Now compute the power product, using independent windows. */
+	r_is_one=1;
+	wvalue1=0;  /* The 'value' of the first window */
+	wvalue2=0;  /* The 'value' of the second window */
+	wpos1=0;    /* If wvalue1 > 0, the bottom bit of the first window */
+	wpos2=0;    /* If wvalue2 > 0, the bottom bit of the second window */
+
+	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
+	for (b=bits-1; b>=0; b--)
 		{
-		xvalue=BN_is_bit_set(p1,wstart);
-		yvalue=BN_is_bit_set(p2,wstart);
-		if (!(xvalue || yvalue))
+		if (!r_is_one)
 			{
-			if (!start)
+			if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
+				goto err;
+			}
+		
+		if (!wvalue1)
+			if (BN_is_bit_set(p1, b))
 				{
-				if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
-					goto err;
+				/* consider bits b-window1+1 .. b for this window */
+				i = b-window1+1;
+				while (!BN_is_bit_set(p1, i))
+					i++;
+				wpos1 = i;
+				wvalue1 = 1;
+				for (i = b-1; i >= wpos1; i--)
+					{
+					wvalue1 <<= 1;
+					if (BN_is_bit_set(p1, i))
+						wvalue1++;
+					}
 				}
-			wstart--;
-			if (wstart < 0) break;
-			continue;
-			}
-		/* We now have wstart on a 'set' bit, we now need to work out
-		 * how bit a window to do.  To do this we need to scan
-		 * forward until the last set bit before the end of the
-		 * window */
-		j=wstart;
-		/* xvalue=BN_is_bit_set(p1,wstart); already set */
-		/* yvalue=BN_is_bit_set(p1,wstart); already set */
-		wend=0;
-		for (i=1; i<window; i++)
-			{
-			if (wstart-i < 0) break;
-			xvalue+=xvalue;
-			xvalue|=BN_is_bit_set(p1,wstart-i);
-			yvalue+=yvalue;
-			yvalue|=BN_is_bit_set(p2,wstart-i);
-			}
-
-		/* i is the size of the current window */
-		/* add the 'bytes above' */
-		if (!start)
-			for (j=0; j<i; j++)
+		
+		if (!wvalue2)
+			if (BN_is_bit_set(p2, b))
 				{
-				if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
-					goto err;
+				/* consider bits b-window2+1 .. b for this window */
+				i = b-window2+1;
+				while (!BN_is_bit_set(p2, i))
+					i++;
+				wpos2 = i;
+				wvalue2 = 1;
+				for (i = b-1; i >= wpos2; i--)
+					{
+					wvalue2 <<= 1;
+					if (BN_is_bit_set(p2, i))
+						wvalue2++;
+					}
 				}
+
+		if (wvalue1 && b == wpos1)
+			{
+			/* wvalue1 is odd and < 2^window1 */
+			if (!BN_mod_mul_montgomery(r,r,&(val1[wvalue1>>1]),mont,ctx))
+				goto err;
+			wvalue1 = 0;
+			r_is_one = 0;
+			}
 		
-		/* wvalue will be an odd number < 2^window */
-		if (xvalue || yvalue)
+		if (wvalue2 && b == wpos2)
 			{
-			if (!BN_mod_mul_montgomery(r,r,&(val[xvalue][yvalue]),
-				mont,ctx)) goto err;
+			/* wvalue2 is odd and < 2^window2 */
+			if (!BN_mod_mul_montgomery(r,r,&(val2[wvalue2>>1]),mont,ctx))
+				goto err;
+			wvalue2 = 0;
+			r_is_one = 0;
 			}
-
-		/* move the 'window' down further */
-		wstart-=i;
-		start=0;
-		if (wstart < 0) break;
 		}
 	BN_from_montgomery(rr,r,mont,ctx);
 	ret=1;
 err:
 	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
 	BN_CTX_end(ctx);
-	for (i=0; i<ts; i++)
-		{
-		for (j=0; j<ts; j++)
-			{
-			BN_clear_free(&(val[i][j]));
-			}
-		}
+	for (i=0; i<ts1; i++)
+		BN_clear_free(&(val1[i]));
+	for (i=0; i<ts2; i++)
+		BN_clear_free(&(val2[i]));
 	return(ret);
 	}
--- a/crypto/bn/bn_lcl.h
+++ b/crypto/bn/bn_lcl.h
@@ -55,6 +55,59 @@
 * copied and put under another distribution licence
 * [including the GNU Public Licence.]
 */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */

 #ifndef HEADER_BN_LCL_H
 #define HEADER_BN_LCL_H
@@ -65,6 +118,51 @@
 extern "C" {
 #endif

+
+/*
+ * BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
+ *
+ *
+ * For window size 'w' (w >= 2) and a random 'b' bits exponent,
+ * the number of multiplications is a constant plus on average
+ *
+ *    2^(w-1) + (b-w)/(w+1);
+ *
+ * here  2^(w-1)  is for precomputing the table (we actually need
+ * entries only for windows that have the lowest bit set), and
+ * (b-w)/(w+1)  is an approximation for the expected number of
+ * w-bit windows, not counting the first one.
+ *
+ * Thus we should use
+ *
+ *    w >= 6  if        b > 671
+ *     w = 5  if  671 > b > 239
+ *     w = 4  if  239 > b >  79
+ *     w = 3  if   79 > b >  23
+ *    w <= 2  if   23 > b
+ *
+ * (with draws in between).  Very small exponents are often selected
+ * with low Hamming weight, so we use  w = 1  for b <= 23.
+ */
+#if 1
+#define BN_window_bits_for_exponent_size(b) \
+		((b) > 671 ? 6 : \
+		 (b) > 239 ? 5 : \
+		 (b) >  79 ? 4 : \
+		 (b) >  23 ? 3 : 1)
+#else
+/* Old SSLeay/OpenSSL table.
+ * Maximum window size was 5, so this table differs for b==1024;
+ * but it coincides for other interesting values (b==160, b==512).
+ */
+#define BN_window_bits_for_exponent_size(b) \
+		((b) > 255 ? 5 : \
+		 (b) > 127 ? 4 : \
+		 (b) >  17 ? 3 : 1)
+#endif	 
+
+
+
 /* Pentium pro 16,16,16,32,64 */
 /* Alpha       16,16,16,16.64 */
 #define BN_MULL_SIZE_NORMAL			(16) /* 32 */

--- a/util/libeay.num
+++ b/util/libeay.num
@@ -1801,3 +1801,5 @@ X509_CRL_digest                         2391
 d2i_ASN1_SET_OF_PKCS7                   2397
 EVP_CIPHER_CTX_set_key_length           2399
 EVP_CIPHER_CTX_ctrl                     2400
+BN_mod_exp_mont_word                    2401
+RAND_egd_bytes                          2402