Bug-fix in CBC encrypt tail processing and commentary section update.

bac252a5 · Andy Polyakov · a963395a · bac252a5
隐藏空白更改
内联并排

Showing 1 changed file with 29 additions and 16 deletions

crypto/aes/asm/aes-586.pl crypto/aes/asm/aes-586.pl +29 -16

未找到文件。
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -6,7 +6,7 @@
 # forms are granted according to the OpenSSL license.
 # ====================================================================
 #
-# Version 3.0.
+# Version 3.1.
 #
 # You might fail to appreciate this module performance from the first
 # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@@ -46,23 +46,27 @@
 # Instruction Level Parallelism, and it indeed resulted in up to 15%
 # better performance on most recent µ-archs...
 #
-# Current ECB performance numbers for 128-bit key in cycles per byte
-# [measure commonly used by AES benchmarkers] are:
+# Current ECB performance numbers for 128-bit key in CPU cycles per
+# processed byte [measure commonly used by AES benchmarkers] are:
 #
 #		small footprint		fully unrolled
 # P4[-3]	23[24]			22[23]
 # AMD K8	19			18
-# PIII		26(*)			23
+# PIII		26			23
 # Pentium	63(*)			52
 #
 # (*)	Performance difference between small footprint code and fully
-#	unrolled in more commonly used CBC mode is not as big, 7% for
-#	PIII and 15% for Pentium, which I consider tolerable.
+#	unrolled in more commonly used CBC mode is not as big, 4% for
+#	Pentium. PIII's ~13% difference [in both cases in 3rd
+#	version] is considered tolerable...
 #
 # Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance imrovement of CBC benchmark results [on most
-# recent µ-archs]. CBC performance is virtually as good as ECB now and
-# sometimes even better, because function prologues and epilogues are
+# up to 40% performance improvement of CBC benchmark results. 40% was
+# observed on P4 core, where "overall" improvement coefficient, i.e. if
+# compared to PIC generated by GCC and in CBC mode, was observed to be
+# as large as 4x:-) CBC performance is virtually identical to ECB now
+# and on some platforms even better, e.g. 56 "small" cycles/byte on
+# senior Pentium, because certain function prologues and epilogues are
 # effectively taken out of the loop...

 push(@INC,"perlasm","../../perlasm");
@@ -79,8 +83,9 @@ $acc="esi";

 $small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
 			# recent µ-archs], but ~5 times smaller!
-			# I favor compact code, because it minimizes
-			# cache contention...
+			# I favor compact code to minimize cache
+			# contention and in hope to "collect" 5% back
+			# in real-life applications...
 $vertical_spin=0;	# shift "vertically" defaults to 0, because of
 			# its proof-of-concept status...

@@ -1296,12 +1301,18 @@ sub declast()
 	&push	($key eq "edi" ? $key : "");	# push ivp
 	&pushf	();
 	&mov	($key,&wparam(1));		# load out
-	&xor	($s0,$s0);
-	&mov	(&DWP(0,$key),$s0);		# zero output
-	&mov	(&DWP(4,$key),$s0);
-	&mov	(&DWP(8,$key),$s0);
-	&mov	(&DWP(12,$key),$s0);
+	&mov	($s1,16);
+	&sub	($s1,$s2);
+	&cmp	($key,$acc);			# compare with inp
+	&je	(&label("enc_in_place"));
 	&data_word(0x90A4F3FC);	# cld; rep movsb; nop	# copy input
+	&jmp	(&label("enc_skip_in_place"));
+    &set_label("enc_in_place");
+	&lea	($key,&DWP(0,$key,$s2));
+    &set_label("enc_skip_in_place");
+	&mov	($s2,$s1);
+	&xor	($s0,$s0);
+	&data_word(0x90AAF3FC);	# cld; rep stosb; nop	# zero tail
 	&popf	();
 	&pop	($key);				# pop ivp

@@ -1456,6 +1467,8 @@ sub declast()
 	&pushf	();
 	&data_word(0x90A4F3FC);	# cld; rep movsb; nop	# restore tail
 	&popf	();
+
+    &align	(4);
    &set_label("dec_out");
    &stack_pop(5);
 &function_end("AES_cbc_encrypt");