From 7363455fac97cd1a7cb94f4002db7ce40537bc4b Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 20 Jul 1999 15:50:20 +0000 Subject: [PATCH] MIPS III/IV assembler module is reimplemented. --- CHANGES | 3 + Configure | 4 +- crypto/bn/Makefile.ssl | 2 +- crypto/bn/asm/mips3.s | 2062 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 2068 insertions(+), 3 deletions(-) create mode 100644 crypto/bn/asm/mips3.s diff --git a/CHANGES b/CHANGES index 8b55de5c06..df10c894b2 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,9 @@ Changes between 0.9.3a and 0.9.4 + *) MIPS III/IV assembler module is reimplemented. + [Andy Polyakov] + *) More DES library cleanups: remove references to srand/rand and delete an unused file. [Ulf Möller] diff --git a/Configure b/Configure index 07cbb1e619..4b68a097dd 100755 --- a/Configure +++ b/Configure @@ -134,10 +134,10 @@ my %table=( "irix-gcc","gcc:-O2 -DTERMIOS -DB_ENDIAN::(unknown)::BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR:::", "irix-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::BN_LLONG DES_PTR DES_RISC2 DES_UNROLL BF_PTR:::", "irix-mips3-gcc","gcc:-mips3 -O2 -DTERMIOS -DB_ENDIAN::(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:::", -"irix-mips3-cc", "cc:-n32 -mips3 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:::", +"irix-mips3-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::", "debug-irix-cc", "cc:-w2 -g -DCRYPTO_MDEBUG -DTERMIOS -DB_ENDIAN::(unknown):::::", # This is the n64 mode build. -"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:::", +"irix64-mips4-cc", "cc:-64 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::", # HPUX 9.X config. # Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or diff --git a/crypto/bn/Makefile.ssl b/crypto/bn/Makefile.ssl index d31972d9d5..93c2a0195f 100644 --- a/crypto/bn/Makefile.ssl +++ b/crypto/bn/Makefile.ssl @@ -20,6 +20,7 @@ BN_ASM= bn_asm.o #BN_ASM= bn86-elf.o CFLAGS= $(INCLUDES) $(CFLAG) +ASFLAGS=$(CFLAGS) GENERAL=Makefile TEST=bntest.c exptest.c @@ -110,7 +111,6 @@ asm/sparcv8plus-gcc.o: asm/sparcv8plus.S # MIPS 64 bit assember asm/mips3.o: asm/mips3.s - /usr/bin/as -mips3 -O2 -o asm/mips3.o asm/mips3.s # MIPS 32 bit assember asm/mips1.o: asm/mips1.s diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s new file mode 100644 index 0000000000..2193c954b2 --- /dev/null +++ b/crypto/bn/asm/mips3.s @@ -0,0 +1,2062 @@ +.rdata +.asciiz "mips3.s, Version 1.0 (prerelease)" +.asciiz "MIPS III/IV ISA artwork by Andy Polyakov " + +/* + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. + * + * Rights for redistribution and usage in source and binary forms are + * granted according to the OpenSSL license. Warranty of any kind is + * disclaimed. + * ==================================================================== + */ + +/* + * This is my modest contributon to the OpenSSL project (see + * http://www.openssl.org/ for more information about it) and is + * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c + * module. For updates see http://fy.chalmers.se/~appro/hpe/. + * + * The module is designed to work with "new" IRIX ABI(5), namely + * N32 and N64. But it was tested only with MIPSpro 7.2.x assembler, + * i.e. depends on preprocessor options set up by MIPSspro 7.2.x + * driver. Another neat gadget offered by MIPSpro 7.2.x assembler is + * an peep-hole(?) optimization pass. This gave me the opportunity + * to make the code looking more regular as all those architecture + * dependent(!) instruction rescheduling details were left to the + * assembler. Cool, huh? Do note that I have no idea if GNU assembler + * does anything similar nor how GNU C will do with this module. + * Feedback on the matter is therefore very much appreciated:-) + * + * Performance improvement is astonishing! 'apps/openssl speed rsa dsa' + * exhibits 3-3.5-3.7 times improvement! + * + * + */ +#include +#include + +#if _MIPS_ISA>=4 +#define MOVNZ(cond,dst,src) \ + movn dst,src,cond +#else +#define MOVNZ(cond,dst,src) \ + .set noreorder; \ + bnezl cond,.+8; \ + move dst,src; \ + .set reorder +#endif + +.text + +.set noat +.set reorder + +#define MINUS4 v1 + +LEAF(bn_mul_add_words) + .align 5 + .set noreorder + bgtzl a2,.L_bn_mul_add_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_mul_add_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + dmultu t0,a3 + ld t1,0(a0) + ld t2,8(a1) + ld t3,8(a0) + ld ta0,16(a1) + ld ta1,16(a0) + daddu t1,v0 + sltu v0,t1,v0 /* All manuals say it "compares 32-bit + * values", but it seems to work fine + * even on 64-bit registers. */ + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,0(a0) + daddu v0,AT + + dmultu t2,a3 + ld ta2,24(a1) + ld ta3,24(a0) + daddu t3,v0 + sltu v0,t3,v0 + mflo AT + mfhi t2 + daddu t3,AT + daddu v0,t2 + sltu AT,t3,AT + sd t3,8(a0) + daddu v0,AT + + dmultu ta0,a3 + subu a2,4 + PTR_ADD a0,32 + PTR_ADD a1,32 + daddu ta1,v0 + sltu v0,ta1,v0 + mflo AT + mfhi ta0 + daddu ta1,AT + daddu v0,ta0 + sltu AT,ta1,AT + sd ta1,-16(a0) + daddu v0,AT + + + dmultu ta2,a3 + and ta0,a2,MINUS4 + daddu ta3,v0 + sltu v0,ta3,v0 + mflo AT + mfhi ta2 + daddu ta3,AT + daddu v0,ta2 + sltu AT,ta3,AT + sd ta3,-8(a0) + daddu v0,AT + .set noreorder + bgtzl ta0,.L_bn_mul_add_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_mul_add_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_mul_add_words_return: + jr ra + +.L_bn_mul_add_words_tail: + dmultu t0,a3 + ld t1,0(a0) + subu a2,1 + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,0(a0) + daddu v0,AT + beqz a2,.L_bn_mul_add_words_return + + ld t0,8(a1) + dmultu t0,a3 + ld t1,8(a0) + subu a2,1 + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,8(a0) + daddu v0,AT + beqz a2,.L_bn_mul_add_words_return + + ld t0,16(a1) + dmultu t0,a3 + ld t1,16(a0) + daddu t1,v0 + sltu v0,t1,v0 + mflo AT + mfhi t0 + daddu t1,AT + daddu v0,t0 + sltu AT,t1,AT + sd t1,16(a0) + daddu v0,AT + jr ra +END(bn_mul_add_words) + +LEAF(bn_mul_words) + .align 5 + .set noreorder + bgtzl a2,.L_bn_mul_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_mul_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + dmultu t0,a3 + ld t2,8(a1) + ld ta0,16(a1) + ld ta2,24(a1) + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,0(a0) + daddu v0,t1,t0 + + dmultu t2,a3 + subu a2,4 + PTR_ADD a0,32 + PTR_ADD a1,32 + mflo AT + mfhi t2 + daddu v0,AT + sltu t3,v0,AT + sd v0,-24(a0) + daddu v0,t3,t2 + + dmultu ta0,a3 + mflo AT + mfhi ta0 + daddu v0,AT + sltu ta1,v0,AT + sd v0,-16(a0) + daddu v0,ta1,ta0 + + + dmultu ta2,a3 + and ta0,a2,MINUS4 + mflo AT + mfhi ta2 + daddu v0,AT + sltu ta3,v0,AT + sd v0,-8(a0) + daddu v0,ta3,ta2 + .set noreorder + bgtzl ta0,.L_bn_mul_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_mul_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_mul_words_return: + jr ra + +.L_bn_mul_words_tail: + dmultu t0,a3 + subu a2,1 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,0(a0) + daddu v0,t1,t0 + beqz a2,.L_bn_mul_words_return + + ld t0,8(a1) + dmultu t0,a3 + subu a2,1 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,8(a0) + daddu v0,t1,t0 + beqz a2,.L_bn_mul_words_return + + ld t0,16(a1) + dmultu t0,a3 + mflo AT + mfhi t0 + daddu v0,AT + sltu t1,v0,AT + sd v0,16(a0) + daddu v0,t1,t0 + jr ra +END(bn_mul_words) + +LEAF(bn_sqr_words) + .align 5 + .set noreorder + bgtzl a2,.L_bn_sqr_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_sqr_words_proceed: + li MINUS4,-4 + and ta0,a2,MINUS4 + move v0,zero + beqz ta0,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + dmultu t0,t0 + ld t2,8(a1) + ld ta0,16(a1) + ld ta2,24(a1) + mflo t1 + mfhi t0 + sd t1,0(a0) + sd t0,8(a0) + + dmultu t2,t2 + subu a2,4 + PTR_ADD a0,64 + PTR_ADD a1,32 + mflo t3 + mfhi t2 + sd t3,-48(a0) + sd t2,-40(a0) + + dmultu ta0,ta0 + mflo ta1 + mfhi ta0 + sd ta1,-32(a0) + sd ta0,-24(a0) + + + dmultu ta2,ta2 + and ta0,a2,MINUS4 + mflo ta3 + mfhi ta2 + sd ta3,-16(a0) + sd ta2,-8(a0) + + .set noreorder + bgtzl ta0,.L_bn_sqr_words_loop + ld t0,0(a1) + + bnezl a2,.L_bn_sqr_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_sqr_words_return: + move v0,zero + jr ra + +.L_bn_sqr_words_tail: + dmultu t0,t0 + subu a2,1 + mflo t1 + mfhi t0 + sd t1,0(a0) + sd t0,8(a0) + beqz a2,.L_bn_sqr_words_return + + ld t0,8(a1) + dmultu t0,t0 + subu a2,1 + mflo t1 + mfhi t0 + sd t1,16(a0) + sd t0,24(a0) + beqz a2,.L_bn_sqr_words_return + + ld t0,16(a1) + dmultu t0,t0 + mflo t1 + mfhi t0 + sd t1,32(a0) + sd t0,40(a0) + jr ra +END(bn_sqr_words) + +LEAF(bn_add_words) + .align 5 + .set noreorder + bgtzl a3,.L_bn_add_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_add_words_proceed: + li MINUS4,-4 + and AT,a3,MINUS4 + move v0,zero + beqz AT,.L_bn_add_words_tail + +.L_bn_add_words_loop: + ld ta0,0(a2) + ld t1,8(a1) + ld ta1,8(a2) + ld t2,16(a1) + ld ta2,16(a2) + ld t3,24(a1) + ld ta3,24(a2) + daddu ta0,t0 + subu a3,4 + sltu t8,ta0,t0 + daddu t0,ta0,v0 + PTR_ADD a0,32 + sltu v0,t0,ta0 + sd t0,-32(a0) + daddu v0,t8 + + daddu ta1,t1 + PTR_ADD a1,32 + sltu t9,ta1,t1 + daddu t1,ta1,v0 + PTR_ADD a2,32 + sltu v0,t1,ta1 + sd t1,-24(a0) + daddu v0,t9 + + daddu ta2,t2 + and AT,a3,MINUS4 + sltu t8,ta2,t2 + daddu t2,ta2,v0 + sltu v0,t2,ta2 + sd t2,-16(a0) + daddu v0,t8 + + daddu ta3,t3 + sltu t9,ta3,t3 + daddu t3,ta3,v0 + sltu v0,t3,ta3 + sd t3,-8(a0) + daddu v0,t9 + + .set noreorder + bgtzl AT,.L_bn_add_words_loop + ld t0,0(a1) + + bnezl a3,.L_bn_add_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_add_words_return: + jr ra + +.L_bn_add_words_tail: + ld ta0,0(a2) + daddu ta0,t0 + subu a3,1 + sltu t8,ta0,t0 + daddu t0,ta0,v0 + sltu v0,t0,ta0 + sd t0,0(a0) + daddu v0,t8 + beqz a3,.L_bn_add_words_return + + ld t1,8(a1) + ld ta1,8(a2) + daddu ta1,t1 + subu a3,1 + sltu t9,ta1,t1 + daddu t1,ta1,v0 + sltu v0,t1,ta1 + sd t1,8(a0) + daddu v0,t9 + beqz a3,.L_bn_add_words_return + + ld t2,16(a1) + ld ta2,16(a2) + daddu ta2,t2 + sltu t8,ta2,t2 + daddu t2,ta2,v0 + sltu v0,t2,ta2 + sd t2,16(a0) + daddu v0,t8 + jr ra +END(bn_add_words) + +LEAF(bn_sub_words) + .align 5 + .set noreorder + bgtzl a3,.L_bn_sub_words_proceed + ld t0,0(a1) + jr ra + move v0,zero + .set reorder + +.L_bn_sub_words_proceed: + li MINUS4,-4 + and AT,a3,MINUS4 + move v0,zero + beqz AT,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + ld ta0,0(a2) + ld t1,8(a1) + ld ta1,8(a2) + ld t2,16(a1) + ld ta2,16(a2) + ld t3,24(a1) + ld ta3,24(a2) + sltu t8,t0,ta0 + dsubu t0,ta0 + subu a3,4 + dsubu ta0,t0,v0 + and AT,a3,MINUS4 + sd ta0,0(a0) + MOVNZ (t0,v0,t8) + + sltu t9,t1,ta1 + dsubu t1,ta1 + PTR_ADD a0,32 + dsubu ta1,t1,v0 + PTR_ADD a1,32 + sd ta1,-24(a0) + MOVNZ (t1,v0,t9) + + + sltu t8,t2,ta2 + dsubu t2,ta2 + dsubu ta2,t2,v0 + PTR_ADD a2,32 + sd ta2,-16(a0) + MOVNZ (t2,v0,t8) + + sltu t9,t3,ta3 + dsubu t3,ta3 + dsubu ta3,t3,v0 + sd ta3,-8(a0) + MOVNZ (t3,v0,t9) + + .set noreorder + bgtzl AT,.L_bn_sub_words_loop + ld t0,0(a1) + + bnezl a3,.L_bn_sub_words_tail + ld t0,0(a1) + .set reorder + +.L_bn_sub_words_return: + jr ra + +.L_bn_sub_words_tail: + ld ta0,0(a2) + subu a3,1 + sltu t8,t0,ta0 + dsubu t0,ta0 + dsubu ta0,t0,v0 + MOVNZ (t0,v0,t8) + sd ta0,0(a0) + beqz a3,.L_bn_sub_words_return + + ld t1,8(a1) + subu a3,1 + ld ta1,8(a2) + sltu t9,t1,ta1 + dsubu t1,ta1 + dsubu ta1,t1,v0 + MOVNZ (t1,v0,t9) + sd ta1,8(a0) + beqz a3,.L_bn_sub_words_return + + ld t2,16(a1) + ld ta2,16(a2) + sltu t8,t2,ta2 + dsubu t2,ta2 + dsubu ta2,t2,v0 + MOVNZ (t2,v0,t8) + sd ta2,16(a0) + jr ra +END(bn_sub_words) + +#undef MINUS4 + +LEAF(bn_div_words) + .align 5 + .set noreorder + bnezl a2,.L_bn_div_words_proceed + move t0,zero + jr ra + li v0,-1 /* I'd rather signal div-by-zero + * which can be done with 'break 7' */ + .set reorder + +.L_bn_div_words_proceed: + bltz a2,.L_bn_div_words_body + .set noreorder + dsll a2,1 + bgtz a2,.-4 + addu t0,1 + .set reorder + negu t1,t0 + li t2,-1 + dsll t2,t1 + and t2,a0 + dsrl AT,a1,t1 + .set noreorder + bnezl t2,.+8 + break 6 /* signal overflow */ + .set reorder + dsll a0,t0 + dsll a1,t0 + or a0,AT + +#define QT ta0 +#define DH ta1 +#define HH ta2 +#define MINUS1 ta3 +.L_bn_div_words_body: + dsrl DH,a2,32 + li v1,2 + sgeu AT,a0,a2 + li MINUS1,-1 + .set noreorder + bnezl AT,.+8 + dsubu a0,a2 + .set reorder + +.L_bn_div_words_outer_loop: + dsrl HH,a0,32 + subu v1,1 + dsrl QT,MINUS1,32 /* q=0xffffffff */ + beq DH,HH,.L_bn_div_words_inner_loop + ddivu zero,a0,DH + mflo QT +.L_bn_div_words_inner_loop: + dmultu a2,QT + dsll t3,a0,32 + dsrl AT,a1,32 + or t3,AT + mflo t0 + mfhi t1 + sltu t2,t3,t0 + seq t8,HH,t1 + sltu AT,HH,t1 + and t2,t8 + or AT,t2 + .set noreorder + bnezl AT,.L_bn_div_words_inner_loop + dsubu QT,1 + .set reorder + + dsubu a0,t3,t0 + beqz v1,.L_bn_div_words_outer_loop_done + + dsll a1,32 + dsll v0,QT,32 + b .L_bn_div_words_outer_loop + +.L_bn_div_words_outer_loop_done: + or v0,QT + move v1,a0 /* v1 contains remainder if one wants it */ + jr ra +#undef MINUS1 +#undef HH +#undef DH +#undef QT +END(bn_div_words) + +#define a_0 t0 +#define a_1 t1 +#define a_2 t2 +#define a_3 t3 +#define b_0 ta0 +#define b_1 ta1 +#define b_2 ta2 +#define b_3 ta3 + +#define a_4 s0 +#define a_5 s2 +#define a_6 s4 +#define a_7 a1 /* once we load a[7] we don't need a anymore */ +#define b_4 s1 +#define b_5 s3 +#define b_6 s5 +#define b_7 a2 /* once we load b[7] we don't need b anymore */ + +#define t_1 t8 +#define t_2 t9 + +#define c_1 v0 +#define c_2 v1 +#define c_3 a3 + +#define FRAME_SIZE 48 + +LEAF(bn_mul_comba8) + .align 5 + .set noreorder + PTR_SUB sp,FRAME_SIZE + .frame sp,64,ra + .set reorder + ld a_0,0(a1) /* If compiled with -mips3 options + * assembler barks on this line with + * "shouldn't have mult/div as last + * instruction in bb (R10K bug)" + * warning. If anybody out there has + * a clue on what does "bb" mean and + * how to circumvent this do send me + * a note. + * + */ + ld b_0,0(a2) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + ld b_1,8(a2) + ld b_2,16(a2) + ld b_3,24(a2) + dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + sd s0,0(sp) + sd s1,8(sp) + sd s2,16(sp) + sd s3,24(sp) + sd s4,32(sp) + sd s5,40(sp) + mflo c_1 + mfhi c_2 + + dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ + ld a_4,32(a1) + ld a_5,40(a1) + ld a_6,48(a1) + ld a_7,56(a1) + ld b_4,32(a2) + ld b_5,40(a2) + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ + ld b_6,48(a2) + ld b_7,56(a2) + sd c_1,0(a0) /* r[0]=c1; */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) /* r[1]=c2; */ + + dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) /* r[2]=c3; */ + + dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) /* r[3]=c1; */ + + dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) /* r[4]=c2; */ + + dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) /* r[5]=c3; */ + + dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,48(a0) /* r[6]=c1; */ + + dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,56(a0) /* r[7]=c2; */ + + dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,64(a0) /* r[8]=c3; */ + + dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,72(a0) /* r[9]=c1; */ + + dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,80(a0) /* r[10]=c2; */ + + dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,88(a0) /* r[11]=c3; */ + + dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,96(a0) /* r[12]=c1; */ + + dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,104(a0) /* r[13]=c2; */ + + dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ + ld s0,0(sp) + ld s1,8(sp) + ld s2,16(sp) + ld s3,24(sp) + ld s4,32(sp) + ld s5,40(sp) + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sd c_3,112(a0) /* r[14]=c3; */ + sd c_1,120(a0) /* r[15]=c1; */ + + PTR_ADD sp,FRAME_SIZE + + jr ra +END(bn_mul_comba8) + +LEAF(bn_mul_comba4) + .align 5 + .set reorder + ld a_0,0(a1) + ld b_0,0(a2) + ld a_1,8(a1) + ld a_2,16(a1) + dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + ld a_3,24(a1) + ld b_1,8(a2) + ld b_2,16(a2) + ld b_3,24(a2) + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) + + dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + sd c_3,40(a0) + + dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sd c_1,48(a0) + sd c_2,56(a0) + + jr ra +END(bn_mul_comba4) + +#undef a_4 +#undef a_5 +#undef a_6 +#undef a_7 +#define a_4 b_0 +#define a_5 b_1 +#define a_6 b_2 +#define a_7 b_3 + +LEAF(bn_sqr_comba8) + .align 5 + .set reorder + ld a_0,0(a1) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + + dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + ld a_4,32(a1) + ld a_5,40(a1) + ld a_6,48(a1) + ld a_7,56(a1) + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) + + dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + sltu AT,c_2,a2 + daddu c_3,AT + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + sltu AT,c_3,a2 + daddu c_1,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + sltu AT,c_1,a2 + daddu c_2,AT + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + sltu AT,c_1,a2 + daddu c_2,AT + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,40(a0) + + dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + sltu AT,c_2,a2 + daddu c_3,AT + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + sltu AT,c_2,a2 + daddu c_3,AT + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,48(a0) + + dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + sltu AT,c_3,a2 + daddu c_1,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + sltu AT,c_3,a2 + daddu c_1,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + sltu AT,c_3,a2 + daddu c_1,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,56(a0) + + dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + sltu AT,c_1,a2 + daddu c_2,AT + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + sltu AT,c_1,a2 + daddu c_2,AT + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,64(a0) + + dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + sltu AT,c_2,a2 + daddu c_3,AT + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + sltu AT,c_2,a2 + daddu c_3,AT + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,72(a0) + + dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + sltu AT,c_3,a2 + daddu c_1,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,80(a0) + + dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + sltu AT,c_1,a2 + daddu c_2,AT + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,88(a0) + + dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,96(a0) + + dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,104(a0) + + dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sd c_3,112(a0) + sd c_1,120(a0) + + jr ra +END(bn_sqr_comba8) + +LEAF(bn_sqr_comba4) + .align 5 + .set reorder + ld a_0,0(a1) + ld a_1,8(a1) + ld a_2,16(a1) + ld a_3,24(a1) + dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ + mflo c_1 + mfhi c_2 + sd c_1,0(a0) + + dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu c_3,t_2,AT + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + sd c_2,8(a0) + + dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu AT,c_1,t_2 + daddu c_2,AT + sd c_3,16(a0) + + dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu c_3,c_2,t_2 + dmultu a_1,a_2 /* mul_add_c(a2[1],b[2],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu a2,t_2,AT + daddu c_2,a2 + sltu AT,c_2,a2 + daddu c_3,AT + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sltu AT,c_2,t_2 + daddu c_3,AT + sd c_1,24(a0) + + dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu a2,t_2,AT + daddu c_3,a2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu c_1,c_3,t_2 + dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ + mflo t_1 + mfhi t_2 + daddu c_2,t_1 + sltu AT,c_2,t_1 + daddu t_2,AT + daddu c_3,t_2 + sltu AT,c_3,t_2 + daddu c_1,AT + sd c_2,32(a0) + + dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ + mflo t_1 + mfhi t_2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu a2,t_2,AT + daddu c_1,a2 + daddu c_3,t_1 + sltu AT,c_3,t_1 + daddu t_2,AT + daddu c_1,t_2 + sltu c_2,c_1,t_2 + sd c_3,40(a0) + + dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ + mflo t_1 + mfhi t_2 + daddu c_1,t_1 + sltu AT,c_1,t_1 + daddu t_2,AT + daddu c_2,t_2 + sd c_1,48(a0) + sd c_2,56(a0) + + jr ra +END(bn_sqr_comba4) -- GitLab