Lines Matching +full:1 +full:br +full:- +full:10
2 # Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
20 # faster than gcc and >60%(!) faster than code generated by HP-UX
21 # compiler (yes, HP-UX is generating slower code, because unlike gcc,
23 # substitutes for 64-bit rotate).
26 # and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
28 # too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
29 # 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
30 # are spent on extra work to provide for 32-bit rotations. 32-bit
32 # reason lower 32 bits are deposited to upper half of 64-bit register
36 # as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
37 # 32-bit unsigned right shift," 'pshr4.u' instructions here.
43 # split [at run-time if they have to]. But note that variable and
44 # parallel shifts are performed by multi-media ALU and *are* pairable
51 # latencies get "hidden" in instruction-level parallelism.
53 # (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
55 # because on Itanium 1 stall on MM result is accompanied by
56 # pipeline flush, which takes 6 cycles:-(
60 # Improve performance by 15-20%. Note about "rules of engagement"
63 # by ~10%.
67 # pre-9000 series [little-endian] system:
92 @sigma0=(1, 8, 7);
107 @sigma1=(17,19,10);
115 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
117 for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
118 $big_endian=0 if (/\-DL_ENDIAN/); }
120 { $big_endian=(unpack('L',pack('N',1))==1); }
124 .ident \"IA-64 ISA artwork by Andy Polyakov <appro\@openssl.org>\"
137 ctx=r31; // 1st arg
151 $ADDP ctx=0,r32 // 1st arg
161 add r9=1*$SZ,ctx }
165 // load A-H
175 add Ktbl=($TABLE#-.Lpic_point),Ktbl }
183 cmp.eq p9,p0=1,r8 }
194 A=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7]
208 add r9=1-$SZ,input
209 brp.loop.imp .L_first16,.L_first16_end-16 };;
213 { .mmi; add r8=1*$SZ,input
218 (p9) br.cond.dpnt.many .L1byte };;
221 (p10) br.cond.dpnt.many .L2byte };;
223 $LDW X[10]=[r8],4*$SZ
224 (p11) br.cond.dpnt.many .L3byte };;
227 (p12) br.cond.dpnt.many .L4byte };;
230 (p13) br.cond.dpnt.many .L5byte };;
233 (p14) br.cond.dpnt.many .L6byte };;
236 (p15) br.cond.dpnt.many .L7byte };;
237 { .mmb; $LDW X[ 1]=[r9],4*$SZ
240 mux1 X[15]=X[15],\@rev // eliminated on big-endian
241 br.many .L_first16 };;
247 $LDW X[10]=[r8],4*$SZ
257 shrp X[11]=X[11],X[10],56 };;
260 shrp X[10]=X[10],X[ 9],56 }
261 { .mmi; $LDW X[ 1]=[r9],4*$SZ
271 { .mii; shrp X[ 2]=X[ 2],X[ 1],56
272 shrp X[ 1]=X[ 1],X[ 0],56 }
275 mux1 X[15]=X[15],\@rev // eliminated on big-endian
276 br.many .L_first16 };;
279 $LDW X[10]=[r8],4*$SZ
292 shrp X[11]=X[11],X[10],48 }
293 { .mmi; $LDW X[ 1]=[r9],4*$SZ
295 shrp X[10]=X[10],X[ 9],48 };;
304 shrp X[ 2]=X[ 2],X[ 1],48 }
305 { .mii; shrp X[ 1]=X[ 1],X[ 0],48
308 mux1 X[15]=X[15],\@rev // eliminated on big-endian
309 br.many .L_first16 };;
323 { .mmi; $LDW X[ 1]=[r9],4*$SZ
325 shrp X[11]=X[11],X[10],40 };;
327 shrp X[10]=X[10],X[ 9],40
335 { .mii; shrp X[ 2]=X[ 2],X[ 1],40
336 shrp X[ 1]=X[ 1],X[ 0],40 }
339 mux1 X[15]=X[15],\@rev // eliminated on big-endian
340 br.many .L_first16 };;
351 { .mmi; $LDW X[ 1]=[r9],4*$SZ
355 shrp X[11]=X[11],X[10],32
356 shrp X[10]=X[10],X[ 9],32 }
364 shrp X[ 2]=X[ 2],X[ 1],32 }
365 { .mii; shrp X[ 1]=X[ 1],X[ 0],32
368 mux1 X[15]=X[15],\@rev // eliminated on big-endian
369 br.many .L_first16 };;
377 { .mmi; $LDW X[ 1]=[r9],4*$SZ
382 shrp X[11]=X[11],X[10],24 }
383 { .mii; shrp X[10]=X[10],X[ 9],24
391 { .mii; shrp X[ 2]=X[ 2],X[ 1],24
392 shrp X[ 1]=X[ 1],X[ 0],24 }
395 mux1 X[15]=X[15],\@rev // eliminated on big-endian
396 br.many .L_first16 };;
401 { .mmi; $LDW X[ 1]=[r9],4*$SZ
407 { .mii; shrp X[11]=X[11],X[10],16
408 shrp X[10]=X[10],X[ 9],16 };;
416 shrp X[ 2]=X[ 2],X[ 1],16 }
417 { .mii; shrp X[ 1]=X[ 1],X[ 0],16
420 mux1 X[15]=X[15],\@rev // eliminated on big-endian
421 br.many .L_first16 };;
423 { .mmi; $LDW X[ 1]=[r9],4*$SZ
430 shrp X[11]=X[11],X[10],8 };;
431 { .mii; shrp X[10]=X[10],X[ 9],8
439 { .mii; shrp X[ 2]=X[ 2],X[ 1],8
440 shrp X[ 1]=X[ 1],X[ 0],8 }
443 mux1 X[15]=X[15],\@rev };; // eliminated on big-endian
452 (p16) mux1 X[14]=X[14],\@rev };; // eliminated on big-endian
455 _rotr r11=$t1,$Sigma1[1] } // ROTR(e,41)
463 add r10=2-$SZ,input
464 add r11=3-$SZ,input };;
480 { .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
484 _rotr r11=$t1,$Sigma1[1] };; // ROTR(e,18)
496 _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
503 { .mib; (p16) add r9=1-$SZ,input // not used in sha512
505 br.ctop.sptk .L_first16 };;
508 { .mib; mov ar.lc=$rounds-17
509 brp.loop.imp .L_rest,.L_rest_end-16 }
510 { .mib; mov ar.ec=1
511 br.many .L_rest };;
517 _rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
518 { .mmi; add X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
519 $SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
521 _rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
523 $SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
524 // Pair of mmi; splits on Itanium 1 and prevents pipeline flush
528 _rotr r10=X[15-14],$sigma1[0] }// ROTR(s1,19)
531 _rotr r11=X[15-14],$sigma1[1] };;// ROTR(s1,61)
534 { .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
541 // Pair of mmi; splits on Itanium 1 and prevents pipeline flush
544 shrp r8=E,$t1,32+$Sigma1[1]} // ROTR(e,18)
550 { .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
555 _rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
563 add X[15]=X[15],s0 };; // X[i]+=sigma0(X[i+1])
569 _rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
577 br.ctop.sptk .L_rest };;
586 cmp.ltu p16,p0=1,num };;
590 { .mmb; add Ktbl=-$SZ*$rounds,Ktbl
591 (p16) add num=-1,num
592 (p16) br.dptk.many .L_outer };;
595 add r9=1*$SZ,ctx }
608 br.ret.sptk.many b0 };;
613 s/\`([^\`]*)\`/eval $1/gem;
614 s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
616 s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/gm;
617 s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
618 s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
620 s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;