#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from config
# line results in ~40% improvement (yes, even for C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* config
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
# I simply 'inc %r8b'. Even though optimization manual discourages
# to operate on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...

# As was shown by Marc Bevand reordering of couple of load operations
# results in even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# Latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.

# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code, which would perform even within 30% marginal
# on either AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...

# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...

# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
# paths, AMD64 and EM64T, reorder loads in essentially same manner
# as my IA-64 implementation. On Opteron this resulted in modest 5%
# improvement [I had to test it], while final Intel P4 performance
# achieves respectful 432MBps on 2.8GHz processor now. For reference.
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
# slower than the RC4_INT one [meaning that if CPU micro-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].

# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
# cloop1 boosts its performance by 80%! This loop appears to be optimal
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.

#
# OpenSolaris OS modifications
#
# Sun elects to use this software under the BSD license.
#
# This source originates from OpenSSL file rc4-x86_64.pl at
# ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
# (presumably for future OpenSSL release 0.9.8h), with these changes:
#
# 1. Added some comments, "use strict", and declared all variables.
#
# 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
# /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
#
# 3. Changed function name from RC4() to arcfour_crypt() and RC4_set_key()
# to arcfour_key_init(), and changed the parameter order for both to that
# used by OpenSolaris.
#
# 4. The current method of using cpuid feature bits 20 (NX) or 28 (HTT) from
# function OPENSSL_ia32_cpuid() to distinguish Intel/AMD does not work for
# some newer AMD64 processors, as these bits are set on both Intel EM64T
# processors and newer AMD64 processors. I replaced this with code to use CPUID
# instruction subfunction EAX=0 to determine if we're running on "GenuineIntel"
# or not. The result decides whether to use a
# * 1-byte key array (label .LRC4_CHAR, optimal on Intel EM64T) or a
# * 4-byte key array (Labels .Lloop1 and .Lloop8, optimal on AMD64).
#
# 5. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) assemblers).
#
# 6. Removed Lcloop8 code (slower than Lcloop1 on EM64T and not used on AMD64).
#

use strict;
use warnings;

#
# Usage: rc4-x86_64.pl [output-file]
#
# The generated assembly source is accumulated in $code and printed at the
# very end.  Register names are held in Perl variables and interpolated into
# the here-documents below.  A register written as "REG#b", "REG#w" or
# "REG#d" in the templates is turned into the sub-register name ("REGb",
# "REGw", "REGd") by the substitution just before printing.
#
my ($code, $dat, $inp, $out, $len, $idx, $ido, @XX, @TX, $YY, $TY);

# Redirect STDOUT to the requested file.  When no file name is given the
# assembly is written to the existing STDOUT instead of being silently lost.
my $output = shift;
if (defined $output && length $output) {
    open STDOUT, '>', $output
        or die "Cannot open '$output' for writing: $!";
}

#
# Parameters
#

# OpenSSL:
# void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
#		unsigned char *outdata);
#$dat="%rdi";	# arg1
#$len="%rsi";	# arg2
#$inp="%rdx";	# arg3
#$out="%rcx";	# arg4

# OpenSolaris:
# void arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len);
$dat="%rdi";	# arg1
$inp="%rsi";	# arg2
$out="%rdx";	# arg3
$len="%rcx";	# arg4

#
# Register variables
#
# $XX[0] is key->i (aka key->x), $XX[1] is a temporary.
# $TX[0] and $TX[1] are temporaries.
# $YY is key->j (aka key->y).
# $TY is a temporary.
#
@XX=("%r8","%r10");
@TX=("%r9","%r11");
$YY="%r12";
$TY="%r13";

#
# arcfour_crypt(): prologue and dispatch.
#
# The emitted code loads key->i from -8($dat) and key->j from -4($dat) after
# advancing $dat by 8, then tests the 32-bit word at 256($dat): a value of -1
# selects the 1-byte array path (.LRC4_CHAR), anything else the 4-byte path.
#
$code=<<___;
#if !defined(lint) && !defined(__lint)

	.ident "%Z%%M% %I% %E% SMI"

#include <sys/asm_linkage.h>


ENTRY_NP(arcfour_crypt)
	/* EXPORT DELETE START */

	or	$len,$len # If (len == 0) return
	jne	.Lentry
	ret
.Lentry:
	push	%r12
	push	%r13

	/ Set $dat to beginning of array, key->arr[0]
	add	\$8,$dat
	/ Get key->i
	movl	-8($dat),$XX[0]#d
	/ Get key->j
	movl	-4($dat),$YY#d

	/
	/ Use a 1-byte data array, on Intel P4 EM64T,
	/ which is more efficient there,
	/ or a 4-byte data array (for AMD AMD64).
	/

	/ If RC4_CHAR flag set (Intel EM64T), then use 1-byte array
	cmpl	\$-1,256($dat)
	je	.LRC4_CHAR
	/ otherwise use 4-byte integer array (AMD64)
	inc	$XX[0]#b
	movl	($dat,$XX[0],4),$TX[0]#d
	test	\$-8,$len
	jz	.Lloop1
	jmp	.Lloop8

.align	16
.Lloop8:
	/
	/ This code is for use with a 4-byte integer data array, which is
	/ more efficient on AMD64 Athlon and Opteron-class processors.
	/
___

#
# Body of .Lloop8: eight unrolled copies of the per-byte step, each one
# depositing its output byte in %al and rotating %rax by 8 bits so that the
# eight bytes are XORed against the input as one 64-bit word afterwards.
#
# After every copy the two-element @XX and @TX lists are rotated, so the
# registers that were preloaded as "next" in one copy are used as "current"
# in the following one.  Eight rotations of two-element lists restore the
# original order for the code emitted after the loop.
#
# NOTE: $i is interpolated into the emitted "# ror is redundant ..." comment,
# so the generated text reads "when 0=0", "when 1=0", and so on.  This is
# cosmetic only and is kept as in the original.
#
for my $i (0 .. 7) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	mov	$XX[0],$XX[1]
	movl	($dat,$YY,4),$TY#d
	ror	\$8,%rax # ror is redundant when $i=0
	inc	$XX[1]#b
	movl	($dat,$XX[1],4),$TX[1]#d
	cmp	$XX[1],$YY
	movl	$TX[0]#d,($dat,$YY,4)
	cmove	$TX[0],$TX[1]
	movl	$TY#d,($dat,$XX[0],4)
	add	$TX[0]#b,$TY#b
	movb	($dat,$TY,4),%al
___
	push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}

#
# Tail of .Lloop8, the common exit code, the byte-at-a-time 4-byte-array loop
# (.Lloop1) and the 1-byte-array loop (.Lcloop1).
#
$code.=<<___;
	ror	\$8,%rax
	sub	\$8,$len

	xor	($inp),%rax
	add	\$8,$inp
	mov	%rax,($out)
	add	\$8,$out

	test	\$-8,$len
	jnz	.Lloop8
	cmp	\$0,$len
	jne	.Lloop1
.Lexit:
	/
	/ Cleanup and exit code
	/
	/ --i to undo ++i done at entry
	sub	\$1,$XX[0]#b
	/ set key->i
	movl	$XX[0]#d,-8($dat)
	/ set key->j
	movl	$YY#d,-4($dat)

	pop	%r13
	pop	%r12
	ret
.align	16
.Lloop1:
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	inc	$inp
	movb	$TY#b,($out)
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit


.align	16
.LRC4_CHAR:
	/
	/ This code is for use with a 1-byte integer data array, which is
	/ more efficient on Intel P4 EM64T-class processors.
	/
	add	\$1,$XX[0]#b
	movzb	($dat,$XX[0]),$TX[0]#d
	jmp	.Lcloop1

.align	16
.Lcloop1:
	add	$TX[0]#b,$YY#b
	movzb	($dat,$YY),$TY#d
	movb	$TX[0]#b,($dat,$YY)
	movb	$TY#b,($dat,$XX[0])
	add	$TX[0]#b,$TY#b
	add	\$1,$XX[0]#b
	/ Intel Optimization (preload $TY and $XX[0]):
	movzb	$TY#b,$TY#d
	movzb	$XX[0]#b,$XX[0]#d
	movzb	($dat,$TY),$TY#d
	movzb	($dat,$XX[0]),$TX[0]#d
	xorb	($inp),$TY#b
	lea	1($inp),$inp
	movb	$TY#b,($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lcloop1
	jmp	.Lexit

	/* EXPORT DELETE END */
	ret
SET_SIZE(arcfour_crypt)
___


#
# Parameters
#

# OpenSSL:
# void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
#$dat="%rdi";	# arg1
#$len="%rsi";	# arg2
#$inp="%rdx";	# arg3

# OpenSolaris:
# void arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen);
$dat="%rdi";	# arg1
$inp="%rsi";	# arg2
$len="%rdx";	# arg3

# Temporaries
$idx="%r8";
$ido="%r9";

#
# arcfour_key_init(): calls arcfour_crypt_on_intel() and, depending on the
# result in %eax, fills either a 4-byte-per-entry array (.Lw1stloop and
# .Lw2ndloop) or a 1-byte-per-entry array (.Lc1stloop and .Lc2ndloop).  The
# 1-byte path finishes by storing -1 at 256($dat), which is the marker that
# arcfour_crypt() tests to choose its code path.  Both paths end by zeroing
# the two 32-bit words stored just before the array.
#
$code.=<<___;
	/ int arcfour_crypt_on_intel(void);
.extern	arcfour_crypt_on_intel

ENTRY_NP(arcfour_key_init)
	/* EXPORT DELETE START */

	/ Find out if we're running on Intel or something else (e.g., AMD64).
	/ This sets %eax to 1 for Intel, otherwise 0.
	push	%rdi		/ Save arg1
	push	%rsi		/ Save arg2
	push	%rdx		/ Save arg3
	call	arcfour_crypt_on_intel
	pop	%rdx		/ Restore arg3
	pop	%rsi		/ Restore arg2
	pop	%rdi		/ Restore arg1

	/ Set $dat to beginning of array, key->arr[0]
	lea	8($dat),$dat
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx
	/ Zeroed below, as %eax contains a flag from arcfour_crypt_on_intel():
	/xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11

	/
	/ Use a 1-byte data array, on Intel P4 EM64T,
	/ which is more efficient there,
	/ or a 4-byte data array (for AMD AMD64).
	/
	cmp	\$1,%eax	/ Test if Intel
	mov	\$0,%eax	/ Zero eax without modifying flags
	je	.Lc1stloop	/ If Intel then use a 1-byte array,
	jmp	.Lw1stloop	/ otherwise use a 4-byte array.

.align	16
.Lw1stloop:
	/ AMD64 (4-byte array)
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lw2ndloop:
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	add	\$1,$ido#b
	jnc	.Lw2ndloop
	jmp	.Lexit_key

.align	16
.Lc1stloop:
	/ Intel EM64T (1-byte array)
	mov	%al,($dat,%rax)
	add	\$1,%al
	jnc	.Lc1stloop

	xor	$ido,$ido
	xor	$idx,$idx
.align	16
.Lc2ndloop:
	mov	($dat,$ido),%r10b
	add	($inp,$len),$idx#b
	add	%r10b,$idx#b
	add	\$1,$len
	mov	($dat,$idx),%r11b
	jnz	.Lcnowrap
	mov	%rcx,$len
.Lcnowrap:
	mov	%r10b,($dat,$idx)
	mov	%r11b,($dat,$ido)
	add	\$1,$ido#b
	jnc	.Lc2ndloop
	movl	\$-1,256($dat)

.align	16
.Lexit_key:
	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)

	/* EXPORT DELETE END */
	ret
SET_SIZE(arcfour_key_init)
.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

#else
	/* LINTED */
	/* Nothing to be linted in this file--it's pure assembly source. */
#endif /* !lint && !__lint */
___

# Turn the "REG#b" / "REG#w" / "REG#d" notation used in the templates into
# real sub-register names, e.g. "%r8#b" becomes "%r8b".
$code =~ s/#([bwd])/$1/gm;

# Buffered write errors (e.g. a full disk) only surface at print/close time,
# so check both rather than leaving a truncated assembly file behind.
print $code or die "Cannot write generated assembly: $!";

close STDOUT or die "Cannot close output: $!";