xref: /titanic_44/usr/src/common/crypto/arcfour/amd64/arcfour-x86_64.pl (revision 694c35faa87b858ecdadfe4fc592615f4eefbb07)
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't account for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from the config
# line results in ~40% improvement (yes, even for the C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* a config
# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
# register arithmetic. For example instead of 'inc %r8; and $255,%r8'
# I simply 'inc %r8b'. Even though the optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...

# As was shown by Marc Bevand, reordering a couple of load operations
# results in an even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# The latter means that if you want to *estimate* what to expect from
# *your* Opteron, then multiply 54 by 3.3 and the clock frequency in GHz.

# Intel P4 EM64T core was found to run the AMD64 code really slowly...
# The only way to achieve comparable performance on P4 was to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code which would perform even within a 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...

# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in a 50% performance improvement of the
# folded loop...

# As was shown by Zou Nanhai, loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
# paths, AMD64 and EM64T, reorder loads in essentially the same manner
# as my IA-64 implementation. On Opteron this resulted in a modest 5%
# improvement [I had to test it], while final Intel P4 performance
# achieves a respectable 432MBps on a 2.8GHz processor now. For reference:
# if executed on Xeon, the current RC4_CHAR code-path is 2.7x faster than
# the RC4_INT code-path, while if executed on Opteron, it's only 25%
# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].

# Intel Core2 was observed to perform poorly on both code paths:-( It
# apparently suffers from some kind of partial register stall, which
# occurs in 64-bit mode only [as a virtually identical 32-bit loop was
# observed to outperform the 64-bit one by almost 50%]. Adding two movzb
# to cloop1 boosts its performance by 80%! This loop appears to be the
# optimal fit for Core2 and therefore the code was modified to skip
# cloop8 on this CPU.

#
# OpenSolaris OS modifications
#
# Sun elects to use this software under the BSD license.
#
# This source originates from OpenSSL file rc4-x86_64.pl at
# ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
# (presumably for future OpenSSL release 0.9.8h), with these changes:
#
# 1. Added some comments, "use strict", and declared all variables.
#
# 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
# /usr/include/sys/asm_linkage.h.
#
# 3. Changed function name from RC4() to arcfour_crypt_asm() and RC4_set_key()
# to arcfour_key_init(), and changed the parameter order for both to that
# used by OpenSolaris.
#
# 4. The current method of using cpuid feature bits 20 (NX) or 28 (HTT) from
# function OPENSSL_ia32_cpuid() to distinguish Intel/AMD does not work for
# some newer AMD64 processors, as these bits are set on both Intel EM64T
# processors and newer AMD64 processors.  I replaced this with C code
# (function arcfour_crypt_on_intel()) to call cpuid_getvendor()
# when executing in the kernel and getisax() when executing in userland.
#
# 5. Set a new field in the key structure, key->flag to 0 for AMD AMD64
# and 1 for Intel EM64T.  This is to select the most-efficient arcfour_crypt()
# function to use.
#
# 6. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) assemblers).
#
# 7. Removed unused RC4_CHAR, Lcloop1, and Lcloop8 code.
#
# 8. Added C function definitions for use by lint(1B).
#

use strict;

# Shared by both generator sections below: $code accumulates the generated
# text; the remaining scalars and arrays hold register names that are
# interpolated into it.
my ($code, $dat, $inp, $out, $len, $idx, $ido, $i, @XX, @TX, $YY, $TY);

# The single command-line argument names the file to generate.  STDOUT is
# redirected to it, so everything printed later lands in that file.
# Fail loudly instead of silently producing nothing when the argument is
# missing or the file cannot be created.
my $output = shift;
defined($output) or die "usage: $0 output-file\n";
open(STDOUT, '>', $output) or die "$0: cannot open $output: $!\n";

#
# arcfour_crypt_asm() generator
#
# Emits the lint(1B) stubs for both functions, then the assembly for
#
#	void arcfour_crypt_asm(ARCFour_key *key, uchar_t *in, uchar_t *out,
#	    size_t len);
#
# Key structure layout relied upon by the emitted code:
#	offset 0	key->i (4 bytes)
#	offset 4	key->j (4 bytes)
#	offset 8	key->arr[], 4-byte elements
#
# Register operands written as "<64-bit name>#b" or "#d" in the templates
# are turned into the byte/dword sub-register names by the substitution at
# the end of this script.
#

#
# Parameters
#

# OpenSSL:
# void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
#	unsigned char *outdata);
#$dat="%rdi";	    # arg1
#$len="%rsi";	    # arg2
#$inp="%rdx";	    # arg3
#$out="%rcx";	    # arg4

# OpenSolaris:
# void arcfour_crypt_asm(ARCFour_key *key, uchar_t *in, uchar_t *out,
#	size_t len);
$dat="%rdi";	    # arg1
$inp="%rsi";	    # arg2
$out="%rdx";	    # arg3
$len="%rcx";	    # arg4

#
# Register variables
#
# $XX[0] is key->i (aka key->x), $XX[1] is a temporary.
# $TX[0] and $TX[1] are temporaries.
# $YY is key->j (aka key->y).
# $TY is a temporary.
#
# %r12 and %r13 are pushed on entry and popped at .Lexit.
#
@XX=("%r8","%r10");
@TX=("%r9","%r11");
$YY="%r12";
$TY="%r13";

$code=<<___;
#if defined(lint) || defined(__lint)

#include "arcfour.h"

/* ARGSUSED */
void
arcfour_crypt_asm(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{}

/* ARGSUSED */
void
arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
{}

#else
#include <sys/asm_linkage.h>

ENTRY_NP(arcfour_crypt_asm)
	or	$len,$len # If (len == 0) return
	jne	.Lentry
	ret
.Lentry:
	/ %r12 and %r13 are callee-saved, so preserve them for the caller
	push	%r12
	push	%r13

	/ Set $dat to beginning of array, key->arr[0]
	add	\$8,$dat
	/ Get key->i
	movl	-8($dat),$XX[0]#d
	/ Get key->j
	movl	-4($dat),$YY#d

	/
	/ Use a 4-byte key schedule element array
	/
	/ Pre-increment i and preload arr[i]. Both loops below expect this,
	/ and .Lexit undoes the increment before storing i back.
	inc	$XX[0]#b
	movl	($dat,$XX[0],4),$TX[0]#d
	/ Fewer than 8 bytes to process: go straight to the byte loop
	test	\$-8,$len
	jz	.Lloop1
	jmp	.Lloop8

.align	16
.Lloop8:
___

#
# Unrolled main loop: eight RC4 steps per pass.  Each step
#   - adds the preloaded arr[i] ($TX[0]) to j ($YY) and fetches arr[j] ($TY),
#   - rotates %rax by 8 bits to make room for the next keystream byte,
#   - computes i + 1 in $XX[1] and preloads arr[i + 1] into $TX[1],
#   - swaps arr[i] and arr[j]; when j == i + 1 the store to arr[j] changes
#     the element that was just preloaded, so cmove replaces the preloaded
#     copy with the value being stored,
#   - loads the keystream byte arr[(arr[i] + arr[j]) mod 256] into %al.
# After every step the two-element @XX and @TX lists are rotated, so the
# "next" registers of one step are the "current" registers of the following
# step.  Eight rotations leave both lists in their initial order.
#
for ($i=0;$i<8;$i++) {
$code.=<<___;
	add	$TX[0]#b,$YY#b
	mov	$XX[0],$XX[1]
	movl	($dat,$YY,4),$TY#d
	ror	\$8,%rax			# ror is redundant in the first step
	inc	$XX[1]#b
	movl	($dat,$XX[1],4),$TX[1]#d
	cmp	$XX[1],$YY
	movl	$TX[0]#d,($dat,$YY,4)
	cmove	$TX[0],$TX[1]
	movl	$TY#d,($dat,$XX[0],4)
	add	$TX[0]#b,$TY#b
	movb	($dat,$TY,4),%al
___
push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
}

$code.=<<___;
	/ Final rotate brings the first keystream byte back to the low byte
	ror	\$8,%rax
	sub	\$8,$len

	/ XOR 8 input bytes with the 8 keystream bytes in one go
	xor	($inp),%rax
	add	\$8,$inp
	mov	%rax,($out)
	add	\$8,$out

	test	\$-8,$len
	jnz	.Lloop8
	/ Fewer than 8 bytes are left. Finish them in the byte loop, if any.
	cmp	\$0,$len
	jne	.Lloop1

.Lexit:
	/
	/ Cleanup and exit code
	/
	/ --i to undo ++i done at entry
	sub	\$1,$XX[0]#b
	/ set key->i
	movl	$XX[0]#d,-8($dat)
	/ set key->j
	movl	$YY#d,-4($dat)

	pop	%r13
	pop	%r12
	ret

.align	16
.Lloop1:
	/ Byte-at-a-time loop, used for requests shorter than 8 bytes and
	/ for whatever is left after the unrolled loop.
	/ j += arr[i], then swap arr[i] and arr[j]
	add	$TX[0]#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX[0]#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX[0],4)
	/ Keystream byte is arr[(arr[i] + arr[j]) mod 256]
	add	$TY#b,$TX[0]#b
	inc	$XX[0]#b
	movl	($dat,$TX[0],4),$TY#d
	/ Preload arr[i] for the next pass
	movl	($dat,$XX[0],4),$TX[0]#d
	xorb	($inp),$TY#b
	inc	$inp
	movb	$TY#b,($out)
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit
SET_SIZE(arcfour_crypt_asm)
___


#
# arcfour_key_init() generator
#
# Emits the assembly for
#
#	void arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen);
#
# The emitted function stores the result of arcfour_crypt_on_intel() in
# key->flag, fills key->arr[] with 0..255, mixes in the key bytes (starting
# over at the first key byte whenever the end of the key is reached), and
# finally zeroes key->i and key->j.
#
# NOTE(review): keyvallen == 0 is not checked.  The mixing loop would then
# never wrap its key index and would read past keyval; callers presumably
# guarantee a non-zero length -- confirm.
#

#
# Parameters
#

# OpenSSL:
# void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
#$dat="%rdi";	    # arg1
#$len="%rsi";	    # arg2
#$inp="%rdx";	    # arg3

# OpenSolaris:
# void arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen);
$dat="%rdi";	    # arg1
$inp="%rsi";	    # arg2
$len="%rdx";	    # arg3

# Low 32 bits of $len, i.e. the part that actually carries the int argument.
my $len32="%edx";

# Temporaries
$idx="%r8";
$ido="%r9";

$code.=<<___;
	/ int arcfour_crypt_on_intel(void);
.extern	arcfour_crypt_on_intel

ENTRY_NP(arcfour_key_init)
	/ Find out if we're running on Intel or something else (e.g., AMD64).
	/ This sets %eax to 1 for Intel, otherwise 0.
	/ The argument registers are caller-saved, so keep them on the stack
	/ across the call. Three pushes plus the return address also leave
	/ the stack 16-byte aligned at the call.
	push	%rdi		/ Save arg1
	push	%rsi		/ Save arg2
	push	%rdx		/ Save arg3
	call	arcfour_crypt_on_intel
	pop	%rdx		/ Restore arg3
	pop	%rsi		/ Restore arg2
	pop	%rdi		/ Restore arg1
	/ Save return value in key->flag (1=Intel, 0=AMD)
	/ Offset 1032 is 8 plus 256 elements of 4 bytes, right after the array
	movl	%eax,1032($dat)

	/ keyvallen is a 32-bit int and the upper half of its argument
	/ register is undefined, so widen it before the 64-bit address
	/ arithmetic below.
	movslq	$len32,$len

	/ Set $dat to beginning of array, key->arr[0]
	lea	8($dat),$dat
	/ Point $inp just past the key and index it with a negative offset
	/ that counts up to zero. %rcx keeps the start value for wrapping.
	lea	($inp,$len),$inp
	neg	$len
	mov	$len,%rcx

	xor	%eax,%eax
	xor	$ido,$ido
	xor	%r10,%r10
	xor	%r11,%r11

	/ Use a 4-byte data array
	jmp	.Lw1stloop

.align	16
.Lw1stloop:
	/ AMD64 (4-byte array)
	/ arr[n] = n for n = 0..255. The loop ends when %al wraps to 0.
	mov	%eax,($dat,%rax,4)
	add	\$1,%al
	jnc	.Lw1stloop

	xor	$ido,$ido
	xor	$idx,$idx

.align	16
.Lw2ndloop:
	/ j += key byte + arr[i]
	mov	($dat,$ido,4),%r10d
	add	($inp,$len,1),$idx#b
	add	%r10b,$idx#b
	/ Advance the key index. The moves below leave the flags alone, so
	/ cmovz restarts at the first key byte once the index reaches zero.
	add	\$1,$len
	mov	($dat,$idx,4),%r11d
	cmovz	%rcx,$len
	/ Swap arr[i] and arr[j]
	mov	%r10d,($dat,$idx,4)
	mov	%r11d,($dat,$ido,4)
	/ i++ and stop once it wraps from 255 to 0
	add	\$1,$ido#b
	jnc	.Lw2ndloop

	/ Exit code
	/ key->i = key->j = 0
	xor	%eax,%eax
	mov	%eax,-8($dat)
	mov	%eax,-4($dat)

	ret
SET_SIZE(arcfour_key_init)
.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
#endif /* !lint && !__lint */
___

#
# The templates above write a byte/word/dword register as the 64-bit name
# followed by "#b", "#w" or "#d" (e.g. "%r8#d").  Drop the "#" to obtain
# the real sub-register name (%r8d).  Only %r8-%r15 can be spelled this
# way, so the pattern is anchored to those names and leaves every other
# "#" in the generated text (cpp directives, comments) untouched.
#
$code =~ s/(%r(?:[89]|1[0-5]))#([bwd])/$1$2/g;

print $code;

# Buffered output is only known to have reached the file once close()
# succeeds, so treat a failure here as fatal instead of leaving a
# truncated file behind with a zero exit status.
close(STDOUT) or die "$0: error writing output: $!\n";