/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */
.text	



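/*
 * rsaz_512_sqr: repeated 512-bit (8x64-bit limb) Montgomery squaring.
 * Register roles as used below: %rdi = output, %rsi = input, %rdx = modulus,
 * %rcx = n0 (Montgomery constant, -modulus[0]^-1 mod 2^64), %r8d = number of
 * squarings.  Each pass squares the current value, Montgomery-reduces it,
 * conditionally subtracts the modulus and feeds the result back in as the
 * next input.  A MULX/ADCX/ADOX path (.Loop_sqrx) is selected at run time.
 */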
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,@function
.align	32
rsaz_512_sqr:
.cfi_startproc	
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
.byte	102,72,15,110,202	/* movq %rdx,%xmm1: save modulus pointer */
	movq	(%rsi),%rdx
	movq	8(%rsi),%rax
	movq	%rcx,128(%rsp)
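/*
 * Dispatch: take the MULX/ADCX/ADOX path when word 2 of OPENSSL_ia32cap_P
 * (CPUID.7.0.EBX) has both BMI2 (bit 8) and ADX (bit 19) set, i.e. 0x80100.
 */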
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Loop_sqrx
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	%r8d,128+8(%rsp)

	movq	%rdx,%rbx
	movq	%rax,%rbp
	mulq	%rdx
	movq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	%rbx,%rax
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r8,%r8
	movq	%rdx,%r15
	adcq	$0,%rcx

	mulq	%rax
	addq	%r8,%rdx
	adcq	$0,%rcx

	movq	%rax,(%rsp)
	movq	%rdx,8(%rsp)


	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	movq	24(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r12
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r13
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r14
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r15
	movq	%rbp,%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx

	xorq	%rbx,%rbx
	addq	%r9,%r9
	movq	%rdx,%r8
	adcq	%r10,%r10
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	movq	16(%rsi),%rbp
	addq	%rax,%r9
	movq	24(%rsi),%rax
	adcq	%rdx,%r10
	adcq	$0,%rbx

	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)


	mulq	%rbp
	addq	%rax,%r12
	movq	32(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r13
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r13
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r14
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r14
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r15
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r15
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r8
	movq	%rbp,%rax
	adcq	$0,%rdx
	addq	%rcx,%r8
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r11,%r11
	movq	%rdx,%r9
	adcq	%r12,%r12
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	movq	24(%rsi),%r10
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	%rdx,%r12
	adcq	$0,%rcx

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)


	movq	%rax,%r11
	mulq	%r10
	addq	%rax,%r14
	movq	40(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	movq	%rax,%r12
	mulq	%r10
	addq	%rax,%r15
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%rbx
	adcq	$0,%rbx

	movq	%rax,%rbp
	mulq	%r10
	addq	%rax,%r8
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r8
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	addq	%rax,%r9
	movq	%r10,%rax
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx

	xorq	%rbx,%rbx
	addq	%r13,%r13
	movq	%rdx,%r10
	adcq	%r14,%r14
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%rax,%r13
	movq	%r12,%rax
	adcq	%rdx,%r14
	adcq	$0,%rbx

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)


	mulq	%r11
	addq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	addq	%rax,%r9
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r9
	movq	%rdx,%rcx
	adcq	$0,%rcx

	movq	%rax,%r14
	mulq	%r11
	addq	%rax,%r10
	movq	%r11,%rax
	adcq	$0,%rdx
	addq	%rcx,%r10
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r15,%r15
	movq	%rdx,%r11
	adcq	%r8,%r8
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	addq	%rax,%r15
	movq	%rbp,%rax
	adcq	%rdx,%r8
	adcq	$0,%rcx

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)


	mulq	%r12
	addq	%rax,%r10
	movq	%r14,%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r12
	addq	%rax,%r11
	movq	%r12,%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx

	xorq	%rbx,%rbx
	addq	%r9,%r9
	movq	%rdx,%r12
	adcq	%r10,%r10
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%rax,%r9
	movq	%r14,%rax
	adcq	%rdx,%r10
	adcq	$0,%rbx

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)


	mulq	%rbp
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r11,%r11
	movq	%rdx,%r13
	adcq	%r12,%r12
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	addq	%rax,%r11
	movq	%r14,%rax
	adcq	%rdx,%r12
	adcq	$0,%rcx

	movq	%r11,96(%rsp)
	movq	%r12,104(%rsp)


	xorq	%rbx,%rbx
	addq	%r13,%r13
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%r13,%rax
	adcq	%rbx,%rdx

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15
.byte	102,72,15,126,205	/* movq %xmm1,%rbp: restore modulus pointer */

	movq	%rax,112(%rsp)
	movq	%rdx,120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqr
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	%r8d,128+8(%rsp)
.byte	102,72,15,110,199	/* movq %rdi,%xmm0: save output pointer */

	mulxq	%rax,%r8,%r9
	movq	%rax,%rbx

	mulxq	16(%rsi),%rcx,%r10
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rcx,%r9

.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
	adcxq	%rax,%r10

.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
	adcxq	%rcx,%r11

	mulxq	48(%rsi),%rcx,%r14
	adcxq	%rax,%r12
	adcxq	%rcx,%r13

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rax,%r14
	adcxq	%rbp,%r15

	mulxq	%rdx,%rax,%rdi
	movq	%rbx,%rdx
	xorq	%rcx,%rcx
	adoxq	%r8,%r8
	adcxq	%rdi,%r8
	adoxq	%rbp,%rcx
	adcxq	%rbp,%rcx

	movq	%rax,(%rsp)
	movq	%r8,8(%rsp)


.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

	mulxq	24(%rsi),%rdi,%r8
	adoxq	%rdi,%r11
.byte	0x66
	adcxq	%r8,%r12

	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r12
	adcxq	%rbx,%r13

	mulxq	40(%rsi),%rdi,%r8
	adoxq	%rdi,%r13
	adcxq	%r8,%r14

.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
	adoxq	%rdi,%r15
	adcxq	%rbp,%r8
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r8
.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00

	xorq	%rbx,%rbx
	adoxq	%r9,%r9

	adcxq	%rcx,%rax
	adoxq	%r10,%r10
	adcxq	%rax,%r9
	adoxq	%rbp,%rbx
	adcxq	%rdi,%r10
	adcxq	%rbp,%rbx

	movq	%r9,16(%rsp)
.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00


	mulxq	24(%rsi),%rdi,%r9
	adoxq	%rdi,%r12
	adcxq	%r9,%r13

	mulxq	32(%rsi),%rax,%rcx
	adoxq	%rax,%r13
	adcxq	%rcx,%r14

.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
	adoxq	%rdi,%r14
	adcxq	%r9,%r15

.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
	adoxq	%rax,%r15
	adcxq	%rcx,%r8

	mulxq	56(%rsi),%rdi,%r9
	adoxq	%rdi,%r8
	adcxq	%rbp,%r9
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r9
	movq	24(%rsi),%rdx

	xorq	%rcx,%rcx
	adoxq	%r11,%r11

	adcxq	%rbx,%rax
	adoxq	%r12,%r12
	adcxq	%rax,%r11
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r12
	adcxq	%rbp,%rcx

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)


	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

	mulxq	40(%rsi),%rdi,%r10
	adoxq	%rdi,%r15
	adcxq	%r10,%r8

	mulxq	48(%rsi),%rax,%rbx
	adoxq	%rax,%r8
	adcxq	%rbx,%r9

	mulxq	56(%rsi),%rdi,%r10
	adoxq	%rdi,%r9
	adcxq	%rbp,%r10
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r10
	movq	32(%rsi),%rdx

	xorq	%rbx,%rbx
	adoxq	%r13,%r13

	adcxq	%rcx,%rax
	adoxq	%r14,%r14
	adcxq	%rax,%r13
	adoxq	%rbp,%rbx
	adcxq	%rdi,%r14
	adcxq	%rbp,%rbx

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)


	mulxq	40(%rsi),%rdi,%r11
	adoxq	%rdi,%r8
	adcxq	%r11,%r9

	mulxq	48(%rsi),%rax,%rcx
	adoxq	%rax,%r9
	adcxq	%rcx,%r10

	mulxq	56(%rsi),%rdi,%r11
	adoxq	%rdi,%r10
	adcxq	%rbp,%r11
	mulxq	%rdx,%rax,%rdi
	movq	40(%rsi),%rdx
	adoxq	%rbp,%r11

	xorq	%rcx,%rcx
	adoxq	%r15,%r15

	adcxq	%rbx,%rax
	adoxq	%r8,%r8
	adcxq	%rax,%r15
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r8
	adcxq	%rbp,%rcx

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)


.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
	adoxq	%rdi,%r11
	adcxq	%rbp,%r12
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r12
	movq	48(%rsi),%rdx

	xorq	%rbx,%rbx
	adoxq	%r9,%r9

	adcxq	%rcx,%rax
	adoxq	%r10,%r10
	adcxq	%rax,%r9
	adcxq	%rdi,%r10
	adoxq	%rbp,%rbx
	adcxq	%rbp,%rbx

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)


.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
	adoxq	%rax,%r12
	adoxq	%rbp,%r13

	mulxq	%rdx,%rax,%rdi
	xorq	%rcx,%rcx
	movq	56(%rsi),%rdx
	adoxq	%r11,%r11

	adcxq	%rbx,%rax
	adoxq	%r12,%r12
	adcxq	%rax,%r11
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r12
	adcxq	%rbp,%rcx

.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00


	mulxq	%rdx,%rax,%rdx
	xorq	%rbx,%rbx
	adoxq	%r13,%r13

	adcxq	%rcx,%rax
	adoxq	%rbp,%rbx
	adcxq	%r13,%rax
	adcxq	%rdx,%rbx

.byte	102,72,15,126,199	/* movq %xmm0,%rdi: restore output pointer */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp: restore modulus pointer */

	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	movq	%rax,112(%rsp)
	movq	%rbx,120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqrx

.Lsqr_tail:

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	rsaz_512_sqr,.-rsaz_512_sqr
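/*
 * rsaz_512_mul: one 512-bit Montgomery multiplication.  Register roles as
 * used below: %rdi = output, %rsi = first operand, %rdx = second operand,
 * %rcx = modulus, %r8 = n0.  The 1024-bit product is formed by __rsaz_512_mul
 * (or the MULX variant), Montgomery-reduced, and finished with a conditional
 * subtraction in __rsaz_512_subtract.
 */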
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
.cfi_startproc	
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
.byte	102,72,15,110,199	/* movq %rdi,%xmm0: save output pointer */
.byte	102,72,15,110,201	/* movq %rcx,%xmm1: save modulus pointer */
	movq	%r8,128(%rsp)
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx
	movq	(%rdx),%rbx
	movq	%rdx,%rbp
	call	__rsaz_512_mul

.byte	102,72,15,126,199	/* movq %xmm0,%rdi: restore output pointer */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp: restore modulus pointer */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	%rdx,%rbp
	movq	(%rdx),%rdx
	call	__rsaz_512_mulx

.byte	102,72,15,126,199	/* movq %xmm0,%rdi: restore output pointer */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp: restore modulus pointer */

	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex
.Lmul_tail:
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	rsaz_512_mul,.-rsaz_512_mul
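/*
 * rsaz_512_mul_gather4: multiply the operand at %rsi by one of the 16 entries
 * of the precomputed table at %rdx, selected by %r9d, and Montgomery-reduce
 * the result into %rdi.  %rcx = modulus, %r8 = n0.  Each limb of the table
 * entry is gathered with SSE compare masks built from .Linc, so every
 * candidate qword is read regardless of the index (constant-time access
 * pattern).
 */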
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
.cfi_startproc	
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$152,%rsp
.cfi_adjust_cfa_offset	152
.Lmul_gather4_body:
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_gather
.byte	102,76,15,126,195	/* movq %xmm8,%rbx: gathered multiplier limb */

	movq	%r8,128(%rsp)
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rsp),%rdi
	movl	$7,%ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,195	/* movq %xmm8,%rbx: next gathered multiplier limb */

	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	128+8(%rsp),%rdi
	movq	128+16(%rsp),%rbp

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
.byte	102,76,15,126,194	/* movq %xmm8,%rdx: gathered multiplier limb */

	movq	%r8,128(%rsp)
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	mulxq	(%rsi),%rbx,%r8
	movq	%rbx,(%rsp)
	xorl	%edi,%edi

	mulxq	8(%rsi),%rax,%r9

	mulxq	16(%rsi),%rbx,%r10
	adcxq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcxq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcxq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rbx,%r13
	adcxq	%rax,%r14
.byte	0x67
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	movq	$-7,%rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,194	/* movq %xmm8,%rdx: next gathered multiplier limb */

.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
.byte	0x67
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	%rbx,64(%rsp,%rcx,8)
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	incq	%rcx
	jnz	.Loop_mulx_gather

	movq	%r8,64(%rsp)
	movq	%r9,64+8(%rsp)
	movq	%r10,64+16(%rsp)
	movq	%r11,64+24(%rsp)
	movq	%r12,64+32(%rsp)
	movq	%r13,64+40(%rsp)
	movq	%r14,64+48(%rsp)
	movq	%r15,64+56(%rsp)

	movq	128(%rsp),%rdx
	movq	128+8(%rsp),%rdi
	movq	128+16(%rsp),%rbp

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
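/*
 * rsaz_512_mul_scatter4: Montgomery-multiply the values at %rdi and %rsi,
 * write the reduced result to %rdi and also scatter it into the table at
 * %r8, slot %r9d (eight qwords, one every 128 bytes).  %rdx = modulus,
 * %rcx = n0.
 */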
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc	
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	movl	%r9d,%r9d
	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8
.byte	102,72,15,110,199	/* movq %rdi,%xmm0: save output pointer */
.byte	102,72,15,110,202	/* movq %rdx,%xmm1: save modulus pointer */
.byte	102,73,15,110,208	/* movq %r8,%xmm2: save table slot pointer */
	movq	%rcx,128(%rsp)

	movq	%rdi,%rbp
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_scatter
	movq	(%rdi),%rbx
	call	__rsaz_512_mul

.byte	102,72,15,126,199	/* movq %xmm0,%rdi: restore output pointer */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp: restore modulus pointer */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	(%rdi),%rdx
	call	__rsaz_512_mulx

.byte	102,72,15,126,199	/* movq %xmm0,%rdi: restore output pointer */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp: restore modulus pointer */

	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
.byte	102,72,15,126,214	/* movq %xmm2,%rsi: restore table slot pointer */
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
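/*
 * rsaz_512_mul_by_one: convert a value out of Montgomery form, i.e.
 * Montgomery-reduce the bare input (out = inp * 2^-512 mod modulus).
 * %rdi = output, %rsi = input, %rdx = modulus, %rcx = n0.  The scratch area
 * on the stack is cleared before the reduction.
 */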
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
.cfi_startproc	
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	movq	%rdx,%rbp
	movq	%rcx,128(%rsp)

	movq	(%rsi),%r8
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	je	.Lby_one_callx
	call	__rsaz_512_reduce
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp),%rdx
	call	__rsaz_512_reducex
.Lby_one_tail:
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc	
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
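/*
 * __rsaz_512_reduce: eight rounds of word-by-word Montgomery reduction of the
 * low 512 bits held in %r8..%r15 against the modulus at (%rbp), using n0 from
 * the caller's frame (128+8(%rsp) here, i.e. the caller's 128(%rsp) past this
 * call's return address).  Callers then fold in the upper half of the product
 * and perform the conditional subtraction as needed.
 */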
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
.cfi_startproc	
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	128+8(%rsp),%rsi


	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	.byte	0xf3,0xc3
.cfi_endproc	
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
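/*
 * __rsaz_512_reducex: the same reduction as __rsaz_512_reduce, but using
 * MULX/ADCX/ADOX; expects n0 in %rdx on entry.
 */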
.type	__rsaz_512_reducex,@function
.align	32
__rsaz_512_reducex:
.cfi_startproc	

	imulq	%r8,%rdx
	xorq	%rsi,%rsi
	movl	$8,%ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	128+8(%rsp),%rbx,%rdx
	movq	%rax,%rdx

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

	decl	%ecx
	jne	.Lreduction_loopx

	.byte	0xf3,0xc3
.cfi_endproc	
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
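/*
 * __rsaz_512_subtract: store %r8..%r15 to (%rdi), then add the two's
 * complement of the modulus at (%rbp) masked by %rcx (all-ones to subtract
 * the modulus, zero to leave the value unchanged) and store the final eight
 * limbs back to (%rdi).
 */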
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
.cfi_startproc	
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8
	notq	%r9
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc	
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
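/*
 * __rsaz_512_mul: schoolbook 8x8-limb multiplication.  Multiplies the operand
 * at (%rsi) by the operand at (%rbp) (first limb pre-loaded into %rbx by the
 * caller) and writes the 1024-bit product to the caller's stack starting at
 * the caller's (%rsp); the upper eight limbs are also left in %r8..%r15.
 */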
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
.cfi_startproc	
	leaq	8(%rsp),%rdi

	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp
	leaq	8(%rdi),%rdi

	movl	$7,%ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	(%rbp),%rbx
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	leaq	8(%rbp),%rbp
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc	
.size	__rsaz_512_mul,.-__rsaz_512_mul
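/*
 * __rsaz_512_mulx: MULX/ADCX/ADOX variant of __rsaz_512_mul.  Expects the
 * first limb of the second operand in %rdx and its pointer in %rbp; the
 * 1024-bit product is written to the caller's stack as above.
 */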
.type	__rsaz_512_mulx,@function
.align	32
__rsaz_512_mulx:
.cfi_startproc	
	mulxq	(%rsi),%rbx,%r8
	movq	$-6,%rcx

	mulxq	8(%rsi),%rax,%r9
	movq	%rbx,8(%rsp)

	mulxq	16(%rsi),%rbx,%r10
	adcq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	movq	8(%rbp),%rdx
	adcq	%rbx,%r13
	adcq	%rax,%r14
	adcq	$0,%r15

	xorq	%rdi,%rdi
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rsi),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	64(%rbp,%rcx,8),%rdx
	movq	%rbx,8+64-8(%rsp,%rcx,8)
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	incq	%rcx
	jnz	.Loop_mulx

	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
	adcxq	%rax,%r8
	adoxq	%r10,%r9

.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	movq	%rbx,8+64-8(%rsp)
	movq	%r8,8+64(%rsp)
	movq	%r9,8+64+8(%rsp)
	movq	%r10,8+64+16(%rsp)
	movq	%r11,8+64+24(%rsp)
	movq	%r12,8+64+32(%rsp)
	movq	%r13,8+64+40(%rsp)
	movq	%r14,8+64+48(%rsp)
	movq	%r15,8+64+56(%rsp)

	.byte	0xf3,0xc3
.cfi_endproc	
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
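/*
 * rsaz_512_scatter4: store the eight qwords at %rsi into slot %rdx of the
 * table at %rdi, one qword every 128 bytes (limb-interleaved layout).
 */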
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
.cfi_startproc	
	leaq	(%rdi,%rdx,8),%rdi
	movl	$8,%r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_scatter
	.byte	0xf3,0xc3
.cfi_endproc	
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

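/*
 * rsaz_512_gather4: fetch table entry %edx from the table at %rsi into %rdi.
 * All 16 candidate qwords of every limb are loaded and combined under SSE
 * compare masks, so the memory access pattern does not depend on the index
 * (constant-time gather).
 */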
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
.cfi_startproc	
	movd	%edx,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7
	movl	$8,%r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	.byte	0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
.cfi_endproc	
.size	rsaz_512_gather4,.-rsaz_512_gather4

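/*
 * .Linc: initial lane indices {0,0,1,1} and per-step increment {2,2,2,2}
 * used to build the pcmpeqd selection masks in the gather code above.
 */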
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2