/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies a block of memory from source to destination
 *	Implements memcpy() and memmove() libc primitives.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "cache.h"
#include "proc64_id.h"

#define L(s) .memcpy##s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
 * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
 * future AMD processors.
 *
 *
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *      if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64-bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128-bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
 *			} else {
 *			    use pairs of xmm registers with SSE2 or SSSE3
 *			    instructions to concatenate and shift appropriately
 *			    to account for source unalignment. This enables
 *			    16-byte aligned loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except for the case where the copy
 *	must be done backwards because the regions overlap (src < dst <
 *	src + len).  The backward copy is implemented in a similar manner.
 */
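
/*
 * Overlap decision, sketched in C purely for illustration (this mirrors
 * the three-way check at the memmove entry point below):
 *
 *	if (dst <= src || dst >= src + len)
 *		copy forward;
 *	else
 *		copy backward;		# overlap with src < dst < src + len
 */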

	ENTRY(memmove)
	cmp	%rsi,%rdi		# if dst <= src
	jbe	L(CopyForward)		# then do copy forward
	mov	%rsi,%r9		# move src to r9
	add	%rdx,%r9		# add len to get addr of end of src
	cmp	%r9,%rdi		# if dst < end of src
	jb	L(CopyBackwards)	# then do copy backwards
	jmp	L(CopyForward)

	ENTRY (memcpy)
L(CopyForward):
	mov    %rdx,%r8
	mov    %rdi,%rcx
	mov    %rsi,%rdx
	mov    %rdi,%rax
	lea    L(fwdPxQx)(%rip),%r11
	cmp    $0x80,%r8		# 128
	jg     L(ck_use_sse2)
	add    %r8,%rcx
	add    %r8,%rdx

	movslq (%r11,%r8,4),%r10
	lea    (%r10,%r11,1),%r11
	jmpq   *%r11
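
	/*
	 * Computed goto: L(fwdPxQx) holds 32-bit offsets relative to the
	 * table base, so the target is &fwdPxQx + fwdPxQx[len].  %rcx and
	 * %rdx now point one byte past the end of dst and src, so each
	 * PxQy entry copies using negative offsets from the end.
	 */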

	.balign 16
L(ShrtAlignNew):
	lea    L(AliPxQx)(%rip),%r11
	mov    %rcx,%r9
	and    $0xf,%r9

	movslq (%r11,%r9,4),%r10
	lea    (%r10,%r11,1),%r11
	jmpq   *%r11
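
	/*
	 * L(AliPxQx) is indexed by (dst & 0xf); each entry copies the
	 * 16 - (dst & 0xf) bytes needed to bring dst up to the next
	 * 16-byte boundary (A1Q0 moves 15 bytes, ..., A7Q1 moves 1),
	 * then jumps to the aligned path at L(now_qw_aligned).
	 */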

	.balign 16
L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
           .int        L(P1Q0)-L(fwdPxQx)
           .int        L(P2Q0)-L(fwdPxQx)
           .int        L(P3Q0)-L(fwdPxQx)
           .int        L(P4Q0)-L(fwdPxQx)
           .int        L(P5Q0)-L(fwdPxQx)
           .int        L(P6Q0)-L(fwdPxQx)
           .int        L(P7Q0)-L(fwdPxQx)

           .int        L(P0Q1)-L(fwdPxQx)
           .int        L(P1Q1)-L(fwdPxQx)
           .int        L(P2Q1)-L(fwdPxQx)
           .int        L(P3Q1)-L(fwdPxQx)
           .int        L(P4Q1)-L(fwdPxQx)
           .int        L(P5Q1)-L(fwdPxQx)
           .int        L(P6Q1)-L(fwdPxQx)
           .int        L(P7Q1)-L(fwdPxQx)

           .int        L(P0Q2)-L(fwdPxQx)
           .int        L(P1Q2)-L(fwdPxQx)
           .int        L(P2Q2)-L(fwdPxQx)
           .int        L(P3Q2)-L(fwdPxQx)
           .int        L(P4Q2)-L(fwdPxQx)
           .int        L(P5Q2)-L(fwdPxQx)
           .int        L(P6Q2)-L(fwdPxQx)
           .int        L(P7Q2)-L(fwdPxQx)

           .int        L(P0Q3)-L(fwdPxQx)
           .int        L(P1Q3)-L(fwdPxQx)
           .int        L(P2Q3)-L(fwdPxQx)
           .int        L(P3Q3)-L(fwdPxQx)
           .int        L(P4Q3)-L(fwdPxQx)
           .int        L(P5Q3)-L(fwdPxQx)
           .int        L(P6Q3)-L(fwdPxQx)
           .int        L(P7Q3)-L(fwdPxQx)

           .int        L(P0Q4)-L(fwdPxQx)
           .int        L(P1Q4)-L(fwdPxQx)
           .int        L(P2Q4)-L(fwdPxQx)
           .int        L(P3Q4)-L(fwdPxQx)
           .int        L(P4Q4)-L(fwdPxQx)
           .int        L(P5Q4)-L(fwdPxQx)
           .int        L(P6Q4)-L(fwdPxQx)
           .int        L(P7Q4)-L(fwdPxQx)

           .int        L(P0Q5)-L(fwdPxQx)
           .int        L(P1Q5)-L(fwdPxQx)
           .int        L(P2Q5)-L(fwdPxQx)
           .int        L(P3Q5)-L(fwdPxQx)
           .int        L(P4Q5)-L(fwdPxQx)
           .int        L(P5Q5)-L(fwdPxQx)
           .int        L(P6Q5)-L(fwdPxQx)
           .int        L(P7Q5)-L(fwdPxQx)

           .int        L(P0Q6)-L(fwdPxQx)
           .int        L(P1Q6)-L(fwdPxQx)
           .int        L(P2Q6)-L(fwdPxQx)
           .int        L(P3Q6)-L(fwdPxQx)
           .int        L(P4Q6)-L(fwdPxQx)
           .int        L(P5Q6)-L(fwdPxQx)
           .int        L(P6Q6)-L(fwdPxQx)
           .int        L(P7Q6)-L(fwdPxQx)

           .int        L(P0Q7)-L(fwdPxQx)
           .int        L(P1Q7)-L(fwdPxQx)
           .int        L(P2Q7)-L(fwdPxQx)
           .int        L(P3Q7)-L(fwdPxQx)
           .int        L(P4Q7)-L(fwdPxQx)
           .int        L(P5Q7)-L(fwdPxQx)
           .int        L(P6Q7)-L(fwdPxQx)
           .int        L(P7Q7)-L(fwdPxQx)

           .int        L(P0Q8)-L(fwdPxQx)
           .int        L(P1Q8)-L(fwdPxQx)
           .int        L(P2Q8)-L(fwdPxQx)
           .int        L(P3Q8)-L(fwdPxQx)
           .int        L(P4Q8)-L(fwdPxQx)
           .int        L(P5Q8)-L(fwdPxQx)
           .int        L(P6Q8)-L(fwdPxQx)
           .int        L(P7Q8)-L(fwdPxQx)

           .int        L(P0Q9)-L(fwdPxQx)
           .int        L(P1Q9)-L(fwdPxQx)
           .int        L(P2Q9)-L(fwdPxQx)
           .int        L(P3Q9)-L(fwdPxQx)
           .int        L(P4Q9)-L(fwdPxQx)
           .int        L(P5Q9)-L(fwdPxQx)
           .int        L(P6Q9)-L(fwdPxQx)
           .int        L(P7Q9)-L(fwdPxQx)

           .int        L(P0QA)-L(fwdPxQx)
           .int        L(P1QA)-L(fwdPxQx)
           .int        L(P2QA)-L(fwdPxQx)
           .int        L(P3QA)-L(fwdPxQx)
           .int        L(P4QA)-L(fwdPxQx)
           .int        L(P5QA)-L(fwdPxQx)
           .int        L(P6QA)-L(fwdPxQx)
           .int        L(P7QA)-L(fwdPxQx)

           .int        L(P0QB)-L(fwdPxQx)
           .int        L(P1QB)-L(fwdPxQx)
           .int        L(P2QB)-L(fwdPxQx)
           .int        L(P3QB)-L(fwdPxQx)
           .int        L(P4QB)-L(fwdPxQx)
           .int        L(P5QB)-L(fwdPxQx)
           .int        L(P6QB)-L(fwdPxQx)
           .int        L(P7QB)-L(fwdPxQx)

           .int        L(P0QC)-L(fwdPxQx)
           .int        L(P1QC)-L(fwdPxQx)
           .int        L(P2QC)-L(fwdPxQx)
           .int        L(P3QC)-L(fwdPxQx)
           .int        L(P4QC)-L(fwdPxQx)
           .int        L(P5QC)-L(fwdPxQx)
           .int        L(P6QC)-L(fwdPxQx)
           .int        L(P7QC)-L(fwdPxQx)

           .int        L(P0QD)-L(fwdPxQx)
           .int        L(P1QD)-L(fwdPxQx)
           .int        L(P2QD)-L(fwdPxQx)
           .int        L(P3QD)-L(fwdPxQx)
           .int        L(P4QD)-L(fwdPxQx)
           .int        L(P5QD)-L(fwdPxQx)
           .int        L(P6QD)-L(fwdPxQx)
           .int        L(P7QD)-L(fwdPxQx)

           .int        L(P0QE)-L(fwdPxQx)
           .int        L(P1QE)-L(fwdPxQx)
           .int        L(P2QE)-L(fwdPxQx)
           .int        L(P3QE)-L(fwdPxQx)
           .int        L(P4QE)-L(fwdPxQx)
           .int        L(P5QE)-L(fwdPxQx)
           .int        L(P6QE)-L(fwdPxQx)
           .int        L(P7QE)-L(fwdPxQx)

           .int        L(P0QF)-L(fwdPxQx)
           .int        L(P1QF)-L(fwdPxQx)
           .int        L(P2QF)-L(fwdPxQx)
           .int        L(P3QF)-L(fwdPxQx)
           .int        L(P4QF)-L(fwdPxQx)
           .int        L(P5QF)-L(fwdPxQx)
           .int        L(P6QF)-L(fwdPxQx)
           .int        L(P7QF)-L(fwdPxQx)

           .int        L(P0QG)-L(fwdPxQx)	# 0x80

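/*
 * Naming: entry PxQy copies y*8 + x bytes, addressed back from the end
 * of the buffers (Q counts 8-byte words, P the 0-7 byte remainder;
 * QA-QG continue hexadecimally past Q9 up to the 0x80 bytes of P0QG).
 */
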
	   .balign 16
L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
           .int        L(A1Q0)-L(AliPxQx)
           .int        L(A2Q0)-L(AliPxQx)
           .int        L(A3Q0)-L(AliPxQx)
           .int        L(A4Q0)-L(AliPxQx)
           .int        L(A5Q0)-L(AliPxQx)
           .int        L(A6Q0)-L(AliPxQx)
           .int        L(A7Q0)-L(AliPxQx)
           .int        L(A0Q1)-L(AliPxQx)
           .int        L(A1Q1)-L(AliPxQx)
           .int        L(A2Q1)-L(AliPxQx)
           .int        L(A3Q1)-L(AliPxQx)
           .int        L(A4Q1)-L(AliPxQx)
           .int        L(A5Q1)-L(AliPxQx)
           .int        L(A6Q1)-L(AliPxQx)
           .int        L(A7Q1)-L(AliPxQx)

	.balign 16
L(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
	movzbq (%rdx),%r11
	sub    $0xf,%r8
	mov    %r11b,(%rcx)

	movzwq 0x1(%rdx),%r10
	mov    %r10w,0x1(%rcx)

	mov    0x3(%rdx),%r9d
	mov    %r9d,0x3(%rcx)

	mov    0x7(%rdx),%r11
	add    $0xf,%rdx
	mov    %r11,0x7(%rcx)

	add    $0xf,%rcx
	jmp    L(now_qw_aligned)
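
	/*
	 * Each AxQy entry below repeats this pattern with its own byte
	 * mix: copy the 1/2/4/8-byte pieces of the prefix, advance
	 * %rdx and %rcx past it, deduct it from the count in %r8, and
	 * jump to L(now_qw_aligned).
	 */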

	.balign 16
L(A2Q0):			# ; need to move 8+ 6=2+4 bytes
	movzwq (%rdx),%r10
	sub    $0xe,%r8
	mov    %r10w,(%rcx)

	mov    0x2(%rdx),%r9d
	mov    %r9d,0x2(%rcx)

	mov    0x6(%rdx),%r11
	add    $0xe,%rdx
	mov    %r11,0x6(%rcx)
	add    $0xe,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A3Q0):			# ; need to move 8+ 5=1+4 bytes
	movzbq (%rdx),%r11
	sub    $0xd,%r8
	mov    %r11b,(%rcx)

	mov    0x1(%rdx),%r9d
	mov    %r9d,0x1(%rcx)

	mov    0x5(%rdx),%r10
	add    $0xd,%rdx
	mov    %r10,0x5(%rcx)

	add    $0xd,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A4Q0):			# ; need to move 8+4 bytes
	mov    (%rdx),%r9d
	sub    $0xc,%r8
	mov    %r9d,(%rcx)

	mov    0x4(%rdx),%r10
	add    $0xc,%rdx
	mov    %r10,0x4(%rcx)

	add    $0xc,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A5Q0):			# ; need to move 8+ 3=1+2 bytes
	movzbq (%rdx),%r11
	sub    $0xb,%r8
	mov    %r11b,(%rcx)

	movzwq 0x1(%rdx),%r10
	mov    %r10w,0x1(%rcx)

	mov    0x3(%rdx),%r9
	add    $0xb,%rdx
	mov    %r9,0x3(%rcx)

	add    $0xb,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A6Q0):			# ; need to move 8+2 bytes
	movzwq (%rdx),%r10
	sub    $0xa,%r8
	mov    %r10w,(%rcx)

	mov    0x2(%rdx),%r9
	add    $0xa,%rdx
	mov    %r9,0x2(%rcx)

	add    $0xa,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A7Q0):			# ; need to move 8+1 byte
	movzbq (%rdx),%r11
	sub    $0x9,%r8
	mov    %r11b,(%rcx)

	mov    0x1(%rdx),%r10
	add    $0x9,%rdx
	mov    %r10,0x1(%rcx)

	add    $0x9,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A0Q1):			# ; need to move 8 bytes

	mov    (%rdx),%r10
	add    $0x8,%rdx
	sub    $0x8,%r8
	mov    %r10,(%rcx)

	add    $0x8,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A1Q1):			# ; need to move 7=1+2+4 bytes
	movzbq (%rdx),%r11
	sub    $0x7,%r8
	mov    %r11b,(%rcx)

	movzwq 0x1(%rdx),%r10
	mov    %r10w,0x1(%rcx)

	mov    0x3(%rdx),%r9d
	add    $0x7,%rdx
	mov    %r9d,0x3(%rcx)
	add    $0x7,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A2Q1):			# ; need to move 6=2+4 bytes
	movzwq (%rdx),%r10
	sub    $0x6,%r8
	mov    %r10w,(%rcx)
	mov    0x2(%rdx),%r9d
	add    $0x6,%rdx
	mov    %r9d,0x2(%rcx)
	add    $0x6,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A3Q1):			# ; need to move 5=1+4 bytes
	movzbq (%rdx),%r11
	sub    $0x5,%r8
	mov    %r11b,(%rcx)
	mov    0x1(%rdx),%r9d
	add    $0x5,%rdx
	mov    %r9d,0x1(%rcx)
	add    $0x5,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A4Q1):			# ; need to move 4 bytes
	mov    (%rdx),%r9d
	sub    $0x4,%r8
	add    $0x4,%rdx
	mov    %r9d,(%rcx)
	add    $0x4,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A5Q1):			# ; need to move 3=1+2 bytes
	movzbq (%rdx),%r11
	sub    $0x3,%r8
	mov    %r11b,(%rcx)

	movzwq 0x1(%rdx),%r10
	add    $0x3,%rdx
	mov    %r10w,0x1(%rcx)

	add    $0x3,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A6Q1):			# ; need to move 2 bytes
	movzwq (%rdx),%r10
	sub    $0x2,%r8
	add    $0x2,%rdx
	mov    %r10w,(%rcx)
	add    $0x2,%rcx
	jmp    L(now_qw_aligned)

	.balign 16
L(A7Q1):			# ; need to move 1 byte
	movzbq (%rdx),%r11
	dec    %r8
	inc    %rdx
	mov    %r11b,(%rcx)
	inc    %rcx
	jmp    L(now_qw_aligned)


	.balign 16
L(P0QG):
	mov    -0x80(%rdx),%r9
	mov    %r9,-0x80(%rcx)
L(P0QF):
	mov    -0x78(%rdx),%r10
	mov    %r10,-0x78(%rcx)
L(P0QE):
	mov    -0x70(%rdx),%r9
	mov    %r9,-0x70(%rcx)
L(P0QD):
	mov    -0x68(%rdx),%r10
	mov    %r10,-0x68(%rcx)
L(P0QC):
	mov    -0x60(%rdx),%r9
	mov    %r9,-0x60(%rcx)
L(P0QB):
	mov    -0x58(%rdx),%r10
	mov    %r10,-0x58(%rcx)
L(P0QA):
	mov    -0x50(%rdx),%r9
	mov    %r9,-0x50(%rcx)
L(P0Q9):
	mov    -0x48(%rdx),%r10
	mov    %r10,-0x48(%rcx)
L(P0Q8):
	mov    -0x40(%rdx),%r9
	mov    %r9,-0x40(%rcx)
L(P0Q7):
	mov    -0x38(%rdx),%r10
	mov    %r10,-0x38(%rcx)
L(P0Q6):
	mov    -0x30(%rdx),%r9
	mov    %r9,-0x30(%rcx)
L(P0Q5):
	mov    -0x28(%rdx),%r10
	mov    %r10,-0x28(%rcx)
L(P0Q4):
	mov    -0x20(%rdx),%r9
	mov    %r9,-0x20(%rcx)
L(P0Q3):
	mov    -0x18(%rdx),%r10
	mov    %r10,-0x18(%rcx)
L(P0Q2):
	mov    -0x10(%rdx),%r9
	mov    %r9,-0x10(%rcx)
L(P0Q1):
	mov    -0x8(%rdx),%r10
	mov    %r10,-0x8(%rcx)
L(P0Q0):
	ret
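
	/*
	 * Jumping into the chain above at P0Qn falls through n 8-byte
	 * copies, each addressed back from the end of the buffers, then
	 * returns.  The P1-P7 chains below work the same way and finish
	 * with a 1-7 byte tail.
	 */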

	.balign 16
L(P1QF):
	mov    -0x79(%rdx),%r9
	mov    %r9,-0x79(%rcx)
L(P1QE):
	mov    -0x71(%rdx),%r11
	mov    %r11,-0x71(%rcx)
L(P1QD):
	mov    -0x69(%rdx),%r10
	mov    %r10,-0x69(%rcx)
L(P1QC):
	mov    -0x61(%rdx),%r9
	mov    %r9,-0x61(%rcx)
L(P1QB):
	mov    -0x59(%rdx),%r11
	mov    %r11,-0x59(%rcx)
L(P1QA):
	mov    -0x51(%rdx),%r10
	mov    %r10,-0x51(%rcx)
L(P1Q9):
	mov    -0x49(%rdx),%r9
	mov    %r9,-0x49(%rcx)
L(P1Q8):
	mov    -0x41(%rdx),%r11
	mov    %r11,-0x41(%rcx)
L(P1Q7):
	mov    -0x39(%rdx),%r10
	mov    %r10,-0x39(%rcx)
L(P1Q6):
	mov    -0x31(%rdx),%r9
	mov    %r9,-0x31(%rcx)
L(P1Q5):
	mov    -0x29(%rdx),%r11
	mov    %r11,-0x29(%rcx)
L(P1Q4):
	mov    -0x21(%rdx),%r10
	mov    %r10,-0x21(%rcx)
L(P1Q3):
	mov    -0x19(%rdx),%r9
	mov    %r9,-0x19(%rcx)
L(P1Q2):
	mov    -0x11(%rdx),%r11
	mov    %r11,-0x11(%rcx)
L(P1Q1):
	mov    -0x9(%rdx),%r10
	mov    %r10,-0x9(%rcx)
L(P1Q0):
	movzbq -0x1(%rdx),%r9
	mov    %r9b,-0x1(%rcx)
	ret

	.balign 16
L(P2QF):
	mov    -0x7a(%rdx),%r9
	mov    %r9,-0x7a(%rcx)
L(P2QE):
	mov    -0x72(%rdx),%r11
	mov    %r11,-0x72(%rcx)
L(P2QD):
	mov    -0x6a(%rdx),%r10
	mov    %r10,-0x6a(%rcx)
L(P2QC):
	mov    -0x62(%rdx),%r9
	mov    %r9,-0x62(%rcx)
L(P2QB):
	mov    -0x5a(%rdx),%r11
	mov    %r11,-0x5a(%rcx)
L(P2QA):
	mov    -0x52(%rdx),%r10
	mov    %r10,-0x52(%rcx)
L(P2Q9):
	mov    -0x4a(%rdx),%r9
	mov    %r9,-0x4a(%rcx)
L(P2Q8):
	mov    -0x42(%rdx),%r11
	mov    %r11,-0x42(%rcx)
L(P2Q7):
	mov    -0x3a(%rdx),%r10
	mov    %r10,-0x3a(%rcx)
L(P2Q6):
	mov    -0x32(%rdx),%r9
	mov    %r9,-0x32(%rcx)
L(P2Q5):
	mov    -0x2a(%rdx),%r11
	mov    %r11,-0x2a(%rcx)
L(P2Q4):
	mov    -0x22(%rdx),%r10
	mov    %r10,-0x22(%rcx)
L(P2Q3):
	mov    -0x1a(%rdx),%r9
	mov    %r9,-0x1a(%rcx)
L(P2Q2):
	mov    -0x12(%rdx),%r11
	mov    %r11,-0x12(%rcx)
L(P2Q1):
	mov    -0xa(%rdx),%r10
	mov    %r10,-0xa(%rcx)
L(P2Q0):
	movzwq -0x2(%rdx),%r9
	mov    %r9w,-0x2(%rcx)
	ret

	.balign 16
L(P3QF):
	mov    -0x7b(%rdx),%r9
	mov    %r9,-0x7b(%rcx)
L(P3QE):
	mov    -0x73(%rdx),%r11
	mov    %r11,-0x73(%rcx)
L(P3QD):
	mov    -0x6b(%rdx),%r10
	mov    %r10,-0x6b(%rcx)
L(P3QC):
	mov    -0x63(%rdx),%r9
	mov    %r9,-0x63(%rcx)
L(P3QB):
	mov    -0x5b(%rdx),%r11
	mov    %r11,-0x5b(%rcx)
L(P3QA):
	mov    -0x53(%rdx),%r10
	mov    %r10,-0x53(%rcx)
L(P3Q9):
	mov    -0x4b(%rdx),%r9
	mov    %r9,-0x4b(%rcx)
L(P3Q8):
	mov    -0x43(%rdx),%r11
	mov    %r11,-0x43(%rcx)
L(P3Q7):
	mov    -0x3b(%rdx),%r10
	mov    %r10,-0x3b(%rcx)
L(P3Q6):
	mov    -0x33(%rdx),%r9
	mov    %r9,-0x33(%rcx)
L(P3Q5):
	mov    -0x2b(%rdx),%r11
	mov    %r11,-0x2b(%rcx)
L(P3Q4):
	mov    -0x23(%rdx),%r10
	mov    %r10,-0x23(%rcx)
L(P3Q3):
	mov    -0x1b(%rdx),%r9
	mov    %r9,-0x1b(%rcx)
L(P3Q2):
	mov    -0x13(%rdx),%r11
	mov    %r11,-0x13(%rcx)
L(P3Q1):
	mov    -0xb(%rdx),%r10
	mov    %r10,-0xb(%rcx)
	/*
	 * These trailing entries must do all their loads first, then
	 * do the stores.
	 */
L(P3Q0):
	movzwq -0x3(%rdx),%r9
	movzbq -0x1(%rdx),%r10
	mov    %r9w,-0x3(%rcx)
	mov    %r10b,-0x1(%rcx)
	ret

	.balign 16
L(P4QF):
	mov    -0x7c(%rdx),%r9
	mov    %r9,-0x7c(%rcx)
L(P4QE):
	mov    -0x74(%rdx),%r11
	mov    %r11,-0x74(%rcx)
L(P4QD):
	mov    -0x6c(%rdx),%r10
	mov    %r10,-0x6c(%rcx)
L(P4QC):
	mov    -0x64(%rdx),%r9
	mov    %r9,-0x64(%rcx)
L(P4QB):
	mov    -0x5c(%rdx),%r11
	mov    %r11,-0x5c(%rcx)
L(P4QA):
	mov    -0x54(%rdx),%r10
	mov    %r10,-0x54(%rcx)
L(P4Q9):
	mov    -0x4c(%rdx),%r9
	mov    %r9,-0x4c(%rcx)
L(P4Q8):
	mov    -0x44(%rdx),%r11
	mov    %r11,-0x44(%rcx)
L(P4Q7):
	mov    -0x3c(%rdx),%r10
	mov    %r10,-0x3c(%rcx)
L(P4Q6):
	mov    -0x34(%rdx),%r9
	mov    %r9,-0x34(%rcx)
L(P4Q5):
	mov    -0x2c(%rdx),%r11
	mov    %r11,-0x2c(%rcx)
L(P4Q4):
	mov    -0x24(%rdx),%r10
	mov    %r10,-0x24(%rcx)
L(P4Q3):
	mov    -0x1c(%rdx),%r9
	mov    %r9,-0x1c(%rcx)
L(P4Q2):
	mov    -0x14(%rdx),%r11
	mov    %r11,-0x14(%rcx)
L(P4Q1):
	mov    -0xc(%rdx),%r10
	mov    %r10,-0xc(%rcx)
L(P4Q0):
	mov    -0x4(%rdx),%r9d
	mov    %r9d,-0x4(%rcx)
	ret

	.balign 16
L(P5QF):
	mov    -0x7d(%rdx),%r9
	mov    %r9,-0x7d(%rcx)
L(P5QE):
	mov    -0x75(%rdx),%r11
	mov    %r11,-0x75(%rcx)
L(P5QD):
	mov    -0x6d(%rdx),%r10
	mov    %r10,-0x6d(%rcx)
L(P5QC):
	mov    -0x65(%rdx),%r9
	mov    %r9,-0x65(%rcx)
L(P5QB):
	mov    -0x5d(%rdx),%r11
	mov    %r11,-0x5d(%rcx)
L(P5QA):
	mov    -0x55(%rdx),%r10
	mov    %r10,-0x55(%rcx)
L(P5Q9):
	mov    -0x4d(%rdx),%r9
	mov    %r9,-0x4d(%rcx)
L(P5Q8):
	mov    -0x45(%rdx),%r11
	mov    %r11,-0x45(%rcx)
L(P5Q7):
	mov    -0x3d(%rdx),%r10
	mov    %r10,-0x3d(%rcx)
L(P5Q6):
	mov    -0x35(%rdx),%r9
	mov    %r9,-0x35(%rcx)
L(P5Q5):
	mov    -0x2d(%rdx),%r11
	mov    %r11,-0x2d(%rcx)
L(P5Q4):
	mov    -0x25(%rdx),%r10
	mov    %r10,-0x25(%rcx)
L(P5Q3):
	mov    -0x1d(%rdx),%r9
	mov    %r9,-0x1d(%rcx)
L(P5Q2):
	mov    -0x15(%rdx),%r11
	mov    %r11,-0x15(%rcx)
L(P5Q1):
	mov    -0xd(%rdx),%r10
	mov    %r10,-0xd(%rcx)
	/*
	 * These trailing entries must do all their loads first, then
	 * do the stores.
	 */
L(P5Q0):
	mov    -0x5(%rdx),%r9d
	movzbq -0x1(%rdx),%r10
	mov    %r9d,-0x5(%rcx)
	mov    %r10b,-0x1(%rcx)
	ret

	.balign 16
L(P6QF):
	mov    -0x7e(%rdx),%r9
	mov    %r9,-0x7e(%rcx)
L(P6QE):
	mov    -0x76(%rdx),%r11
	mov    %r11,-0x76(%rcx)
L(P6QD):
	mov    -0x6e(%rdx),%r10
	mov    %r10,-0x6e(%rcx)
L(P6QC):
	mov    -0x66(%rdx),%r9
	mov    %r9,-0x66(%rcx)
L(P6QB):
	mov    -0x5e(%rdx),%r11
	mov    %r11,-0x5e(%rcx)
L(P6QA):
	mov    -0x56(%rdx),%r10
	mov    %r10,-0x56(%rcx)
L(P6Q9):
	mov    -0x4e(%rdx),%r9
	mov    %r9,-0x4e(%rcx)
L(P6Q8):
	mov    -0x46(%rdx),%r11
	mov    %r11,-0x46(%rcx)
L(P6Q7):
	mov    -0x3e(%rdx),%r10
	mov    %r10,-0x3e(%rcx)
L(P6Q6):
	mov    -0x36(%rdx),%r9
	mov    %r9,-0x36(%rcx)
L(P6Q5):
	mov    -0x2e(%rdx),%r11
	mov    %r11,-0x2e(%rcx)
L(P6Q4):
	mov    -0x26(%rdx),%r10
	mov    %r10,-0x26(%rcx)
L(P6Q3):
	mov    -0x1e(%rdx),%r9
	mov    %r9,-0x1e(%rcx)
L(P6Q2):
	mov    -0x16(%rdx),%r11
	mov    %r11,-0x16(%rcx)
L(P6Q1):
	mov    -0xe(%rdx),%r10
	mov    %r10,-0xe(%rcx)
	/*
	 * These trailing entries must do all their loads first, then
	 * do the stores.
	 */
L(P6Q0):
	mov    -0x6(%rdx),%r9d
	movzwq -0x2(%rdx),%r10
	mov    %r9d,-0x6(%rcx)
	mov    %r10w,-0x2(%rcx)
	ret

	.balign 16
L(P7QF):
	mov    -0x7f(%rdx),%r9
	mov    %r9,-0x7f(%rcx)
L(P7QE):
	mov    -0x77(%rdx),%r11
	mov    %r11,-0x77(%rcx)
L(P7QD):
	mov    -0x6f(%rdx),%r10
	mov    %r10,-0x6f(%rcx)
L(P7QC):
	mov    -0x67(%rdx),%r9
	mov    %r9,-0x67(%rcx)
L(P7QB):
	mov    -0x5f(%rdx),%r11
	mov    %r11,-0x5f(%rcx)
L(P7QA):
	mov    -0x57(%rdx),%r10
	mov    %r10,-0x57(%rcx)
L(P7Q9):
	mov    -0x4f(%rdx),%r9
	mov    %r9,-0x4f(%rcx)
L(P7Q8):
	mov    -0x47(%rdx),%r11
	mov    %r11,-0x47(%rcx)
L(P7Q7):
	mov    -0x3f(%rdx),%r10
	mov    %r10,-0x3f(%rcx)
L(P7Q6):
	mov    -0x37(%rdx),%r9
	mov    %r9,-0x37(%rcx)
L(P7Q5):
	mov    -0x2f(%rdx),%r11
	mov    %r11,-0x2f(%rcx)
L(P7Q4):
	mov    -0x27(%rdx),%r10
	mov    %r10,-0x27(%rcx)
L(P7Q3):
	mov    -0x1f(%rdx),%r9
	mov    %r9,-0x1f(%rcx)
L(P7Q2):
	mov    -0x17(%rdx),%r11
	mov    %r11,-0x17(%rcx)
L(P7Q1):
	mov    -0xf(%rdx),%r10
	mov    %r10,-0xf(%rcx)
	/*
	 * These trailing entries must do all their loads first, then
	 * do the stores.
	 */
L(P7Q0):
	mov    -0x7(%rdx),%r9d
	movzwq -0x3(%rdx),%r10
	movzbq -0x1(%rdx),%r11
	mov    %r9d,-0x7(%rcx)
	mov    %r10w,-0x3(%rcx)
	mov    %r11b,-0x1(%rcx)
	ret

	.balign 16
L(ck_use_sse2):
	/*
	 * Align dest to 16 byte boundary.
	 */
	test   $0xf,%rcx
	jnz    L(ShrtAlignNew)

L(now_qw_aligned):
	cmpl   $NO_SSE,.memops_method(%rip)
	je     L(Loop8byte_pre)
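
	/*
	 * .memops_method and the cache sizes read below are expected to
	 * be initialized at startup from CPUID data (see proc64_id.h);
	 * NO_SSE selects the 8-byte integer path.
	 */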

	/*
	 * The fall-through path is to do SSE2 16-byte load/stores
	 */

	/*
	 * If current move size is larger than half of the highest level cache
	 * size, then do non-temporal moves.
	 */
	mov    .largest_level_cache_size(%rip),%r9d
	shr    %r9		# take half of it
	cmp    %r9,%r8
	jg     L(sse2_nt_move)

	/*
	 * If both the source and dest are aligned, then use the both aligned
	 * logic. Well aligned data should reap the rewards.
	 */
	test   $0xf,%rdx
	jz     L(pre_both_aligned)

	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
	testl  $USE_SSSE3,.memops_method(%rip)
	jz     1f
	lea    L(SSSE3_src)(%rip),%r10		# SSSE3

1:
	/*
	 * if the src is not 16 byte aligned...
	 */
	mov    %rdx,%r11
	and    $0xf,%r11
	movdqu (%rdx),%xmm0
	movdqa %xmm0,(%rcx)
	add    $0x10,%rdx
	sub    %r11,%rdx
	add    $0x10,%rcx
	sub    $0x10,%r8
	movdqa (%rdx),%xmm1

	movslq (%r10,%r11,4),%r9
	lea    (%r9,%r10,1),%r10
	jmpq   *%r10
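
	/*
	 * At this point the first 16 bytes have been copied with an
	 * unaligned load, %rdx has been advanced onto a 16-byte
	 * boundary that trails the logical source position by
	 * %r11 = src & 0xf bytes, and %xmm1 holds the aligned block at
	 * %rdx (whose first %r11 bytes were already copied).  Each
	 * movdqaN entry shifts those out and merges in the next
	 * aligned load, so all further loads are 16-byte aligned.
	 */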

	    .balign 16
L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
	    .int        L(mov3dqa1) -L(SSSE3_src)
	    .int        L(mov3dqa2) -L(SSSE3_src)
	    .int        L(mov3dqa3) -L(SSSE3_src)
	    .int        L(mov3dqa4) -L(SSSE3_src)
	    .int        L(mov3dqa5) -L(SSSE3_src)
	    .int        L(mov3dqa6) -L(SSSE3_src)
	    .int        L(mov3dqa7) -L(SSSE3_src)
	    .int        L(movdqa8)  -L(SSSE3_src)
	    .int        L(mov3dqa9) -L(SSSE3_src)
	    .int        L(mov3dqa10)-L(SSSE3_src)
	    .int        L(mov3dqa11)-L(SSSE3_src)
	    .int        L(mov3dqa12)-L(SSSE3_src)
	    .int        L(mov3dqa13)-L(SSSE3_src)
	    .int        L(mov3dqa14)-L(SSSE3_src)
	    .int        L(mov3dqa15)-L(SSSE3_src)
L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
	    .int        L(movdqa1) -L(SSE_src)
	    .int        L(movdqa2) -L(SSE_src)
	    .int        L(movdqa3) -L(SSE_src)
	    .int        L(movdqa4) -L(SSE_src)
	    .int        L(movdqa5) -L(SSE_src)
	    .int        L(movdqa6) -L(SSE_src)
	    .int        L(movdqa7) -L(SSE_src)
	    .int        L(movdqa8) -L(SSE_src)
	    .int        L(movdqa9) -L(SSE_src)
	    .int        L(movdqa10)-L(SSE_src)
	    .int        L(movdqa11)-L(SSE_src)
	    .int        L(movdqa12)-L(SSE_src)
	    .int        L(movdqa13)-L(SSE_src)
	    .int        L(movdqa14)-L(SSE_src)
	    .int        L(movdqa15)-L(SSE_src)
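
	/*
	 * Note both tables send an 8-byte misalignment to L(movdqa8):
	 * shufpd can do that merge on 64-bit halves in one step, so no
	 * byte-shift variant is needed for that case.
	 */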

	.balign 16
L(movdqa1):
	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
	lea    0x20(%rdx),%rdx
	lea    -0x20(%r8),%r8

	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
	por    %xmm1,%xmm3 # OR them together
	cmp    $0x20,%r8

	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
	por    %xmm2,%xmm0 # OR them together
	movdqa %xmm3,(%rcx)     # store it
	movdqa %xmm0,0x10(%rcx) # store it
	lea    0x20(%rcx),%rcx

	jge    L(movdqa1)
	jmp    L(movdqa_epi)
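
	/*
	 * The movdqaN loops all share this shape: two aligned 16-byte
	 * loads per iteration, the second carried into the next
	 * iteration in %xmm1, looping while at least 0x20 bytes remain
	 * and leaving the tail to L(movdqa_epi).
	 */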

	.balign 16
L(movdqa2):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x2,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xe,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x2,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xe,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa2)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa3):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x3,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xd,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x3,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xd,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa3)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa4):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x4,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xc,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x4,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xc,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa4)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa5):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x5,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xb,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x5,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xb,%xmm0
	por    %xmm2,%xmm0

	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa5)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa6):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x6,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0xa,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x6,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0xa,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa6)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa7):
	sub    $0x20,%r8
	movdqa 0x10(%rdx),%xmm3
	movdqa 0x20(%rdx),%xmm0
	add    $0x20,%rdx

	psrldq $0x7,%xmm1
	movdqa %xmm3,%xmm2
	pslldq $0x9,%xmm3
	por    %xmm1,%xmm3

	psrldq $0x7,%xmm2
	movdqa %xmm0,%xmm1
	pslldq $0x9,%xmm0
	por    %xmm2,%xmm0
	movdqa %xmm3,(%rcx)
	movdqa %xmm0,0x10(%rcx)

	add    $0x20,%rcx
	cmp    $0x20,%r8
	jge    L(movdqa7)
	jmp    L(movdqa_epi)

	.balign 16
L(movdqa8):
	movdqa 0x10(%rdx),%xmm3
	sub    $0x30,%r8
	movdqa 0x20(%rdx),%xmm0
	movdqa 0x30(%rdx),%xmm5
	lea    0x30(%rdx),%rdx

	shufpd $0x1,%xmm3,%xmm1
	movdqa %xmm1,(%rcx)

	cmp    $0x30,%r8

	shufpd $0x1,%xmm0,%xmm3
	movdqa %xmm3,0x10(%rcx)

	movdqa %xmm5,%xmm1
	shufpd $0x1,%xmm5,%xmm0
	movdqa %xmm0,0x20(%rcx)

	lea    0x30(%rcx),%rcx

	jge    L(movdqa8)
	jmp    L(movdqa_epi)
1213*5d9d9091SRichard Lowe
1214*5d9d9091SRichard Lowe	.balign 16
1215*5d9d9091SRichard LoweL(movdqa9):
1216*5d9d9091SRichard Lowe	sub    $0x20,%r8
1217*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1218*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1219*5d9d9091SRichard Lowe	add    $0x20,%rdx
1220*5d9d9091SRichard Lowe
1221*5d9d9091SRichard Lowe	psrldq $0x9,%xmm1
1222*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1223*5d9d9091SRichard Lowe	pslldq $0x7,%xmm3
1224*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1225*5d9d9091SRichard Lowe
1226*5d9d9091SRichard Lowe	psrldq $0x9,%xmm2
1227*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1228*5d9d9091SRichard Lowe	pslldq $0x7,%xmm0
1229*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1230*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1231*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1232*5d9d9091SRichard Lowe
1233*5d9d9091SRichard Lowe	add    $0x20,%rcx
1234*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1235*5d9d9091SRichard Lowe	jge    L(movdqa9)
1236*5d9d9091SRichard Lowe	jmp    L(movdqa_epi)
1237*5d9d9091SRichard Lowe
1238*5d9d9091SRichard Lowe	.balign 16
1239*5d9d9091SRichard LoweL(movdqa10):
1240*5d9d9091SRichard Lowe	sub    $0x20,%r8
1241*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1242*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1243*5d9d9091SRichard Lowe	add    $0x20,%rdx
1244*5d9d9091SRichard Lowe
1245*5d9d9091SRichard Lowe	psrldq $0xa,%xmm1
1246*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1247*5d9d9091SRichard Lowe	pslldq $0x6,%xmm3
1248*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1249*5d9d9091SRichard Lowe
1250*5d9d9091SRichard Lowe	psrldq $0xa,%xmm2
1251*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1252*5d9d9091SRichard Lowe	pslldq $0x6,%xmm0
1253*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1254*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1255*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1256*5d9d9091SRichard Lowe
1257*5d9d9091SRichard Lowe	add    $0x20,%rcx
1258*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1259*5d9d9091SRichard Lowe	jge    L(movdqa10)
1260*5d9d9091SRichard Lowe	jmp    L(movdqa_epi)
1261*5d9d9091SRichard Lowe
1262*5d9d9091SRichard Lowe	.balign 16
1263*5d9d9091SRichard LoweL(movdqa11):
1264*5d9d9091SRichard Lowe	sub    $0x20,%r8
1265*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1266*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1267*5d9d9091SRichard Lowe	add    $0x20,%rdx
1268*5d9d9091SRichard Lowe
1269*5d9d9091SRichard Lowe	psrldq $0xb,%xmm1
1270*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1271*5d9d9091SRichard Lowe	pslldq $0x5,%xmm3
1272*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1273*5d9d9091SRichard Lowe
1274*5d9d9091SRichard Lowe	psrldq $0xb,%xmm2
1275*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1276*5d9d9091SRichard Lowe	pslldq $0x5,%xmm0
1277*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1278*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1279*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1280*5d9d9091SRichard Lowe
1281*5d9d9091SRichard Lowe	add    $0x20,%rcx
1282*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1283*5d9d9091SRichard Lowe	jge    L(movdqa11)
1284*5d9d9091SRichard Lowe	jmp    L(movdqa_epi)
1285*5d9d9091SRichard Lowe
1286*5d9d9091SRichard Lowe	.balign 16
1287*5d9d9091SRichard LoweL(movdqa12):
1288*5d9d9091SRichard Lowe	sub    $0x20,%r8
1289*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1290*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1291*5d9d9091SRichard Lowe	add    $0x20,%rdx
1292*5d9d9091SRichard Lowe
1293*5d9d9091SRichard Lowe	psrldq $0xc,%xmm1
1294*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1295*5d9d9091SRichard Lowe	pslldq $0x4,%xmm3
1296*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1297*5d9d9091SRichard Lowe
1298*5d9d9091SRichard Lowe	psrldq $0xc,%xmm2
1299*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1300*5d9d9091SRichard Lowe	pslldq $0x4,%xmm0
1301*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1302*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1303*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1304*5d9d9091SRichard Lowe
1305*5d9d9091SRichard Lowe	add    $0x20,%rcx
1306*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1307*5d9d9091SRichard Lowe	jge    L(movdqa12)
1308*5d9d9091SRichard Lowe	jmp    L(movdqa_epi)
1309*5d9d9091SRichard Lowe
1310*5d9d9091SRichard Lowe	.balign 16
1311*5d9d9091SRichard LoweL(movdqa13):
1312*5d9d9091SRichard Lowe	sub    $0x20,%r8
1313*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1314*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1315*5d9d9091SRichard Lowe	add    $0x20,%rdx
1316*5d9d9091SRichard Lowe
1317*5d9d9091SRichard Lowe	psrldq $0xd,%xmm1
1318*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1319*5d9d9091SRichard Lowe	pslldq $0x3,%xmm3
1320*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1321*5d9d9091SRichard Lowe
1322*5d9d9091SRichard Lowe	psrldq $0xd,%xmm2
1323*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1324*5d9d9091SRichard Lowe	pslldq $0x3,%xmm0
1325*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1326*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1327*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1328*5d9d9091SRichard Lowe
1329*5d9d9091SRichard Lowe	add    $0x20,%rcx
1330*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1331*5d9d9091SRichard Lowe	jge    L(movdqa13)
1332*5d9d9091SRichard Lowe	jmp    L(movdqa_epi)
1333*5d9d9091SRichard Lowe
1334*5d9d9091SRichard Lowe	.balign 16
1335*5d9d9091SRichard LoweL(movdqa14):
1336*5d9d9091SRichard Lowe	sub    $0x20,%r8
1337*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1338*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1339*5d9d9091SRichard Lowe	add    $0x20,%rdx
1340*5d9d9091SRichard Lowe
1341*5d9d9091SRichard Lowe	psrldq $0xe,%xmm1
1342*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1343*5d9d9091SRichard Lowe	pslldq $0x2,%xmm3
1344*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1345*5d9d9091SRichard Lowe
1346*5d9d9091SRichard Lowe	psrldq $0xe,%xmm2
1347*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1348*5d9d9091SRichard Lowe	pslldq $0x2,%xmm0
1349*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1350*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1351*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1352*5d9d9091SRichard Lowe
1353*5d9d9091SRichard Lowe	add    $0x20,%rcx
1354*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1355*5d9d9091SRichard Lowe	jge    L(movdqa14)
1356*5d9d9091SRichard Lowe	jmp    L(movdqa_epi)
1357*5d9d9091SRichard Lowe
1358*5d9d9091SRichard Lowe	.balign 16
1359*5d9d9091SRichard LoweL(movdqa15):
1360*5d9d9091SRichard Lowe	sub    $0x20,%r8
1361*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm3
1362*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm0
1363*5d9d9091SRichard Lowe	add    $0x20,%rdx
1364*5d9d9091SRichard Lowe
1365*5d9d9091SRichard Lowe	psrldq $0xf,%xmm1
1366*5d9d9091SRichard Lowe	movdqa %xmm3,%xmm2
1367*5d9d9091SRichard Lowe	pslldq $0x1,%xmm3
1368*5d9d9091SRichard Lowe	por    %xmm1,%xmm3
1369*5d9d9091SRichard Lowe
1370*5d9d9091SRichard Lowe	psrldq $0xf,%xmm2
1371*5d9d9091SRichard Lowe	movdqa %xmm0,%xmm1
1372*5d9d9091SRichard Lowe	pslldq $0x1,%xmm0
1373*5d9d9091SRichard Lowe	por    %xmm2,%xmm0
1374*5d9d9091SRichard Lowe	movdqa %xmm3,(%rcx)
1375*5d9d9091SRichard Lowe	movdqa %xmm0,0x10(%rcx)
1376*5d9d9091SRichard Lowe
1377*5d9d9091SRichard Lowe	add    $0x20,%rcx
1378*5d9d9091SRichard Lowe	cmp    $0x20,%r8
1379*5d9d9091SRichard Lowe	jge    L(movdqa15)
1380*5d9d9091SRichard Lowe	#jmp   L(movdqa_epi)
1381*5d9d9091SRichard Lowe
1382*5d9d9091SRichard Lowe	.balign 16
1383*5d9d9091SRichard LoweL(movdqa_epi):
1384*5d9d9091SRichard Lowe	lea    L(fwdPxQx)(%rip),%r10
1385*5d9d9091SRichard Lowe	add    %r11,%rdx # advance rdx to the correct address (it lagged behind in the loop above)
1386*5d9d9091SRichard Lowe	add    %r8,%rcx
1387*5d9d9091SRichard Lowe	add    %r8,%rdx
1388*5d9d9091SRichard Lowe
1389*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
1390*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
1391*5d9d9091SRichard Lowe	jmpq   *%r10
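/*
 * L(movdqa_epi) finishes the last 0..15 bytes: %rcx/%rdx are advanced
 * one past the end, and each L(fwdPxQx) entry is the 32-bit offset of
 * a tail-copy routine relative to the table itself, so movslq + lea +
 * jmpq forms a position-independent computed jump.  A minimal C sketch
 * of the dispatch idea, using GCC's computed-goto extension and
 * simplified to positive indexing (names are illustrative):
 *
 *	static void
 *	tail_copy(char *d, const char *s, long n)	// n in 0..3
 *	{
 *		static const void *tbl[] = { &&t0, &&t1, &&t2, &&t3 };
 *
 *		goto *tbl[n];
 *	t3:	d[2] = s[2];
 *	t2:	d[1] = s[1];
 *	t1:	d[0] = s[0];
 *	t0:	return;
 *	}
 */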
1392*5d9d9091SRichard Lowe
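/*
 * The L(mov3dqaN) loops below are the SSSE3 variant of the shifted
 * copy: palignr concatenates two 16-byte chunks and extracts the 16
 * bytes starting N bytes in, replacing the psrldq/pslldq/por triple
 * with a single instruction, and each iteration moves 0x30 bytes
 * instead of 0x20.  The instruction is hand-assembled as .byte
 * sequences (the commented-out palignr mnemonics show the intent),
 * presumably because the assembler in use predated SSSE3 support.
 * A minimal C sketch of one merge step, assuming SSSE3 intrinsics
 * and N = 1 (illustrative only):
 *
 *	#include <tmmintrin.h>
 *
 *	static __m128i
 *	merge1(__m128i prev, __m128i next)
 *	{
 *		// bytes 1..15 of prev followed by byte 0 of next
 *		return (_mm_alignr_epi8(next, prev, 1));
 *	}
 */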
1393*5d9d9091SRichard Lowe	.balign 16
1394*5d9d9091SRichard LoweL(mov3dqa1):
1395*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
1396*5d9d9091SRichard Lowe	sub	$0x30,%r8
1397*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
1398*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
1399*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1400*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1401*5d9d9091SRichard Lowe
1402*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
1403*5d9d9091SRichard Lowe	#palignr	$0x1,%xmm1,%xmm3
1404*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1405*5d9d9091SRichard Lowe	.byte	0xd9,0x01
1406*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      # store it
1407*5d9d9091SRichard Lowe
1408*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
1409*5d9d9091SRichard Lowe	#palignr	$0x1,%xmm2,%xmm0
1410*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1411*5d9d9091SRichard Lowe	.byte	0xc2,0x01
1412*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)  # store it
1413*5d9d9091SRichard Lowe
1414*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
1415*5d9d9091SRichard Lowe	#palignr	$0x1,%xmm4,%xmm5
1416*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1417*5d9d9091SRichard Lowe	.byte	0xec,0x01
1418*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)  # store it
1419*5d9d9091SRichard Lowe
1420*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1421*5d9d9091SRichard Lowe	jge	L(mov3dqa1)
1422*5d9d9091SRichard Lowe
1423*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1424*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1425*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1426*5d9d9091SRichard Lowe	sub	$0x10,%r8
1427*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1428*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1429*5d9d9091SRichard Lowe	#palignr	$0x1,%xmm1,%xmm3
1430*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1431*5d9d9091SRichard Lowe	.byte	0xd9,0x01
1432*5d9d9091SRichard Lowe
1433*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1434*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1435*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1436*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1437*5d9d9091SRichard Lowe
1438*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1439*5d9d9091SRichard Lowe	sub	$0x10,%r8
1440*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1441*5d9d9091SRichard Lowe	#palignr	$0x1,%xmm2,%xmm0
1442*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1443*5d9d9091SRichard Lowe	.byte	0xc2,0x01
1444*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1445*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1446*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1447*5d9d9091SRichard Lowe
1448*5d9d9091SRichard Lowe	.balign 16
1449*5d9d9091SRichard LoweL(mov3dqa2):
1450*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1451*5d9d9091SRichard Lowe	sub	$0x30,%r8
1452*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1453*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1454*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1455*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1456*5d9d9091SRichard Lowe
1457*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1458*5d9d9091SRichard Lowe	#palignr	$0x2,%xmm1,%xmm3
1459*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1460*5d9d9091SRichard Lowe	.byte	0xd9,0x02
1461*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1462*5d9d9091SRichard Lowe
1463*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1464*5d9d9091SRichard Lowe	#palignr	$0x2,%xmm2,%xmm0
1465*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1466*5d9d9091SRichard Lowe	.byte	0xc2,0x02
1467*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1468*5d9d9091SRichard Lowe
1469*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1470*5d9d9091SRichard Lowe	#palignr	$0x2,%xmm4,%xmm5
1471*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1472*5d9d9091SRichard Lowe	.byte	0xec,0x02
1473*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1474*5d9d9091SRichard Lowe
1475*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1476*5d9d9091SRichard Lowe	jge	L(mov3dqa2)
1477*5d9d9091SRichard Lowe
1478*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1479*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1480*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1481*5d9d9091SRichard Lowe	sub	$0x10,%r8
1482*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1483*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1484*5d9d9091SRichard Lowe	#palignr	$0x2,%xmm1,%xmm3
1485*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1486*5d9d9091SRichard Lowe	.byte	0xd9,0x02
1487*5d9d9091SRichard Lowe
1488*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1489*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1490*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1491*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1492*5d9d9091SRichard Lowe
1493*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1494*5d9d9091SRichard Lowe	sub	$0x10,%r8
1495*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1496*5d9d9091SRichard Lowe	#palignr	$0x2,%xmm2,%xmm0
1497*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1498*5d9d9091SRichard Lowe	.byte	0xc2,0x02
1499*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1500*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1501*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1502*5d9d9091SRichard Lowe
1503*5d9d9091SRichard Lowe	.balign 16
1504*5d9d9091SRichard LoweL(mov3dqa3):
1505*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1506*5d9d9091SRichard Lowe	sub	$0x30,%r8
1507*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1508*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1509*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1510*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1511*5d9d9091SRichard Lowe
1512*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1513*5d9d9091SRichard Lowe	#palignr	$0x3,%xmm1,%xmm3
1514*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1515*5d9d9091SRichard Lowe	.byte	0xd9,0x03
1516*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1517*5d9d9091SRichard Lowe
1518*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1519*5d9d9091SRichard Lowe	#palignr	$0x3,%xmm2,%xmm0
1520*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1521*5d9d9091SRichard Lowe	.byte	0xc2,0x03
1522*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1523*5d9d9091SRichard Lowe
1524*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1525*5d9d9091SRichard Lowe	#palignr	$0x3,%xmm4,%xmm5
1526*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1527*5d9d9091SRichard Lowe	.byte	0xec,0x03
1528*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1529*5d9d9091SRichard Lowe
1530*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1531*5d9d9091SRichard Lowe	jge	L(mov3dqa3)
1532*5d9d9091SRichard Lowe
1533*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1534*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1535*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1536*5d9d9091SRichard Lowe	sub	$0x10,%r8
1537*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1538*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1539*5d9d9091SRichard Lowe	#palignr	$0x3,%xmm1,%xmm3
1540*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1541*5d9d9091SRichard Lowe	.byte	0xd9,0x03
1542*5d9d9091SRichard Lowe
1543*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1544*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1545*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1546*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1547*5d9d9091SRichard Lowe
1548*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1549*5d9d9091SRichard Lowe	sub	$0x10,%r8
1550*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1551*5d9d9091SRichard Lowe	#palignr	$0x3,%xmm2,%xmm0
1552*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1553*5d9d9091SRichard Lowe	.byte	0xc2,0x03
1554*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1555*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1556*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1557*5d9d9091SRichard Lowe
1558*5d9d9091SRichard Lowe	.balign 16
1559*5d9d9091SRichard LoweL(mov3dqa4):
1560*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1561*5d9d9091SRichard Lowe	sub	$0x30,%r8
1562*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1563*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1564*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1565*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1566*5d9d9091SRichard Lowe
1567*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1568*5d9d9091SRichard Lowe	#palignr	$0x4,%xmm1,%xmm3
1569*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1570*5d9d9091SRichard Lowe	.byte	0xd9,0x04
1571*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1572*5d9d9091SRichard Lowe
1573*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1574*5d9d9091SRichard Lowe	#palignr	$0x4,%xmm2,%xmm0
1575*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1576*5d9d9091SRichard Lowe	.byte	0xc2,0x04
1577*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1578*5d9d9091SRichard Lowe
1579*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1580*5d9d9091SRichard Lowe	#palignr	$0x4,%xmm4,%xmm5
1581*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1582*5d9d9091SRichard Lowe	.byte	0xec,0x04
1583*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1584*5d9d9091SRichard Lowe
1585*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1586*5d9d9091SRichard Lowe	jge	L(mov3dqa4)
1587*5d9d9091SRichard Lowe
1588*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1589*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1590*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1591*5d9d9091SRichard Lowe	sub	$0x10,%r8
1592*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1593*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1594*5d9d9091SRichard Lowe	#palignr	$0x4,%xmm1,%xmm3
1595*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1596*5d9d9091SRichard Lowe	.byte	0xd9,0x04
1597*5d9d9091SRichard Lowe
1598*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1599*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1600*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1601*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1602*5d9d9091SRichard Lowe
1603*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1604*5d9d9091SRichard Lowe	sub	$0x10,%r8
1605*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1606*5d9d9091SRichard Lowe	#palignr	$0x4,%xmm2,%xmm0
1607*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1608*5d9d9091SRichard Lowe	.byte	0xc2,0x04
1609*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1610*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1611*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1612*5d9d9091SRichard Lowe
1613*5d9d9091SRichard Lowe	.balign 16
1614*5d9d9091SRichard LoweL(mov3dqa5):
1615*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1616*5d9d9091SRichard Lowe	sub	$0x30,%r8
1617*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1618*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1619*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1620*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1621*5d9d9091SRichard Lowe
1622*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1623*5d9d9091SRichard Lowe	#palignr	$0x5,%xmm1,%xmm3
1624*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1625*5d9d9091SRichard Lowe	.byte	0xd9,0x05
1626*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1627*5d9d9091SRichard Lowe
1628*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1629*5d9d9091SRichard Lowe	#palignr	$0x5,%xmm2,%xmm0
1630*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1631*5d9d9091SRichard Lowe	.byte	0xc2,0x05
1632*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1633*5d9d9091SRichard Lowe
1634*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1635*5d9d9091SRichard Lowe	#palignr	$0x5,%xmm4,%xmm5
1636*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1637*5d9d9091SRichard Lowe	.byte	0xec,0x05
1638*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1639*5d9d9091SRichard Lowe
1640*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1641*5d9d9091SRichard Lowe	jge	L(mov3dqa5)
1642*5d9d9091SRichard Lowe
1643*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1644*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1645*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1646*5d9d9091SRichard Lowe	sub	$0x10,%r8
1647*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1648*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1649*5d9d9091SRichard Lowe	#palignr	$0x5,%xmm1,%xmm3
1650*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1651*5d9d9091SRichard Lowe	.byte	0xd9,0x05
1652*5d9d9091SRichard Lowe
1653*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1654*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1655*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1656*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1657*5d9d9091SRichard Lowe
1658*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1659*5d9d9091SRichard Lowe	sub	$0x10,%r8
1660*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1661*5d9d9091SRichard Lowe	#palignr	$0x5,%xmm2,%xmm0
1662*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1663*5d9d9091SRichard Lowe	.byte	0xc2,0x05
1664*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1665*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1666*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1667*5d9d9091SRichard Lowe
1668*5d9d9091SRichard Lowe	.balign 16
1669*5d9d9091SRichard LoweL(mov3dqa6):
1670*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1671*5d9d9091SRichard Lowe	sub	$0x30,%r8
1672*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1673*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1674*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1675*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1676*5d9d9091SRichard Lowe
1677*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1678*5d9d9091SRichard Lowe	#palignr	$0x6,%xmm1,%xmm3
1679*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1680*5d9d9091SRichard Lowe	.byte	0xd9,0x06
1681*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1682*5d9d9091SRichard Lowe
1683*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1684*5d9d9091SRichard Lowe	#palignr	$0x6,%xmm2,%xmm0
1685*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1686*5d9d9091SRichard Lowe	.byte	0xc2,0x06
1687*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1688*5d9d9091SRichard Lowe
1689*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1690*5d9d9091SRichard Lowe	#palignr	$0x6,%xmm4,%xmm5
1691*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1692*5d9d9091SRichard Lowe	.byte	0xec,0x06
1693*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1694*5d9d9091SRichard Lowe
1695*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1696*5d9d9091SRichard Lowe	jge	L(mov3dqa6)
1697*5d9d9091SRichard Lowe
1698*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1699*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1700*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1701*5d9d9091SRichard Lowe	sub	$0x10,%r8
1702*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1703*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1704*5d9d9091SRichard Lowe	#palignr	$0x6,%xmm1,%xmm3
1705*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1706*5d9d9091SRichard Lowe	.byte	0xd9,0x06
1707*5d9d9091SRichard Lowe
1708*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1709*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1710*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1711*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1712*5d9d9091SRichard Lowe
1713*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1714*5d9d9091SRichard Lowe	sub	$0x10,%r8
1715*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1716*5d9d9091SRichard Lowe	#palignr	$0x6,%xmm2,%xmm0
1717*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1718*5d9d9091SRichard Lowe	.byte	0xc2,0x06
1719*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1720*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1721*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1722*5d9d9091SRichard Lowe
1723*5d9d9091SRichard Lowe	.balign 16
1724*5d9d9091SRichard LoweL(mov3dqa7):
1725*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1726*5d9d9091SRichard Lowe	sub	$0x30,%r8
1727*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1728*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1729*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1730*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1731*5d9d9091SRichard Lowe
1732*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1733*5d9d9091SRichard Lowe	#palignr	$0x7,%xmm1,%xmm3
1734*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1735*5d9d9091SRichard Lowe	.byte	0xd9,0x07
1736*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1737*5d9d9091SRichard Lowe
1738*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1739*5d9d9091SRichard Lowe	#palignr	$0x7,%xmm2,%xmm0
1740*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1741*5d9d9091SRichard Lowe	.byte	0xc2,0x07
1742*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1743*5d9d9091SRichard Lowe
1744*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1745*5d9d9091SRichard Lowe	#palignr	$0x7,%xmm4,%xmm5
1746*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1747*5d9d9091SRichard Lowe	.byte	0xec,0x07
1748*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1749*5d9d9091SRichard Lowe
1750*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1751*5d9d9091SRichard Lowe	jge	L(mov3dqa7)
1752*5d9d9091SRichard Lowe
1753*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1754*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1755*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1756*5d9d9091SRichard Lowe	sub	$0x10,%r8
1757*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1758*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1759*5d9d9091SRichard Lowe	#palignr	$0x7,%xmm1,%xmm3
1760*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1761*5d9d9091SRichard Lowe	.byte	0xd9,0x07
1762*5d9d9091SRichard Lowe
1763*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1764*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1765*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1766*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1767*5d9d9091SRichard Lowe
1768*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1769*5d9d9091SRichard Lowe	sub	$0x10,%r8
1770*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1771*5d9d9091SRichard Lowe	#palignr	$0x7,%xmm2,%xmm0
1772*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1773*5d9d9091SRichard Lowe	.byte	0xc2,0x07
1774*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1775*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1776*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1777*5d9d9091SRichard Lowe
1778*5d9d9091SRichard Lowe	.balign 16
1779*5d9d9091SRichard LoweL(mov3dqa9):
1780*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1781*5d9d9091SRichard Lowe	sub	$0x30,%r8
1782*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1783*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1784*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1785*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1786*5d9d9091SRichard Lowe
1787*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1788*5d9d9091SRichard Lowe	#palignr	$0x9,%xmm1,%xmm3
1789*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1790*5d9d9091SRichard Lowe	.byte	0xd9,0x09
1791*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1792*5d9d9091SRichard Lowe
1793*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1794*5d9d9091SRichard Lowe	#palignr	$0x9,%xmm2,%xmm0
1795*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1796*5d9d9091SRichard Lowe	.byte	0xc2,0x09
1797*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1798*5d9d9091SRichard Lowe
1799*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1800*5d9d9091SRichard Lowe	#palignr	$0x9,%xmm4,%xmm5
1801*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1802*5d9d9091SRichard Lowe	.byte	0xec,0x09
1803*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1804*5d9d9091SRichard Lowe
1805*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1806*5d9d9091SRichard Lowe	jge	L(mov3dqa9)
1807*5d9d9091SRichard Lowe
1808*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1809*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1810*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1811*5d9d9091SRichard Lowe	sub	$0x10,%r8
1812*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1813*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1814*5d9d9091SRichard Lowe	#palignr	$0x9,%xmm1,%xmm3
1815*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1816*5d9d9091SRichard Lowe	.byte	0xd9,0x09
1817*5d9d9091SRichard Lowe
1818*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1819*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1820*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1821*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1822*5d9d9091SRichard Lowe
1823*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1824*5d9d9091SRichard Lowe	sub	$0x10,%r8
1825*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1826*5d9d9091SRichard Lowe	#palignr	$0x9,%xmm2,%xmm0
1827*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1828*5d9d9091SRichard Lowe	.byte	0xc2,0x09
1829*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1830*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1831*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1832*5d9d9091SRichard Lowe
1833*5d9d9091SRichard Lowe	.balign 16
1834*5d9d9091SRichard LoweL(mov3dqa10):
1835*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1836*5d9d9091SRichard Lowe	sub	$0x30,%r8
1837*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1838*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1839*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1840*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1841*5d9d9091SRichard Lowe
1842*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1843*5d9d9091SRichard Lowe	#palignr	$0xa,%xmm1,%xmm3
1844*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1845*5d9d9091SRichard Lowe	.byte	0xd9,0x0a
1846*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1847*5d9d9091SRichard Lowe
1848*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1849*5d9d9091SRichard Lowe	#palignr	$0xa,%xmm2,%xmm0
1850*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1851*5d9d9091SRichard Lowe	.byte	0xc2,0x0a
1852*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1853*5d9d9091SRichard Lowe
1854*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1855*5d9d9091SRichard Lowe	#palignr	$0xa,%xmm4,%xmm5
1856*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1857*5d9d9091SRichard Lowe	.byte	0xec,0x0a
1858*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1859*5d9d9091SRichard Lowe
1860*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1861*5d9d9091SRichard Lowe	jge	L(mov3dqa10)
1862*5d9d9091SRichard Lowe
1863*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1864*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1865*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1866*5d9d9091SRichard Lowe	sub	$0x10,%r8
1867*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1868*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1869*5d9d9091SRichard Lowe	#palignr	$0xa,%xmm1,%xmm3
1870*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1871*5d9d9091SRichard Lowe	.byte	0xd9,0x0a
1872*5d9d9091SRichard Lowe
1873*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1874*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1875*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1876*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1877*5d9d9091SRichard Lowe
1878*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1879*5d9d9091SRichard Lowe	sub	$0x10,%r8
1880*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1881*5d9d9091SRichard Lowe	#palignr	$0xa,%xmm2,%xmm0
1882*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1883*5d9d9091SRichard Lowe	.byte	0xc2,0x0a
1884*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1885*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1886*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1887*5d9d9091SRichard Lowe
1888*5d9d9091SRichard Lowe	.balign 16
1889*5d9d9091SRichard LoweL(mov3dqa11):
1890*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1891*5d9d9091SRichard Lowe	sub	$0x30,%r8
1892*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1893*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1894*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1895*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1896*5d9d9091SRichard Lowe
1897*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1898*5d9d9091SRichard Lowe	#palignr	$0xb,%xmm1,%xmm3
1899*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1900*5d9d9091SRichard Lowe	.byte	0xd9,0x0b
1901*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1902*5d9d9091SRichard Lowe
1903*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1904*5d9d9091SRichard Lowe	#palignr	$0xb,%xmm2,%xmm0
1905*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1906*5d9d9091SRichard Lowe	.byte	0xc2,0x0b
1907*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1908*5d9d9091SRichard Lowe
1909*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1910*5d9d9091SRichard Lowe	#palignr	$0xb,%xmm4,%xmm5
1911*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1912*5d9d9091SRichard Lowe	.byte	0xec,0x0b
1913*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1914*5d9d9091SRichard Lowe
1915*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1916*5d9d9091SRichard Lowe	jge	L(mov3dqa11)
1917*5d9d9091SRichard Lowe
1918*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1919*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1920*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1921*5d9d9091SRichard Lowe	sub	$0x10,%r8
1922*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1923*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1924*5d9d9091SRichard Lowe	#palignr	$0xb,%xmm1,%xmm3
1925*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1926*5d9d9091SRichard Lowe	.byte	0xd9,0x0b
1927*5d9d9091SRichard Lowe
1928*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1929*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1930*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1931*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1932*5d9d9091SRichard Lowe
1933*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1934*5d9d9091SRichard Lowe	sub	$0x10,%r8
1935*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1936*5d9d9091SRichard Lowe	#palignr	$0xb,%xmm2,%xmm0
1937*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1938*5d9d9091SRichard Lowe	.byte	0xc2,0x0b
1939*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1940*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1941*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1942*5d9d9091SRichard Lowe
1943*5d9d9091SRichard Lowe	.balign 16
1944*5d9d9091SRichard LoweL(mov3dqa12):
1945*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
1946*5d9d9091SRichard Lowe	sub	$0x30,%r8
1947*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
1948*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
1949*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
1950*5d9d9091SRichard Lowe	cmp	$0x30,%r8
1951*5d9d9091SRichard Lowe
1952*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
1953*5d9d9091SRichard Lowe	#palignr	$0xc,%xmm1,%xmm3
1954*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1955*5d9d9091SRichard Lowe	.byte	0xd9,0x0c
1956*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
1957*5d9d9091SRichard Lowe
1958*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
1959*5d9d9091SRichard Lowe	#palignr	$0xc,%xmm2,%xmm0
1960*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1961*5d9d9091SRichard Lowe	.byte	0xc2,0x0c
1962*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
1963*5d9d9091SRichard Lowe
1964*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
1965*5d9d9091SRichard Lowe	#palignr	$0xc,%xmm4,%xmm5
1966*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1967*5d9d9091SRichard Lowe	.byte	0xec,0x0c
1968*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
1969*5d9d9091SRichard Lowe
1970*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
1971*5d9d9091SRichard Lowe	jge	L(mov3dqa12)
1972*5d9d9091SRichard Lowe
1973*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1974*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1975*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1976*5d9d9091SRichard Lowe	sub	$0x10,%r8
1977*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1978*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
1979*5d9d9091SRichard Lowe	#palignr	$0xc,%xmm1,%xmm3
1980*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1981*5d9d9091SRichard Lowe	.byte	0xd9,0x0c
1982*5d9d9091SRichard Lowe
1983*5d9d9091SRichard Lowe	cmp	$0x10,%r8
1984*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
1985*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1986*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
1987*5d9d9091SRichard Lowe
1988*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1989*5d9d9091SRichard Lowe	sub	$0x10,%r8
1990*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
1991*5d9d9091SRichard Lowe	#palignr	$0xc,%xmm2,%xmm0
1992*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
1993*5d9d9091SRichard Lowe	.byte	0xc2,0x0c
1994*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
1995*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
1996*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
1997*5d9d9091SRichard Lowe
1998*5d9d9091SRichard Lowe	.balign 16
1999*5d9d9091SRichard LoweL(mov3dqa13):
2000*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
2001*5d9d9091SRichard Lowe	sub	$0x30,%r8
2002*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
2003*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
2004*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
2005*5d9d9091SRichard Lowe	cmp	$0x30,%r8
2006*5d9d9091SRichard Lowe
2007*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
2008*5d9d9091SRichard Lowe	#palignr	$0xd,%xmm1,%xmm3
2009*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2010*5d9d9091SRichard Lowe	.byte	0xd9,0x0d
2011*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
2012*5d9d9091SRichard Lowe
2013*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
2014*5d9d9091SRichard Lowe	#palignr	$0xd,%xmm2,%xmm0
2015*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2016*5d9d9091SRichard Lowe	.byte	0xc2,0x0d
2017*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
2018*5d9d9091SRichard Lowe
2019*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
2020*5d9d9091SRichard Lowe	#palignr	$0xd,%xmm4,%xmm5
2021*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2022*5d9d9091SRichard Lowe	.byte	0xec,0x0d
2023*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
2024*5d9d9091SRichard Lowe
2025*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
2026*5d9d9091SRichard Lowe	jge	L(mov3dqa13)
2027*5d9d9091SRichard Lowe
2028*5d9d9091SRichard Lowe	cmp	$0x10,%r8
2029*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
2030*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2031*5d9d9091SRichard Lowe	sub	$0x10,%r8
2032*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
2033*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
2034*5d9d9091SRichard Lowe	#palignr	$0xd,%xmm1,%xmm3
2035*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2036*5d9d9091SRichard Lowe	.byte	0xd9,0x0d
2037*5d9d9091SRichard Lowe
2038*5d9d9091SRichard Lowe	cmp	$0x10,%r8
2039*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
2040*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
2041*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
2042*5d9d9091SRichard Lowe
2043*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2044*5d9d9091SRichard Lowe	sub	$0x10,%r8
2045*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
2046*5d9d9091SRichard Lowe	#palignr	$0xd,%xmm2,%xmm0
2047*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2048*5d9d9091SRichard Lowe	.byte	0xc2,0x0d
2049*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
2050*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
2051*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
2052*5d9d9091SRichard Lowe
2053*5d9d9091SRichard Lowe	.balign 16
2054*5d9d9091SRichard LoweL(mov3dqa14):
2055*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
2056*5d9d9091SRichard Lowe	sub	$0x30,%r8
2057*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
2058*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
2059*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
2060*5d9d9091SRichard Lowe	cmp	$0x30,%r8
2061*5d9d9091SRichard Lowe
2062*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
2063*5d9d9091SRichard Lowe	#palignr	$0xe,%xmm1,%xmm3
2064*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2065*5d9d9091SRichard Lowe	.byte	0xd9,0x0e
2066*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
2067*5d9d9091SRichard Lowe
2068*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
2069*5d9d9091SRichard Lowe	#palignr	$0xe,%xmm2,%xmm0
2070*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2071*5d9d9091SRichard Lowe	.byte	0xc2,0x0e
2072*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
2073*5d9d9091SRichard Lowe
2074*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
2075*5d9d9091SRichard Lowe	#palignr	$0xe,%xmm4,%xmm5
2076*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2077*5d9d9091SRichard Lowe	.byte	0xec,0x0e
2078*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
2079*5d9d9091SRichard Lowe
2080*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
2081*5d9d9091SRichard Lowe	jge	L(mov3dqa14)
2082*5d9d9091SRichard Lowe
2083*5d9d9091SRichard Lowe	cmp	$0x10,%r8
2084*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
2085*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2086*5d9d9091SRichard Lowe	sub	$0x10,%r8
2087*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
2088*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
2089*5d9d9091SRichard Lowe	#palignr	$0xe,%xmm1,%xmm3
2090*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2091*5d9d9091SRichard Lowe	.byte	0xd9,0x0e
2092*5d9d9091SRichard Lowe
2093*5d9d9091SRichard Lowe	cmp	$0x10,%r8
2094*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
2095*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
2096*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
2097*5d9d9091SRichard Lowe
2098*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2099*5d9d9091SRichard Lowe	sub	$0x10,%r8
2100*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
2101*5d9d9091SRichard Lowe	#palignr	$0xe,%xmm2,%xmm0
2102*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2103*5d9d9091SRichard Lowe	.byte	0xc2,0x0e
2104*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
2105*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
2106*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
2107*5d9d9091SRichard Lowe
2108*5d9d9091SRichard Lowe	.balign 16
2109*5d9d9091SRichard LoweL(mov3dqa15):
2110*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3
2111*5d9d9091SRichard Lowe	sub	$0x30,%r8
2112*5d9d9091SRichard Lowe	movdqa	0x20(%rdx),%xmm0
2113*5d9d9091SRichard Lowe	movdqa	0x30(%rdx),%xmm5
2114*5d9d9091SRichard Lowe	lea	0x30(%rdx),%rdx
2115*5d9d9091SRichard Lowe	cmp	$0x30,%r8
2116*5d9d9091SRichard Lowe
2117*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2
2118*5d9d9091SRichard Lowe	#palignr	$0xf,%xmm1,%xmm3
2119*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2120*5d9d9091SRichard Lowe	.byte	0xd9,0x0f
2121*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)
2122*5d9d9091SRichard Lowe
2123*5d9d9091SRichard Lowe	movdqa	%xmm0,%xmm4
2124*5d9d9091SRichard Lowe	#palignr	$0xf,%xmm2,%xmm0
2125*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2126*5d9d9091SRichard Lowe	.byte	0xc2,0x0f
2127*5d9d9091SRichard Lowe	movdqa	%xmm0,0x10(%rcx)
2128*5d9d9091SRichard Lowe
2129*5d9d9091SRichard Lowe	movdqa	%xmm5,%xmm1
2130*5d9d9091SRichard Lowe	#palignr	$0xf,%xmm4,%xmm5
2131*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2132*5d9d9091SRichard Lowe	.byte	0xec,0x0f
2133*5d9d9091SRichard Lowe	movdqa	%xmm5,0x20(%rcx)
2134*5d9d9091SRichard Lowe
2135*5d9d9091SRichard Lowe	lea	0x30(%rcx),%rcx
2136*5d9d9091SRichard Lowe	jge	L(mov3dqa15)
2137*5d9d9091SRichard Lowe
2138*5d9d9091SRichard Lowe	cmp	$0x10,%r8
2139*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
2140*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2141*5d9d9091SRichard Lowe	sub	$0x10,%r8
2142*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
2143*5d9d9091SRichard Lowe	movdqa	%xmm3,%xmm2		# save for use next concat
2144*5d9d9091SRichard Lowe	#palignr	$0xf,%xmm1,%xmm3
2145*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2146*5d9d9091SRichard Lowe	.byte	0xd9,0x0f
2147*5d9d9091SRichard Lowe
2148*5d9d9091SRichard Lowe	cmp	$0x10,%r8
2149*5d9d9091SRichard Lowe	movdqa	%xmm3,(%rcx)      	# store it
2150*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
2151*5d9d9091SRichard Lowe	jl	L(movdqa_epi)
2152*5d9d9091SRichard Lowe
2153*5d9d9091SRichard Lowe	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2154*5d9d9091SRichard Lowe	sub	$0x10,%r8
2155*5d9d9091SRichard Lowe	lea	0x10(%rdx),%rdx
2156*5d9d9091SRichard Lowe	#palignr	$0xf,%xmm2,%xmm0
2157*5d9d9091SRichard Lowe	.byte	0x66,0x0f,0x3a,0x0f
2158*5d9d9091SRichard Lowe	.byte	0xc2,0x0f
2159*5d9d9091SRichard Lowe	movdqa	%xmm0,(%rcx)      	# store it
2160*5d9d9091SRichard Lowe	lea	0x10(%rcx),%rcx
2161*5d9d9091SRichard Lowe	jmp	L(movdqa_epi)
2162*5d9d9091SRichard Lowe
2163*5d9d9091SRichard Lowe	.balign 16
2164*5d9d9091SRichard LoweL(sse2_nt_move):
2165*5d9d9091SRichard Lowe	lea	0x40(%rcx),%rcx
2166*5d9d9091SRichard Lowe	lea	0x40(%rdx),%rdx
2167*5d9d9091SRichard Lowe	lea	-0x40(%r8),%r8
2168*5d9d9091SRichard Lowe
2169*5d9d9091SRichard Lowe	/*
2170*5d9d9091SRichard Lowe	 * Source alignment doesn't matter for data that is out of cache:
2171*5d9d9091SRichard Lowe	 * the misaligned penalty is masked by the slowness of main memory.
2172*5d9d9091SRichard Lowe	 */
2173*5d9d9091SRichard Lowe	prefetchnta 0x180(%rdx)
2174*5d9d9091SRichard Lowe	movdqu	-0x40(%rdx),%xmm0
2175*5d9d9091SRichard Lowe	movdqu	-0x30(%rdx),%xmm1
2176*5d9d9091SRichard Lowe
2177*5d9d9091SRichard Lowe	cmp	$0x40,%r8
2178*5d9d9091SRichard Lowe	movntdq	%xmm0,-0x40(%rcx)
2179*5d9d9091SRichard Lowe	movntdq	%xmm1,-0x30(%rcx)
2180*5d9d9091SRichard Lowe
2181*5d9d9091SRichard Lowe	movdqu	-0x20(%rdx),%xmm2
2182*5d9d9091SRichard Lowe	movdqu	-0x10(%rdx),%xmm3
2183*5d9d9091SRichard Lowe
2184*5d9d9091SRichard Lowe	movntdq	%xmm2,-0x20(%rcx)
2185*5d9d9091SRichard Lowe	movntdq	%xmm3,-0x10(%rcx)
2186*5d9d9091SRichard Lowe
2187*5d9d9091SRichard Lowe	jge	L(sse2_nt_move)
2188*5d9d9091SRichard Lowe
2189*5d9d9091SRichard Lowe	lea	L(Fix16EndTable)(%rip),%r10
2190*5d9d9091SRichard Lowe	mov	%r8,%r9
2191*5d9d9091SRichard Lowe	and	$0xFFFFFFFFFFFFFFF0,%r9
2192*5d9d9091SRichard Lowe	add	%r9,%rcx
2193*5d9d9091SRichard Lowe	add	%r9,%rdx
2194*5d9d9091SRichard Lowe	sub	%r9,%r8
2195*5d9d9091SRichard Lowe	shr	$0x4,%r9
2196*5d9d9091SRichard Lowe	sfence
2197*5d9d9091SRichard Lowe
2198*5d9d9091SRichard Lowe	movslq	(%r10,%r9,4),%r11
2199*5d9d9091SRichard Lowe	lea	(%r11,%r10,1),%r10
2200*5d9d9091SRichard Lowe	jmpq	*%r10
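/*
 * L(sse2_nt_move) above streams 64 bytes per iteration: movdqu
 * tolerates a misaligned source while movntdq writes around the
 * caches, and the sfence orders the non-temporal stores before the
 * tail-copy dispatch.  A minimal C sketch of the idea, assuming SSE2
 * intrinsics, a 16-byte aligned destination, and n a multiple of 16
 * (illustrative only):
 *
 *	#include <stddef.h>
 *	#include <emmintrin.h>
 *
 *	static void
 *	nt_copy(char *d, const char *s, size_t n)
 *	{
 *		for (size_t i = 0; i < n; i += 16) {
 *			__m128i v = _mm_loadu_si128((const __m128i *)(s + i));
 *			_mm_stream_si128((__m128i *)(d + i), v);
 *		}
 *		_mm_sfence();
 *	}
 */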
2201*5d9d9091SRichard Lowe
2202*5d9d9091SRichard Lowe	.balign 16
2203*5d9d9091SRichard LoweL(Fix16EndTable):
2204*5d9d9091SRichard Lowe	.int    L(fix16_0)-L(Fix16EndTable)
2205*5d9d9091SRichard Lowe	.int    L(fix16_1)-L(Fix16EndTable)
2206*5d9d9091SRichard Lowe	.int    L(fix16_2)-L(Fix16EndTable)
2207*5d9d9091SRichard Lowe	.int    L(fix16_3)-L(Fix16EndTable)
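	/*
	 * Each entry is the 32-bit offset of its fix16 routine relative
	 * to the table itself, keeping the dispatch above
	 * position-independent.  The fix16 labels fall through, so index
	 * N copies the N trailing 16-byte chunks before L(fix16_0)
	 * dispatches the sub-16-byte remainder through L(fwdPxQx).
	 */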
2208*5d9d9091SRichard Lowe
2209*5d9d9091SRichard Lowe	.balign 16
2210*5d9d9091SRichard LoweL(fix16_3):
2211*5d9d9091SRichard Lowe	movdqu -0x30(%rdx),%xmm1
2212*5d9d9091SRichard Lowe	movdqa %xmm1,-0x30(%rcx)
2213*5d9d9091SRichard LoweL(fix16_2):
2214*5d9d9091SRichard Lowe	movdqu -0x20(%rdx),%xmm2
2215*5d9d9091SRichard Lowe	movdqa %xmm2,-0x20(%rcx)
2216*5d9d9091SRichard LoweL(fix16_1):
2217*5d9d9091SRichard Lowe	movdqu -0x10(%rdx),%xmm3
2218*5d9d9091SRichard Lowe	movdqa %xmm3,-0x10(%rcx)
2219*5d9d9091SRichard LoweL(fix16_0):
2220*5d9d9091SRichard Lowe	lea    L(fwdPxQx)(%rip),%r10
2221*5d9d9091SRichard Lowe	add    %r8,%rdx
2222*5d9d9091SRichard Lowe	add    %r8,%rcx
2223*5d9d9091SRichard Lowe
2224*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
2225*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
2226*5d9d9091SRichard Lowe	jmpq   *%r10
2227*5d9d9091SRichard Lowe
2228*5d9d9091SRichard Lowe	.balign 16
2229*5d9d9091SRichard LoweL(pre_both_aligned):
2230*5d9d9091SRichard Lowe	cmp    $0x80,%r8
2231*5d9d9091SRichard Lowe	jl     L(fix_16b)
2232*5d9d9091SRichard Lowe
2233*5d9d9091SRichard Lowe	.balign 16
2234*5d9d9091SRichard LoweL(both_aligned):
2235*5d9d9091SRichard Lowe
2236*5d9d9091SRichard Lowe	/*
2237*5d9d9091SRichard Lowe	 * This 'paired' load/load/store/store sequence seems to perform best.
2238*5d9d9091SRichard Lowe	 */
2239*5d9d9091SRichard Lowe	movdqa (%rdx),%xmm0
2240*5d9d9091SRichard Lowe	movdqa 0x10(%rdx),%xmm1
2241*5d9d9091SRichard Lowe
2242*5d9d9091SRichard Lowe	movdqa %xmm0,(%rcx)
2243*5d9d9091SRichard Lowe	movdqa %xmm1,0x10(%rcx)
2244*5d9d9091SRichard Lowe	lea    -0x80(%r8),%r8
2245*5d9d9091SRichard Lowe
2246*5d9d9091SRichard Lowe	movdqa 0x20(%rdx),%xmm2
2247*5d9d9091SRichard Lowe	movdqa 0x30(%rdx),%xmm3
2248*5d9d9091SRichard Lowe
2249*5d9d9091SRichard Lowe	movdqa %xmm2,0x20(%rcx)
2250*5d9d9091SRichard Lowe	movdqa %xmm3,0x30(%rcx)
2251*5d9d9091SRichard Lowe
2252*5d9d9091SRichard Lowe	movdqa 0x40(%rdx),%xmm0
2253*5d9d9091SRichard Lowe	movdqa 0x50(%rdx),%xmm1
2254*5d9d9091SRichard Lowe	cmp    $0x80,%r8
2255*5d9d9091SRichard Lowe
2256*5d9d9091SRichard Lowe	movdqa %xmm0,0x40(%rcx)
2257*5d9d9091SRichard Lowe	movdqa %xmm1,0x50(%rcx)
2258*5d9d9091SRichard Lowe
2259*5d9d9091SRichard Lowe	movdqa 0x60(%rdx),%xmm2
2260*5d9d9091SRichard Lowe	movdqa 0x70(%rdx),%xmm3
2261*5d9d9091SRichard Lowe	lea    0x80(%rdx),%rdx
2262*5d9d9091SRichard Lowe	movdqa %xmm2,0x60(%rcx)
2263*5d9d9091SRichard Lowe	movdqa %xmm3,0x70(%rcx)
2264*5d9d9091SRichard Lowe	lea    0x80(%rcx),%rcx
2265*5d9d9091SRichard Lowe	jge    L(both_aligned)
2266*5d9d9091SRichard Lowe
2267*5d9d9091SRichard LoweL(fix_16b):
2268*5d9d9091SRichard Lowe	add    %r8,%rcx
2269*5d9d9091SRichard Lowe	lea    L(fwdPxQx)(%rip),%r10
2270*5d9d9091SRichard Lowe	add    %r8,%rdx
2271*5d9d9091SRichard Lowe
2272*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
2273*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
2274*5d9d9091SRichard Lowe	jmpq   *%r10
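/*
 * A condensed C view of the L(both_aligned) loop above, assuming both
 * pointers are 16-byte aligned and n >= 128 on entry (illustrative
 * only):
 *
 *	#include <stddef.h>
 *	#include <emmintrin.h>
 *
 *	static void
 *	aligned_copy(char *d, const char *s, size_t n)
 *	{
 *		do {
 *			for (int i = 0; i < 128; i += 16) {
 *				__m128i v = _mm_load_si128(
 *				    (const __m128i *)(s + i));
 *				_mm_store_si128((__m128i *)(d + i), v);
 *			}
 *			s += 128;
 *			d += 128;
 *			n -= 128;
 *		} while (n >= 128);
 *		// 0..127 trailing bytes go through the L(fwdPxQx) table
 *	}
 */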
2275*5d9d9091SRichard Lowe
2276*5d9d9091SRichard Lowe	.balign 16
2277*5d9d9091SRichard LoweL(Loop8byte_pre):
2278*5d9d9091SRichard Lowe	# Use 8-byte moves
2279*5d9d9091SRichard Lowe	mov    .largest_level_cache_size(%rip),%r9d
2280*5d9d9091SRichard Lowe	shr    %r9		# take half of it
2281*5d9d9091SRichard Lowe	cmp    %r9,%r8
2282*5d9d9091SRichard Lowe	jge    L(byte8_nt_top)
2283*5d9d9091SRichard Lowe	# Find out whether to use rep movsq
2284*5d9d9091SRichard Lowe	cmp    $4096,%r8
2285*5d9d9091SRichard Lowe	jle    L(byte8_top)
2286*5d9d9091SRichard Lowe	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2287*5d9d9091SRichard Lowe	cmp    %r9,%r8
2288*5d9d9091SRichard Lowe	jle    L(use_rep)
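	/*
	 * The dispatch above, in outline: copies of at least half the
	 * largest-level cache take the non-temporal loop; copies of at
	 * most 4096 bytes take the plain 8-byte loop; copies that still
	 * fit in half the L1 cache take rep movsq; everything in between
	 * falls through to the plain 8-byte loop.
	 */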
2289*5d9d9091SRichard Lowe
2290*5d9d9091SRichard Lowe	.balign     16
2291*5d9d9091SRichard LoweL(byte8_top):
2292*5d9d9091SRichard Lowe	mov    (%rdx),%r9
2293*5d9d9091SRichard Lowe	mov    0x8(%rdx),%r10
2294*5d9d9091SRichard Lowe	lea    -0x40(%r8),%r8
2295*5d9d9091SRichard Lowe	mov    %r9,(%rcx)
2296*5d9d9091SRichard Lowe	mov    %r10,0x8(%rcx)
2297*5d9d9091SRichard Lowe	mov    0x10(%rdx),%r11
2298*5d9d9091SRichard Lowe	mov    0x18(%rdx),%r9
2299*5d9d9091SRichard Lowe	mov    %r11,0x10(%rcx)
2300*5d9d9091SRichard Lowe	mov    %r9,0x18(%rcx)
2301*5d9d9091SRichard Lowe
2302*5d9d9091SRichard Lowe	cmp    $0x40,%r8
2303*5d9d9091SRichard Lowe	mov    0x20(%rdx),%r10
2304*5d9d9091SRichard Lowe	mov    0x28(%rdx),%r11
2305*5d9d9091SRichard Lowe	mov    %r10,0x20(%rcx)
2306*5d9d9091SRichard Lowe	mov    %r11,0x28(%rcx)
2307*5d9d9091SRichard Lowe	mov    0x30(%rdx),%r9
2308*5d9d9091SRichard Lowe	mov    0x38(%rdx),%r10
2309*5d9d9091SRichard Lowe	lea    0x40(%rdx),%rdx
2310*5d9d9091SRichard Lowe	mov    %r9,0x30(%rcx)
2311*5d9d9091SRichard Lowe	mov    %r10,0x38(%rcx)
2312*5d9d9091SRichard Lowe	lea    0x40(%rcx),%rcx
2313*5d9d9091SRichard Lowe	jg     L(byte8_top)
2314*5d9d9091SRichard Lowe
2315*5d9d9091SRichard LoweL(byte8_end):
2316*5d9d9091SRichard Lowe	lea    L(fwdPxQx)(%rip),%r10
2317*5d9d9091SRichard Lowe	lea    (%rdx,%r8,1),%rdx
2318*5d9d9091SRichard Lowe	lea    (%rcx,%r8,1),%rcx
2319*5d9d9091SRichard Lowe
2320*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
2321*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
2322*5d9d9091SRichard Lowe	jmpq   *%r10
2323*5d9d9091SRichard Lowe
2324*5d9d9091SRichard Lowe	.balign	16
2325*5d9d9091SRichard LoweL(use_rep):
2326*5d9d9091SRichard Lowe	mov    %rdx,%rsi		# %rsi = source
2327*5d9d9091SRichard Lowe	mov    %rcx,%rdi		# %rdi = destination
2328*5d9d9091SRichard Lowe	mov    %r8,%rcx			# %rcx = count
2329*5d9d9091SRichard Lowe	shrq   $3,%rcx			# 8-byte word count
2330*5d9d9091SRichard Lowe	rep
2331*5d9d9091SRichard Lowe	  movsq
2332*5d9d9091SRichard Lowe	mov    %rsi,%rdx		# source
2333*5d9d9091SRichard Lowe	mov    %rdi,%rcx		# destination
2334*5d9d9091SRichard Lowe	andq   $7,%r8			# remainder
2335*5d9d9091SRichard Lowe	jnz    L(byte8_end)
2336*5d9d9091SRichard Lowe	ret
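/*
 * A minimal C sketch of L(use_rep) above: copy the 8-byte words with
 * a single rep movsq, leaving the 0..7 byte remainder for the jump
 * table (GCC inline-assembly syntax; illustrative only):
 *
 *	#include <stddef.h>
 *
 *	static void
 *	rep_copy(void *d, const void *s, size_t n)
 *	{
 *		size_t words = n >> 3;
 *
 *		__asm__ volatile("rep movsq"
 *		    : "+D" (d), "+S" (s), "+c" (words)
 *		    :
 *		    : "memory");
 *		// n & 7 remaining bytes are finished via L(fwdPxQx)
 *	}
 */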
2337*5d9d9091SRichard Lowe
2338*5d9d9091SRichard Lowe	.balign 16
2339*5d9d9091SRichard LoweL(byte8_nt_top):
2340*5d9d9091SRichard Lowe	sub    $0x40,%r8
2341*5d9d9091SRichard Lowe	prefetchnta 0x180(%rdx)
2342*5d9d9091SRichard Lowe	mov    (%rdx),%r9
2343*5d9d9091SRichard Lowe	movnti %r9,(%rcx)
2344*5d9d9091SRichard Lowe	mov    0x8(%rdx),%r10
2345*5d9d9091SRichard Lowe	movnti %r10,0x8(%rcx)
2346*5d9d9091SRichard Lowe	mov    0x10(%rdx),%r11
2347*5d9d9091SRichard Lowe	movnti %r11,0x10(%rcx)
2348*5d9d9091SRichard Lowe	mov    0x18(%rdx),%r9
2349*5d9d9091SRichard Lowe	movnti %r9,0x18(%rcx)
2350*5d9d9091SRichard Lowe	mov    0x20(%rdx),%r10
2351*5d9d9091SRichard Lowe	movnti %r10,0x20(%rcx)
2352*5d9d9091SRichard Lowe	mov    0x28(%rdx),%r11
2353*5d9d9091SRichard Lowe	movnti %r11,0x28(%rcx)
2354*5d9d9091SRichard Lowe	mov    0x30(%rdx),%r9
2355*5d9d9091SRichard Lowe	movnti %r9,0x30(%rcx)
2356*5d9d9091SRichard Lowe	mov    0x38(%rdx),%r10
2357*5d9d9091SRichard Lowe	movnti %r10,0x38(%rcx)
2358*5d9d9091SRichard Lowe
2359*5d9d9091SRichard Lowe	lea    0x40(%rdx),%rdx
2360*5d9d9091SRichard Lowe	lea    0x40(%rcx),%rcx
2361*5d9d9091SRichard Lowe	cmp    $0x40,%r8
2362*5d9d9091SRichard Lowe	jge    L(byte8_nt_top)
2363*5d9d9091SRichard Lowe	sfence
2364*5d9d9091SRichard Lowe	jmp    L(byte8_end)
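	/*
	 * As in L(sse2_nt_move), prefetchnta pulls the source in 0x180
	 * bytes ahead without polluting the caches, and the sfence
	 * orders the movnti stores before the tail-copy dispatch.
	 */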
2365*5d9d9091SRichard Lowe
2366*5d9d9091SRichard Lowe	SET_SIZE(memcpy)
2367*5d9d9091SRichard Lowe
2368*5d9d9091SRichard Lowe	.balign 16
2369*5d9d9091SRichard LoweL(CopyBackwards):
2370*5d9d9091SRichard Lowe	mov    %rdx,%r8
2371*5d9d9091SRichard Lowe	mov    %rdi,%rcx
2372*5d9d9091SRichard Lowe	mov    %rsi,%rdx
2373*5d9d9091SRichard Lowe	mov    %rdi,%rax		# return value
2374*5d9d9091SRichard Lowe
2375*5d9d9091SRichard Lowe	# check alignment of last byte
2376*5d9d9091SRichard Lowe	lea    (%rcx,%r8,1),%rcx
2377*5d9d9091SRichard Lowe	test   $0x7,%rcx
2378*5d9d9091SRichard Lowe	lea    (%rdx,%r8,1),%rdx
2379*5d9d9091SRichard Lowe	jne    L(bk_align)
2380*5d9d9091SRichard Lowe
2381*5d9d9091SRichard LoweL(bk_qw_aligned):
2382*5d9d9091SRichard Lowe	lea    L(bkPxQx)(%rip),%r10
2383*5d9d9091SRichard Lowe
2384*5d9d9091SRichard Lowe	cmp    $0x90,%r8		# 144
2385*5d9d9091SRichard Lowe	jg     L(bk_ck_sse2_alignment)
2386*5d9d9091SRichard Lowe
2387*5d9d9091SRichard Lowe	sub    %r8,%rcx
2388*5d9d9091SRichard Lowe	sub    %r8,%rdx
2389*5d9d9091SRichard Lowe
2390*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
2391*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
2392*5d9d9091SRichard Lowe	jmpq   *%r10
2393*5d9d9091SRichard Lowe
2394*5d9d9091SRichard Lowe	.balign 16
2395*5d9d9091SRichard LoweL(bk_align):
2396*5d9d9091SRichard Lowe	# only align if len > 8
2397*5d9d9091SRichard Lowe	cmp    $8,%r8
2398*5d9d9091SRichard Lowe	jle    L(bk_qw_aligned)
2399*5d9d9091SRichard Lowe	test   $0x1,%rcx
2400*5d9d9091SRichard Lowe	je     L(bk_tst2)
2401*5d9d9091SRichard Lowe	dec    %rcx
2402*5d9d9091SRichard Lowe	dec    %rdx
2403*5d9d9091SRichard Lowe	dec    %r8
2404*5d9d9091SRichard Lowe	mov    (%rdx),%r9b
2405*5d9d9091SRichard Lowe	mov    %r9b,(%rcx)
2406*5d9d9091SRichard Lowe
2407*5d9d9091SRichard LoweL(bk_tst2):
2408*5d9d9091SRichard Lowe	test   $0x2,%rcx
2409*5d9d9091SRichard Lowe	je     L(bk_tst3)
2410*5d9d9091SRichard Lowe
2411*5d9d9091SRichard LoweL(bk_got2):
2412*5d9d9091SRichard Lowe	sub    $0x2,%rcx
2413*5d9d9091SRichard Lowe	sub    $0x2,%rdx
2414*5d9d9091SRichard Lowe	sub    $0x2,%r8
2415*5d9d9091SRichard Lowe	movzwq (%rdx),%r9
2416*5d9d9091SRichard Lowe	mov    %r9w,(%rcx)
2417*5d9d9091SRichard Lowe
2418*5d9d9091SRichard LoweL(bk_tst3):
2419*5d9d9091SRichard Lowe	test   $0x4,%rcx
2420*5d9d9091SRichard Lowe	je     L(bk_qw_aligned)
2421*5d9d9091SRichard Lowe
2422*5d9d9091SRichard LoweL(bk_got3):
2423*5d9d9091SRichard Lowe	sub    $0x4,%rcx
2424*5d9d9091SRichard Lowe	sub    $0x4,%rdx
2425*5d9d9091SRichard Lowe	sub    $0x4,%r8
2426*5d9d9091SRichard Lowe	mov    (%rdx),%r9d
2427*5d9d9091SRichard Lowe	mov    %r9d,(%rcx)
2428*5d9d9091SRichard Lowe	jmp    L(bk_qw_aligned)
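/*
 * The peeling above trims 1, 2, then 4 bytes off the end until the
 * destination's last byte sits on an 8-byte boundary.  A byte-at-a-time
 * C simplification of the same idea, where d and s point one past the
 * last byte to copy (illustrative only):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void
 *	bk_align(char **d, const char **s, size_t *n)
 *	{
 *		while (*n > 8 && ((uintptr_t)*d & 7) != 0) {
 *			(*d)--;
 *			(*s)--;
 *			(*n)--;
 *			**d = **s;
 *		}
 *	}
 */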
2429*5d9d9091SRichard Lowe
2430*5d9d9091SRichard Lowe	.balign 16
2431*5d9d9091SRichard LoweL(bk_ck_sse2_alignment):
2432*5d9d9091SRichard Lowe	cmpl   $NO_SSE,.memops_method(%rip)
2433*5d9d9091SRichard Lowe	je     L(bk_use_rep)
2434*5d9d9091SRichard Lowe	# check alignment of last byte
2435*5d9d9091SRichard Lowe	test   $0xf,%rcx
2436*5d9d9091SRichard Lowe	jz     L(bk_sse2_cpy)
2437*5d9d9091SRichard Lowe
2438*5d9d9091SRichard LoweL(bk_sse2_align):
2439*5d9d9091SRichard Lowe	# only here if already aligned on at least a qword boundary
2440*5d9d9091SRichard Lowe	sub    $0x8,%rcx
2441*5d9d9091SRichard Lowe	sub    $0x8,%rdx
2442*5d9d9091SRichard Lowe	sub    $0x8,%r8
2443*5d9d9091SRichard Lowe	mov    (%rdx),%r9
2444*5d9d9091SRichard Lowe	mov    %r9,(%rcx)
2445*5d9d9091SRichard Lowe	# fall through to L(bk_sse2_cpy)
2446*5d9d9091SRichard Lowe
2447*5d9d9091SRichard Lowe	.balign 16
2448*5d9d9091SRichard LoweL(bk_sse2_cpy):
2449*5d9d9091SRichard Lowe	sub    $0x80,%rcx		# 128
2450*5d9d9091SRichard Lowe	sub    $0x80,%rdx
2451*5d9d9091SRichard Lowe	movdqu 0x70(%rdx),%xmm3
2452*5d9d9091SRichard Lowe	movdqu 0x60(%rdx),%xmm2
2453*5d9d9091SRichard Lowe	movdqa %xmm3,0x70(%rcx)
2454*5d9d9091SRichard Lowe	movdqa %xmm2,0x60(%rcx)
2455*5d9d9091SRichard Lowe	sub    $0x80,%r8
2456*5d9d9091SRichard Lowe	movdqu 0x50(%rdx),%xmm1
2457*5d9d9091SRichard Lowe	movdqu 0x40(%rdx),%xmm0
2458*5d9d9091SRichard Lowe	movdqa %xmm1,0x50(%rcx)
2459*5d9d9091SRichard Lowe	movdqa %xmm0,0x40(%rcx)
2460*5d9d9091SRichard Lowe
2461*5d9d9091SRichard Lowe	cmp    $0x80,%r8
2462*5d9d9091SRichard Lowe	movdqu 0x30(%rdx),%xmm3
2463*5d9d9091SRichard Lowe	movdqu 0x20(%rdx),%xmm2
2464*5d9d9091SRichard Lowe	movdqa %xmm3,0x30(%rcx)
2465*5d9d9091SRichard Lowe	movdqa %xmm2,0x20(%rcx)
2466*5d9d9091SRichard Lowe	movdqu 0x10(%rdx),%xmm1
2467*5d9d9091SRichard Lowe	movdqu (%rdx),%xmm0
2468*5d9d9091SRichard Lowe	movdqa %xmm1,0x10(%rcx)
2469*5d9d9091SRichard Lowe	movdqa %xmm0,(%rcx)
2470*5d9d9091SRichard Lowe	jge    L(bk_sse2_cpy)
2471*5d9d9091SRichard Lowe
2472*5d9d9091SRichard LoweL(bk_sse2_cpy_end):
2473*5d9d9091SRichard Lowe	lea    L(bkPxQx)(%rip),%r10
2474*5d9d9091SRichard Lowe	sub    %r8,%rdx
2475*5d9d9091SRichard Lowe	sub    %r8,%rcx
2476*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
2477*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
2478*5d9d9091SRichard Lowe	jmpq   *%r10
2479*5d9d9091SRichard Lowe
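/*
 * No-SSE fallback: copy whole qwords backwards with the string-move
 * instruction.  With the direction flag set, rep movsq walks %rsi and
 * %rdi downward, so both are started at the last qword of their
 * buffers.  Roughly:
 *
 *	while (len >= 8) {
 *		len -= 8;
 *		*(uint64_t *)(dst + len) = *(uint64_t *)(src + len);
 *	}
 *
 * The remaining len % 8 low bytes are finished through the bkPxQx
 * table.
 */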
2480*5d9d9091SRichard Lowe	.balign 16
2481*5d9d9091SRichard LoweL(bk_use_rep):
2482*5d9d9091SRichard Lowe	xchg   %rcx,%r9			# save %rcx (dst end) in %r9
2483*5d9d9091SRichard Lowe	mov    %rdx,%rsi		# source
2484*5d9d9091SRichard Lowe	mov    %r9,%rdi			# destination
2485*5d9d9091SRichard Lowe	mov    %r8,%rcx			# count
2486*5d9d9091SRichard Lowe	sub    $8,%rsi
2487*5d9d9091SRichard Lowe	sub    $8,%rdi
2488*5d9d9091SRichard Lowe	shr    $3,%rcx
2489*5d9d9091SRichard Lowe	std				# reverse direction
2490*5d9d9091SRichard Lowe	rep
2491*5d9d9091SRichard Lowe	  movsq
2492*5d9d9091SRichard Lowe	cld				# reset direction flag
2493*5d9d9091SRichard Lowe
2494*5d9d9091SRichard Lowe	xchg   %rcx,%r9			# restore %rcx (dst end)
2495*5d9d9091SRichard Lowe	lea    L(bkPxQx)(%rip),%r10
2496*5d9d9091SRichard Lowe	sub    %r8,%rdx
2497*5d9d9091SRichard Lowe	sub    %r8,%rcx
2498*5d9d9091SRichard Lowe	andq   $7,%r8			# remainder
2499*5d9d9091SRichard Lowe	jz     2f
2500*5d9d9091SRichard Lowe	movslq (%r10,%r8,4),%r9
2501*5d9d9091SRichard Lowe	lea    (%r9,%r10,1),%r10
2502*5d9d9091SRichard Lowe	jmpq   *%r10
2503*5d9d9091SRichard Lowe2:
2504*5d9d9091SRichard Lowe	ret
2505*5d9d9091SRichard Lowe
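/*
 * Backward-copy tail stubs.  L(bkP<p>Q<q>) copies q qwords plus p
 * trailing bytes by falling through the labels below it; successive
 * moves rotate among %r9, %r10 and %r11, presumably so that adjacent
 * load/store pairs do not reuse the same register.
 */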
2506*5d9d9091SRichard Lowe	.balign 16
2507*5d9d9091SRichard LoweL(bkP0QI):
2508*5d9d9091SRichard Lowe	mov    0x88(%rdx),%r10
2509*5d9d9091SRichard Lowe	mov    %r10,0x88(%rcx)
2510*5d9d9091SRichard LoweL(bkP0QH):
2511*5d9d9091SRichard Lowe	mov    0x80(%rdx),%r10
2512*5d9d9091SRichard Lowe	mov    %r10,0x80(%rcx)
2513*5d9d9091SRichard LoweL(bkP0QG):
2514*5d9d9091SRichard Lowe	mov    0x78(%rdx),%r9
2515*5d9d9091SRichard Lowe	mov    %r9,0x78(%rcx)
2516*5d9d9091SRichard LoweL(bkP0QF):
2517*5d9d9091SRichard Lowe	mov    0x70(%rdx),%r11
2518*5d9d9091SRichard Lowe	mov    %r11,0x70(%rcx)
2519*5d9d9091SRichard LoweL(bkP0QE):
2520*5d9d9091SRichard Lowe	mov    0x68(%rdx),%r10
2521*5d9d9091SRichard Lowe	mov    %r10,0x68(%rcx)
2522*5d9d9091SRichard LoweL(bkP0QD):
2523*5d9d9091SRichard Lowe	mov    0x60(%rdx),%r9
2524*5d9d9091SRichard Lowe	mov    %r9,0x60(%rcx)
2525*5d9d9091SRichard LoweL(bkP0QC):
2526*5d9d9091SRichard Lowe	mov    0x58(%rdx),%r11
2527*5d9d9091SRichard Lowe	mov    %r11,0x58(%rcx)
2528*5d9d9091SRichard LoweL(bkP0QB):
2529*5d9d9091SRichard Lowe	mov    0x50(%rdx),%r10
2530*5d9d9091SRichard Lowe	mov    %r10,0x50(%rcx)
2531*5d9d9091SRichard LoweL(bkP0QA):
2532*5d9d9091SRichard Lowe	mov    0x48(%rdx),%r9
2533*5d9d9091SRichard Lowe	mov    %r9,0x48(%rcx)
2534*5d9d9091SRichard LoweL(bkP0Q9):
2535*5d9d9091SRichard Lowe	mov    0x40(%rdx),%r11
2536*5d9d9091SRichard Lowe	mov    %r11,0x40(%rcx)
2537*5d9d9091SRichard LoweL(bkP0Q8):
2538*5d9d9091SRichard Lowe	mov    0x38(%rdx),%r10
2539*5d9d9091SRichard Lowe	mov    %r10,0x38(%rcx)
2540*5d9d9091SRichard LoweL(bkP0Q7):
2541*5d9d9091SRichard Lowe	mov    0x30(%rdx),%r9
2542*5d9d9091SRichard Lowe	mov    %r9,0x30(%rcx)
2543*5d9d9091SRichard LoweL(bkP0Q6):
2544*5d9d9091SRichard Lowe	mov    0x28(%rdx),%r11
2545*5d9d9091SRichard Lowe	mov    %r11,0x28(%rcx)
2546*5d9d9091SRichard LoweL(bkP0Q5):
2547*5d9d9091SRichard Lowe	mov    0x20(%rdx),%r10
2548*5d9d9091SRichard Lowe	mov    %r10,0x20(%rcx)
2549*5d9d9091SRichard LoweL(bkP0Q4):
2550*5d9d9091SRichard Lowe	mov    0x18(%rdx),%r9
2551*5d9d9091SRichard Lowe	mov    %r9,0x18(%rcx)
2552*5d9d9091SRichard LoweL(bkP0Q3):
2553*5d9d9091SRichard Lowe	mov    0x10(%rdx),%r11
2554*5d9d9091SRichard Lowe	mov    %r11,0x10(%rcx)
2555*5d9d9091SRichard LoweL(bkP0Q2):
2556*5d9d9091SRichard Lowe	mov    0x8(%rdx),%r10
2557*5d9d9091SRichard Lowe	mov    %r10,0x8(%rcx)
2558*5d9d9091SRichard LoweL(bkP0Q1):
2559*5d9d9091SRichard Lowe	mov    (%rdx),%r9
2560*5d9d9091SRichard Lowe	mov    %r9,(%rcx)
2561*5d9d9091SRichard LoweL(bkP0Q0):
2562*5d9d9091SRichard Lowe	ret
2563*5d9d9091SRichard Lowe
2564*5d9d9091SRichard Lowe	.balign 16
2565*5d9d9091SRichard LoweL(bkP1QI):
2566*5d9d9091SRichard Lowe	mov    0x89(%rdx),%r10
2567*5d9d9091SRichard Lowe	mov    %r10,0x89(%rcx)
2568*5d9d9091SRichard LoweL(bkP1QH):
2569*5d9d9091SRichard Lowe	mov    0x81(%rdx),%r11
2570*5d9d9091SRichard Lowe	mov    %r11,0x81(%rcx)
2571*5d9d9091SRichard LoweL(bkP1QG):
2572*5d9d9091SRichard Lowe	mov    0x79(%rdx),%r10
2573*5d9d9091SRichard Lowe	mov    %r10,0x79(%rcx)
2574*5d9d9091SRichard LoweL(bkP1QF):
2575*5d9d9091SRichard Lowe	mov    0x71(%rdx),%r9
2576*5d9d9091SRichard Lowe	mov    %r9,0x71(%rcx)
2577*5d9d9091SRichard LoweL(bkP1QE):
2578*5d9d9091SRichard Lowe	mov    0x69(%rdx),%r11
2579*5d9d9091SRichard Lowe	mov    %r11,0x69(%rcx)
2580*5d9d9091SRichard LoweL(bkP1QD):
2581*5d9d9091SRichard Lowe	mov    0x61(%rdx),%r10
2582*5d9d9091SRichard Lowe	mov    %r10,0x61(%rcx)
2583*5d9d9091SRichard LoweL(bkP1QC):
2584*5d9d9091SRichard Lowe	mov    0x59(%rdx),%r9
2585*5d9d9091SRichard Lowe	mov    %r9,0x59(%rcx)
2586*5d9d9091SRichard LoweL(bkP1QB):
2587*5d9d9091SRichard Lowe	mov    0x51(%rdx),%r11
2588*5d9d9091SRichard Lowe	mov    %r11,0x51(%rcx)
2589*5d9d9091SRichard LoweL(bkP1QA):
2590*5d9d9091SRichard Lowe	mov    0x49(%rdx),%r10
2591*5d9d9091SRichard Lowe	mov    %r10,0x49(%rcx)
2592*5d9d9091SRichard LoweL(bkP1Q9):
2593*5d9d9091SRichard Lowe	mov    0x41(%rdx),%r9
2594*5d9d9091SRichard Lowe	mov    %r9,0x41(%rcx)
2595*5d9d9091SRichard LoweL(bkP1Q8):
2596*5d9d9091SRichard Lowe	mov    0x39(%rdx),%r11
2597*5d9d9091SRichard Lowe	mov    %r11,0x39(%rcx)
2598*5d9d9091SRichard LoweL(bkP1Q7):
2599*5d9d9091SRichard Lowe	mov    0x31(%rdx),%r10
2600*5d9d9091SRichard Lowe	mov    %r10,0x31(%rcx)
2601*5d9d9091SRichard LoweL(bkP1Q6):
2602*5d9d9091SRichard Lowe	mov    0x29(%rdx),%r9
2603*5d9d9091SRichard Lowe	mov    %r9,0x29(%rcx)
2604*5d9d9091SRichard LoweL(bkP1Q5):
2605*5d9d9091SRichard Lowe	mov    0x21(%rdx),%r11
2606*5d9d9091SRichard Lowe	mov    %r11,0x21(%rcx)
2607*5d9d9091SRichard LoweL(bkP1Q4):
2608*5d9d9091SRichard Lowe	mov    0x19(%rdx),%r10
2609*5d9d9091SRichard Lowe	mov    %r10,0x19(%rcx)
2610*5d9d9091SRichard LoweL(bkP1Q3):
2611*5d9d9091SRichard Lowe	mov    0x11(%rdx),%r9
2612*5d9d9091SRichard Lowe	mov    %r9,0x11(%rcx)
2613*5d9d9091SRichard LoweL(bkP1Q2):
2614*5d9d9091SRichard Lowe	mov    0x9(%rdx),%r11
2615*5d9d9091SRichard Lowe	mov    %r11,0x9(%rcx)
2616*5d9d9091SRichard LoweL(bkP1Q1):
2617*5d9d9091SRichard Lowe	mov    0x1(%rdx),%r10
2618*5d9d9091SRichard Lowe	mov    %r10,0x1(%rcx)
2619*5d9d9091SRichard LoweL(bkP1Q0):
2620*5d9d9091SRichard Lowe	mov    (%rdx),%r9b
2621*5d9d9091SRichard Lowe	mov    %r9b,(%rcx)
2622*5d9d9091SRichard Lowe	ret
2623*5d9d9091SRichard Lowe
2624*5d9d9091SRichard Lowe	.balign 16
2625*5d9d9091SRichard LoweL(bkP2QI):
2626*5d9d9091SRichard Lowe	mov    0x8a(%rdx),%r10
2627*5d9d9091SRichard Lowe	mov    %r10,0x8a(%rcx)
2628*5d9d9091SRichard LoweL(bkP2QH):
2629*5d9d9091SRichard Lowe	mov    0x82(%rdx),%r11
2630*5d9d9091SRichard Lowe	mov    %r11,0x82(%rcx)
2631*5d9d9091SRichard LoweL(bkP2QG):
2632*5d9d9091SRichard Lowe	mov    0x7a(%rdx),%r10
2633*5d9d9091SRichard Lowe	mov    %r10,0x7a(%rcx)
2634*5d9d9091SRichard LoweL(bkP2QF):
2635*5d9d9091SRichard Lowe	mov    0x72(%rdx),%r9
2636*5d9d9091SRichard Lowe	mov    %r9,0x72(%rcx)
2637*5d9d9091SRichard LoweL(bkP2QE):
2638*5d9d9091SRichard Lowe	mov    0x6a(%rdx),%r11
2639*5d9d9091SRichard Lowe	mov    %r11,0x6a(%rcx)
2640*5d9d9091SRichard LoweL(bkP2QD):
2641*5d9d9091SRichard Lowe	mov    0x62(%rdx),%r10
2642*5d9d9091SRichard Lowe	mov    %r10,0x62(%rcx)
2643*5d9d9091SRichard LoweL(bkP2QC):
2644*5d9d9091SRichard Lowe	mov    0x5a(%rdx),%r9
2645*5d9d9091SRichard Lowe	mov    %r9,0x5a(%rcx)
2646*5d9d9091SRichard LoweL(bkP2QB):
2647*5d9d9091SRichard Lowe	mov    0x52(%rdx),%r11
2648*5d9d9091SRichard Lowe	mov    %r11,0x52(%rcx)
2649*5d9d9091SRichard LoweL(bkP2QA):
2650*5d9d9091SRichard Lowe	mov    0x4a(%rdx),%r10
2651*5d9d9091SRichard Lowe	mov    %r10,0x4a(%rcx)
2652*5d9d9091SRichard LoweL(bkP2Q9):
2653*5d9d9091SRichard Lowe	mov    0x42(%rdx),%r9
2654*5d9d9091SRichard Lowe	mov    %r9,0x42(%rcx)
2655*5d9d9091SRichard LoweL(bkP2Q8):
2656*5d9d9091SRichard Lowe	mov    0x3a(%rdx),%r11
2657*5d9d9091SRichard Lowe	mov    %r11,0x3a(%rcx)
2658*5d9d9091SRichard LoweL(bkP2Q7):
2659*5d9d9091SRichard Lowe	mov    0x32(%rdx),%r10
2660*5d9d9091SRichard Lowe	mov    %r10,0x32(%rcx)
2661*5d9d9091SRichard LoweL(bkP2Q6):
2662*5d9d9091SRichard Lowe	mov    0x2a(%rdx),%r9
2663*5d9d9091SRichard Lowe	mov    %r9,0x2a(%rcx)
2664*5d9d9091SRichard LoweL(bkP2Q5):
2665*5d9d9091SRichard Lowe	mov    0x22(%rdx),%r11
2666*5d9d9091SRichard Lowe	mov    %r11,0x22(%rcx)
2667*5d9d9091SRichard LoweL(bkP2Q4):
2668*5d9d9091SRichard Lowe	mov    0x1a(%rdx),%r10
2669*5d9d9091SRichard Lowe	mov    %r10,0x1a(%rcx)
2670*5d9d9091SRichard LoweL(bkP2Q3):
2671*5d9d9091SRichard Lowe	mov    0x12(%rdx),%r9
2672*5d9d9091SRichard Lowe	mov    %r9,0x12(%rcx)
2673*5d9d9091SRichard LoweL(bkP2Q2):
2674*5d9d9091SRichard Lowe	mov    0xa(%rdx),%r11
2675*5d9d9091SRichard Lowe	mov    %r11,0xa(%rcx)
2676*5d9d9091SRichard LoweL(bkP2Q1):
2677*5d9d9091SRichard Lowe	mov    0x2(%rdx),%r10
2678*5d9d9091SRichard Lowe	mov    %r10,0x2(%rcx)
2679*5d9d9091SRichard LoweL(bkP2Q0):
2680*5d9d9091SRichard Lowe	mov    (%rdx),%r9w
2681*5d9d9091SRichard Lowe	mov    %r9w,(%rcx)
2682*5d9d9091SRichard Lowe	ret
2683*5d9d9091SRichard Lowe
2684*5d9d9091SRichard Lowe	.balign 16
2685*5d9d9091SRichard LoweL(bkP3QI):
2686*5d9d9091SRichard Lowe	mov    0x8b(%rdx),%r10
2687*5d9d9091SRichard Lowe	mov    %r10,0x8b(%rcx)
2688*5d9d9091SRichard LoweL(bkP3QH):
2689*5d9d9091SRichard Lowe	mov    0x83(%rdx),%r11
2690*5d9d9091SRichard Lowe	mov    %r11,0x83(%rcx)
2691*5d9d9091SRichard LoweL(bkP3QG):
2692*5d9d9091SRichard Lowe	mov    0x7b(%rdx),%r10
2693*5d9d9091SRichard Lowe	mov    %r10,0x7b(%rcx)
2694*5d9d9091SRichard LoweL(bkP3QF):
2695*5d9d9091SRichard Lowe	mov    0x73(%rdx),%r9
2696*5d9d9091SRichard Lowe	mov    %r9,0x73(%rcx)
2697*5d9d9091SRichard LoweL(bkP3QE):
2698*5d9d9091SRichard Lowe	mov    0x6b(%rdx),%r11
2699*5d9d9091SRichard Lowe	mov    %r11,0x6b(%rcx)
2700*5d9d9091SRichard LoweL(bkP3QD):
2701*5d9d9091SRichard Lowe	mov    0x63(%rdx),%r10
2702*5d9d9091SRichard Lowe	mov    %r10,0x63(%rcx)
2703*5d9d9091SRichard LoweL(bkP3QC):
2704*5d9d9091SRichard Lowe	mov    0x5b(%rdx),%r9
2705*5d9d9091SRichard Lowe	mov    %r9,0x5b(%rcx)
2706*5d9d9091SRichard LoweL(bkP3QB):
2707*5d9d9091SRichard Lowe	mov    0x53(%rdx),%r11
2708*5d9d9091SRichard Lowe	mov    %r11,0x53(%rcx)
2709*5d9d9091SRichard LoweL(bkP3QA):
2710*5d9d9091SRichard Lowe	mov    0x4b(%rdx),%r10
2711*5d9d9091SRichard Lowe	mov    %r10,0x4b(%rcx)
2712*5d9d9091SRichard LoweL(bkP3Q9):
2713*5d9d9091SRichard Lowe	mov    0x43(%rdx),%r9
2714*5d9d9091SRichard Lowe	mov    %r9,0x43(%rcx)
2715*5d9d9091SRichard LoweL(bkP3Q8):
2716*5d9d9091SRichard Lowe	mov    0x3b(%rdx),%r11
2717*5d9d9091SRichard Lowe	mov    %r11,0x3b(%rcx)
2718*5d9d9091SRichard LoweL(bkP3Q7):
2719*5d9d9091SRichard Lowe	mov    0x33(%rdx),%r10
2720*5d9d9091SRichard Lowe	mov    %r10,0x33(%rcx)
2721*5d9d9091SRichard LoweL(bkP3Q6):
2722*5d9d9091SRichard Lowe	mov    0x2b(%rdx),%r9
2723*5d9d9091SRichard Lowe	mov    %r9,0x2b(%rcx)
2724*5d9d9091SRichard LoweL(bkP3Q5):
2725*5d9d9091SRichard Lowe	mov    0x23(%rdx),%r11
2726*5d9d9091SRichard Lowe	mov    %r11,0x23(%rcx)
2727*5d9d9091SRichard LoweL(bkP3Q4):
2728*5d9d9091SRichard Lowe	mov    0x1b(%rdx),%r10
2729*5d9d9091SRichard Lowe	mov    %r10,0x1b(%rcx)
2730*5d9d9091SRichard LoweL(bkP3Q3):
2731*5d9d9091SRichard Lowe	mov    0x13(%rdx),%r9
2732*5d9d9091SRichard Lowe	mov    %r9,0x13(%rcx)
2733*5d9d9091SRichard LoweL(bkP3Q2):
2734*5d9d9091SRichard Lowe	mov    0xb(%rdx),%r11
2735*5d9d9091SRichard Lowe	mov    %r11,0xb(%rcx)
2736*5d9d9091SRichard LoweL(bkP3Q1):
2737*5d9d9091SRichard Lowe	mov    0x3(%rdx),%r10
2738*5d9d9091SRichard Lowe	mov    %r10,0x3(%rcx)
2739*5d9d9091SRichard LoweL(bkP3Q0): # trailing loads/stores do all their loads first, then do the stores
2740*5d9d9091SRichard Lowe	mov    0x1(%rdx),%r9w
2741*5d9d9091SRichard Lowe	mov    %r9w,0x1(%rcx)
2742*5d9d9091SRichard Lowe	mov    (%rdx),%r10b
2743*5d9d9091SRichard Lowe	mov    %r10b,(%rcx)
2744*5d9d9091SRichard Lowe	ret
2745*5d9d9091SRichard Lowe
2746*5d9d9091SRichard Lowe	.balign 16
2747*5d9d9091SRichard LoweL(bkP4QI):
2748*5d9d9091SRichard Lowe	mov    0x8c(%rdx),%r10
2749*5d9d9091SRichard Lowe	mov    %r10,0x8c(%rcx)
2750*5d9d9091SRichard LoweL(bkP4QH):
2751*5d9d9091SRichard Lowe	mov    0x84(%rdx),%r11
2752*5d9d9091SRichard Lowe	mov    %r11,0x84(%rcx)
2753*5d9d9091SRichard LoweL(bkP4QG):
2754*5d9d9091SRichard Lowe	mov    0x7c(%rdx),%r10
2755*5d9d9091SRichard Lowe	mov    %r10,0x7c(%rcx)
2756*5d9d9091SRichard LoweL(bkP4QF):
2757*5d9d9091SRichard Lowe	mov    0x74(%rdx),%r9
2758*5d9d9091SRichard Lowe	mov    %r9,0x74(%rcx)
2759*5d9d9091SRichard LoweL(bkP4QE):
2760*5d9d9091SRichard Lowe	mov    0x6c(%rdx),%r11
2761*5d9d9091SRichard Lowe	mov    %r11,0x6c(%rcx)
2762*5d9d9091SRichard LoweL(bkP4QD):
2763*5d9d9091SRichard Lowe	mov    0x64(%rdx),%r10
2764*5d9d9091SRichard Lowe	mov    %r10,0x64(%rcx)
2765*5d9d9091SRichard LoweL(bkP4QC):
2766*5d9d9091SRichard Lowe	mov    0x5c(%rdx),%r9
2767*5d9d9091SRichard Lowe	mov    %r9,0x5c(%rcx)
2768*5d9d9091SRichard LoweL(bkP4QB):
2769*5d9d9091SRichard Lowe	mov    0x54(%rdx),%r11
2770*5d9d9091SRichard Lowe	mov    %r11,0x54(%rcx)
2771*5d9d9091SRichard LoweL(bkP4QA):
2772*5d9d9091SRichard Lowe	mov    0x4c(%rdx),%r10
2773*5d9d9091SRichard Lowe	mov    %r10,0x4c(%rcx)
2774*5d9d9091SRichard LoweL(bkP4Q9):
2775*5d9d9091SRichard Lowe	mov    0x44(%rdx),%r9
2776*5d9d9091SRichard Lowe	mov    %r9,0x44(%rcx)
2777*5d9d9091SRichard LoweL(bkP4Q8):
2778*5d9d9091SRichard Lowe	mov    0x3c(%rdx),%r11
2779*5d9d9091SRichard Lowe	mov    %r11,0x3c(%rcx)
2780*5d9d9091SRichard LoweL(bkP4Q7):
2781*5d9d9091SRichard Lowe	mov    0x34(%rdx),%r10
2782*5d9d9091SRichard Lowe	mov    %r10,0x34(%rcx)
2783*5d9d9091SRichard LoweL(bkP4Q6):
2784*5d9d9091SRichard Lowe	mov    0x2c(%rdx),%r9
2785*5d9d9091SRichard Lowe	mov    %r9,0x2c(%rcx)
2786*5d9d9091SRichard LoweL(bkP4Q5):
2787*5d9d9091SRichard Lowe	mov    0x24(%rdx),%r11
2788*5d9d9091SRichard Lowe	mov    %r11,0x24(%rcx)
2789*5d9d9091SRichard LoweL(bkP4Q4):
2790*5d9d9091SRichard Lowe	mov    0x1c(%rdx),%r10
2791*5d9d9091SRichard Lowe	mov    %r10,0x1c(%rcx)
2792*5d9d9091SRichard LoweL(bkP4Q3):
2793*5d9d9091SRichard Lowe	mov    0x14(%rdx),%r9
2794*5d9d9091SRichard Lowe	mov    %r9,0x14(%rcx)
2795*5d9d9091SRichard LoweL(bkP4Q2):
2796*5d9d9091SRichard Lowe	mov    0xc(%rdx),%r11
2797*5d9d9091SRichard Lowe	mov    %r11,0xc(%rcx)
2798*5d9d9091SRichard LoweL(bkP4Q1):
2799*5d9d9091SRichard Lowe	mov    0x4(%rdx),%r10
2800*5d9d9091SRichard Lowe	mov    %r10,0x4(%rcx)
2801*5d9d9091SRichard LoweL(bkP4Q0):
2802*5d9d9091SRichard Lowe	mov    (%rdx),%r9d
2803*5d9d9091SRichard Lowe	mov    %r9d,(%rcx)
2804*5d9d9091SRichard Lowe	ret
2805*5d9d9091SRichard Lowe
2806*5d9d9091SRichard Lowe	.balign 16
2807*5d9d9091SRichard LoweL(bkP5QI):
2808*5d9d9091SRichard Lowe	mov    0x8d(%rdx),%r10
2809*5d9d9091SRichard Lowe	mov    %r10,0x8d(%rcx)
2810*5d9d9091SRichard LoweL(bkP5QH):
2811*5d9d9091SRichard Lowe	mov    0x85(%rdx),%r9
2812*5d9d9091SRichard Lowe	mov    %r9,0x85(%rcx)
2813*5d9d9091SRichard LoweL(bkP5QG):
2814*5d9d9091SRichard Lowe	mov    0x7d(%rdx),%r11
2815*5d9d9091SRichard Lowe	mov    %r11,0x7d(%rcx)
2816*5d9d9091SRichard LoweL(bkP5QF):
2817*5d9d9091SRichard Lowe	mov    0x75(%rdx),%r10
2818*5d9d9091SRichard Lowe	mov    %r10,0x75(%rcx)
2819*5d9d9091SRichard LoweL(bkP5QE):
2820*5d9d9091SRichard Lowe	mov    0x6d(%rdx),%r9
2821*5d9d9091SRichard Lowe	mov    %r9,0x6d(%rcx)
2822*5d9d9091SRichard LoweL(bkP5QD):
2823*5d9d9091SRichard Lowe	mov    0x65(%rdx),%r11
2824*5d9d9091SRichard Lowe	mov    %r11,0x65(%rcx)
2825*5d9d9091SRichard LoweL(bkP5QC):
2826*5d9d9091SRichard Lowe	mov    0x5d(%rdx),%r10
2827*5d9d9091SRichard Lowe	mov    %r10,0x5d(%rcx)
2828*5d9d9091SRichard LoweL(bkP5QB):
2829*5d9d9091SRichard Lowe	mov    0x55(%rdx),%r9
2830*5d9d9091SRichard Lowe	mov    %r9,0x55(%rcx)
2831*5d9d9091SRichard LoweL(bkP5QA):
2832*5d9d9091SRichard Lowe	mov    0x4d(%rdx),%r11
2833*5d9d9091SRichard Lowe	mov    %r11,0x4d(%rcx)
2834*5d9d9091SRichard LoweL(bkP5Q9):
2835*5d9d9091SRichard Lowe	mov    0x45(%rdx),%r10
2836*5d9d9091SRichard Lowe	mov    %r10,0x45(%rcx)
2837*5d9d9091SRichard LoweL(bkP5Q8):
2838*5d9d9091SRichard Lowe	mov    0x3d(%rdx),%r9
2839*5d9d9091SRichard Lowe	mov    %r9,0x3d(%rcx)
2840*5d9d9091SRichard LoweL(bkP5Q7):
2841*5d9d9091SRichard Lowe	mov    0x35(%rdx),%r11
2842*5d9d9091SRichard Lowe	mov    %r11,0x35(%rcx)
2843*5d9d9091SRichard LoweL(bkP5Q6):
2844*5d9d9091SRichard Lowe	mov    0x2d(%rdx),%r10
2845*5d9d9091SRichard Lowe	mov    %r10,0x2d(%rcx)
2846*5d9d9091SRichard LoweL(bkP5Q5):
2847*5d9d9091SRichard Lowe	mov    0x25(%rdx),%r9
2848*5d9d9091SRichard Lowe	mov    %r9,0x25(%rcx)
2849*5d9d9091SRichard LoweL(bkP5Q4):
2850*5d9d9091SRichard Lowe	mov    0x1d(%rdx),%r11
2851*5d9d9091SRichard Lowe	mov    %r11,0x1d(%rcx)
2852*5d9d9091SRichard LoweL(bkP5Q3):
2853*5d9d9091SRichard Lowe	mov    0x15(%rdx),%r10
2854*5d9d9091SRichard Lowe	mov    %r10,0x15(%rcx)
2855*5d9d9091SRichard LoweL(bkP5Q2):
2856*5d9d9091SRichard Lowe	mov    0xd(%rdx),%r9
2857*5d9d9091SRichard Lowe	mov    %r9,0xd(%rcx)
2858*5d9d9091SRichard LoweL(bkP5Q1):
2859*5d9d9091SRichard Lowe	mov    0x5(%rdx),%r11
2860*5d9d9091SRichard Lowe	mov    %r11,0x5(%rcx)
2861*5d9d9091SRichard LoweL(bkP5Q0): # trailing loads/stores do all their loads first, then do the stores
2862*5d9d9091SRichard Lowe	mov    0x1(%rdx),%r9d
2863*5d9d9091SRichard Lowe	mov    %r9d,0x1(%rcx)
2864*5d9d9091SRichard Lowe	mov    (%rdx),%r10b
2865*5d9d9091SRichard Lowe	mov    %r10b,(%rcx)
2866*5d9d9091SRichard Lowe	ret
2867*5d9d9091SRichard Lowe
2868*5d9d9091SRichard Lowe	.balign 16
2869*5d9d9091SRichard LoweL(bkP6QI):
2870*5d9d9091SRichard Lowe	mov    0x8e(%rdx),%r10
2871*5d9d9091SRichard Lowe	mov    %r10,0x8e(%rcx)
2872*5d9d9091SRichard LoweL(bkP6QH):
2873*5d9d9091SRichard Lowe	mov    0x86(%rdx),%r11
2874*5d9d9091SRichard Lowe	mov    %r11,0x86(%rcx)
2875*5d9d9091SRichard LoweL(bkP6QG):
2876*5d9d9091SRichard Lowe	mov    0x7e(%rdx),%r10
2877*5d9d9091SRichard Lowe	mov    %r10,0x7e(%rcx)
2878*5d9d9091SRichard LoweL(bkP6QF):
2879*5d9d9091SRichard Lowe	mov    0x76(%rdx),%r9
2880*5d9d9091SRichard Lowe	mov    %r9,0x76(%rcx)
2881*5d9d9091SRichard LoweL(bkP6QE):
2882*5d9d9091SRichard Lowe	mov    0x6e(%rdx),%r11
2883*5d9d9091SRichard Lowe	mov    %r11,0x6e(%rcx)
2884*5d9d9091SRichard LoweL(bkP6QD):
2885*5d9d9091SRichard Lowe	mov    0x66(%rdx),%r10
2886*5d9d9091SRichard Lowe	mov    %r10,0x66(%rcx)
2887*5d9d9091SRichard LoweL(bkP6QC):
2888*5d9d9091SRichard Lowe	mov    0x5e(%rdx),%r9
2889*5d9d9091SRichard Lowe	mov    %r9,0x5e(%rcx)
2890*5d9d9091SRichard LoweL(bkP6QB):
2891*5d9d9091SRichard Lowe	mov    0x56(%rdx),%r11
2892*5d9d9091SRichard Lowe	mov    %r11,0x56(%rcx)
2893*5d9d9091SRichard LoweL(bkP6QA):
2894*5d9d9091SRichard Lowe	mov    0x4e(%rdx),%r10
2895*5d9d9091SRichard Lowe	mov    %r10,0x4e(%rcx)
2896*5d9d9091SRichard LoweL(bkP6Q9):
2897*5d9d9091SRichard Lowe	mov    0x46(%rdx),%r9
2898*5d9d9091SRichard Lowe	mov    %r9,0x46(%rcx)
2899*5d9d9091SRichard LoweL(bkP6Q8):
2900*5d9d9091SRichard Lowe	mov    0x3e(%rdx),%r11
2901*5d9d9091SRichard Lowe	mov    %r11,0x3e(%rcx)
2902*5d9d9091SRichard LoweL(bkP6Q7):
2903*5d9d9091SRichard Lowe	mov    0x36(%rdx),%r10
2904*5d9d9091SRichard Lowe	mov    %r10,0x36(%rcx)
2905*5d9d9091SRichard LoweL(bkP6Q6):
2906*5d9d9091SRichard Lowe	mov    0x2e(%rdx),%r9
2907*5d9d9091SRichard Lowe	mov    %r9,0x2e(%rcx)
2908*5d9d9091SRichard LoweL(bkP6Q5):
2909*5d9d9091SRichard Lowe	mov    0x26(%rdx),%r11
2910*5d9d9091SRichard Lowe	mov    %r11,0x26(%rcx)
2911*5d9d9091SRichard LoweL(bkP6Q4):
2912*5d9d9091SRichard Lowe	mov    0x1e(%rdx),%r10
2913*5d9d9091SRichard Lowe	mov    %r10,0x1e(%rcx)
2914*5d9d9091SRichard LoweL(bkP6Q3):
2915*5d9d9091SRichard Lowe	mov    0x16(%rdx),%r9
2916*5d9d9091SRichard Lowe	mov    %r9,0x16(%rcx)
2917*5d9d9091SRichard LoweL(bkP6Q2):
2918*5d9d9091SRichard Lowe	mov    0xe(%rdx),%r11
2919*5d9d9091SRichard Lowe	mov    %r11,0xe(%rcx)
2920*5d9d9091SRichard LoweL(bkP6Q1):
2921*5d9d9091SRichard Lowe	mov    0x6(%rdx),%r10
2922*5d9d9091SRichard Lowe	mov    %r10,0x6(%rcx)
2923*5d9d9091SRichard LoweL(bkP6Q0): # trailing loads/stores do all their loads first, then do the stores
2924*5d9d9091SRichard Lowe	mov    0x2(%rdx),%r9d
2925*5d9d9091SRichard Lowe	mov    %r9d,0x2(%rcx)
2926*5d9d9091SRichard Lowe	mov    (%rdx),%r10w
2927*5d9d9091SRichard Lowe	mov    %r10w,(%rcx)
2928*5d9d9091SRichard Lowe	ret
2929*5d9d9091SRichard Lowe
2930*5d9d9091SRichard Lowe	.balign 16
2931*5d9d9091SRichard LoweL(bkP7QI):
2932*5d9d9091SRichard Lowe	mov    0x8f(%rdx),%r10
2933*5d9d9091SRichard Lowe	mov    %r10,0x8f(%rcx)
2934*5d9d9091SRichard LoweL(bkP7QH):
2935*5d9d9091SRichard Lowe	mov    0x87(%rdx),%r11
2936*5d9d9091SRichard Lowe	mov    %r11,0x87(%rcx)
2937*5d9d9091SRichard LoweL(bkP7QG):
2938*5d9d9091SRichard Lowe	mov    0x7f(%rdx),%r10
2939*5d9d9091SRichard Lowe	mov    %r10,0x7f(%rcx)
2940*5d9d9091SRichard LoweL(bkP7QF):
2941*5d9d9091SRichard Lowe	mov    0x77(%rdx),%r9
2942*5d9d9091SRichard Lowe	mov    %r9,0x77(%rcx)
2943*5d9d9091SRichard LoweL(bkP7QE):
2944*5d9d9091SRichard Lowe	mov    0x6f(%rdx),%r11
2945*5d9d9091SRichard Lowe	mov    %r11,0x6f(%rcx)
2946*5d9d9091SRichard LoweL(bkP7QD):
2947*5d9d9091SRichard Lowe	mov    0x67(%rdx),%r10
2948*5d9d9091SRichard Lowe	mov    %r10,0x67(%rcx)
2949*5d9d9091SRichard LoweL(bkP7QC):
2950*5d9d9091SRichard Lowe	mov    0x5f(%rdx),%r9
2951*5d9d9091SRichard Lowe	mov    %r9,0x5f(%rcx)
2952*5d9d9091SRichard LoweL(bkP7QB):
2953*5d9d9091SRichard Lowe	mov    0x57(%rdx),%r11
2954*5d9d9091SRichard Lowe	mov    %r11,0x57(%rcx)
2955*5d9d9091SRichard LoweL(bkP7QA):
2956*5d9d9091SRichard Lowe	mov    0x4f(%rdx),%r10
2957*5d9d9091SRichard Lowe	mov    %r10,0x4f(%rcx)
2958*5d9d9091SRichard LoweL(bkP7Q9):
2959*5d9d9091SRichard Lowe	mov    0x47(%rdx),%r9
2960*5d9d9091SRichard Lowe	mov    %r9,0x47(%rcx)
2961*5d9d9091SRichard LoweL(bkP7Q8):
2962*5d9d9091SRichard Lowe	mov    0x3f(%rdx),%r11
2963*5d9d9091SRichard Lowe	mov    %r11,0x3f(%rcx)
2964*5d9d9091SRichard LoweL(bkP7Q7):
2965*5d9d9091SRichard Lowe	mov    0x37(%rdx),%r10
2966*5d9d9091SRichard Lowe	mov    %r10,0x37(%rcx)
2967*5d9d9091SRichard LoweL(bkP7Q6):
2968*5d9d9091SRichard Lowe	mov    0x2f(%rdx),%r9
2969*5d9d9091SRichard Lowe	mov    %r9,0x2f(%rcx)
2970*5d9d9091SRichard LoweL(bkP7Q5):
2971*5d9d9091SRichard Lowe	mov    0x27(%rdx),%r11
2972*5d9d9091SRichard Lowe	mov    %r11,0x27(%rcx)
2973*5d9d9091SRichard LoweL(bkP7Q4):
2974*5d9d9091SRichard Lowe	mov    0x1f(%rdx),%r10
2975*5d9d9091SRichard Lowe	mov    %r10,0x1f(%rcx)
2976*5d9d9091SRichard LoweL(bkP7Q3):
2977*5d9d9091SRichard Lowe	mov    0x17(%rdx),%r9
2978*5d9d9091SRichard Lowe	mov    %r9,0x17(%rcx)
2979*5d9d9091SRichard LoweL(bkP7Q2):
2980*5d9d9091SRichard Lowe	mov    0xf(%rdx),%r11
2981*5d9d9091SRichard Lowe	mov    %r11,0xf(%rcx)
2982*5d9d9091SRichard LoweL(bkP7Q1):
2983*5d9d9091SRichard Lowe	mov    0x7(%rdx),%r10
2984*5d9d9091SRichard Lowe	mov    %r10,0x7(%rcx)
2985*5d9d9091SRichard LoweL(bkP7Q0): # trailing loads/stores do all their loads first, then do the stores
2986*5d9d9091SRichard Lowe	mov    0x3(%rdx),%r9d
2987*5d9d9091SRichard Lowe	mov    %r9d,0x3(%rcx)
2988*5d9d9091SRichard Lowe	mov    0x1(%rdx),%r10w
2989*5d9d9091SRichard Lowe	mov    %r10w,0x1(%rcx)
2990*5d9d9091SRichard Lowe	mov    (%rdx),%r11b
2991*5d9d9091SRichard Lowe	mov    %r11b,(%rcx)
2992*5d9d9091SRichard Lowe	ret
2993*5d9d9091SRichard Lowe
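/*
 * bkPxQx: 32-bit offsets, relative to L(bkPxQx) itself, indexed by the
 * remaining byte count n; entry n is L(bkP<n%8>Q<n/8>).
 */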
2994*5d9d9091SRichard Lowe		.balign 16
2995*5d9d9091SRichard LoweL(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
2996*5d9d9091SRichard Lowe		.int L(bkP1Q0)-L(bkPxQx)
2997*5d9d9091SRichard Lowe		.int L(bkP2Q0)-L(bkPxQx)
2998*5d9d9091SRichard Lowe		.int L(bkP3Q0)-L(bkPxQx)
2999*5d9d9091SRichard Lowe		.int L(bkP4Q0)-L(bkPxQx)
3000*5d9d9091SRichard Lowe		.int L(bkP5Q0)-L(bkPxQx)
3001*5d9d9091SRichard Lowe		.int L(bkP6Q0)-L(bkPxQx)
3002*5d9d9091SRichard Lowe		.int L(bkP7Q0)-L(bkPxQx)
3003*5d9d9091SRichard Lowe
3004*5d9d9091SRichard Lowe		.int L(bkP0Q1)-L(bkPxQx)
3005*5d9d9091SRichard Lowe		.int L(bkP1Q1)-L(bkPxQx)
3006*5d9d9091SRichard Lowe		.int L(bkP2Q1)-L(bkPxQx)
3007*5d9d9091SRichard Lowe		.int L(bkP3Q1)-L(bkPxQx)
3008*5d9d9091SRichard Lowe		.int L(bkP4Q1)-L(bkPxQx)
3009*5d9d9091SRichard Lowe		.int L(bkP5Q1)-L(bkPxQx)
3010*5d9d9091SRichard Lowe		.int L(bkP6Q1)-L(bkPxQx)
3011*5d9d9091SRichard Lowe		.int L(bkP7Q1)-L(bkPxQx)
3012*5d9d9091SRichard Lowe
3013*5d9d9091SRichard Lowe		.int L(bkP0Q2)-L(bkPxQx)
3014*5d9d9091SRichard Lowe		.int L(bkP1Q2)-L(bkPxQx)
3015*5d9d9091SRichard Lowe		.int L(bkP2Q2)-L(bkPxQx)
3016*5d9d9091SRichard Lowe		.int L(bkP3Q2)-L(bkPxQx)
3017*5d9d9091SRichard Lowe		.int L(bkP4Q2)-L(bkPxQx)
3018*5d9d9091SRichard Lowe		.int L(bkP5Q2)-L(bkPxQx)
3019*5d9d9091SRichard Lowe		.int L(bkP6Q2)-L(bkPxQx)
3020*5d9d9091SRichard Lowe		.int L(bkP7Q2)-L(bkPxQx)
3021*5d9d9091SRichard Lowe
3022*5d9d9091SRichard Lowe		.int L(bkP0Q3)-L(bkPxQx)
3023*5d9d9091SRichard Lowe		.int L(bkP1Q3)-L(bkPxQx)
3024*5d9d9091SRichard Lowe		.int L(bkP2Q3)-L(bkPxQx)
3025*5d9d9091SRichard Lowe		.int L(bkP3Q3)-L(bkPxQx)
3026*5d9d9091SRichard Lowe		.int L(bkP4Q3)-L(bkPxQx)
3027*5d9d9091SRichard Lowe		.int L(bkP5Q3)-L(bkPxQx)
3028*5d9d9091SRichard Lowe		.int L(bkP6Q3)-L(bkPxQx)
3029*5d9d9091SRichard Lowe		.int L(bkP7Q3)-L(bkPxQx)
3030*5d9d9091SRichard Lowe
3031*5d9d9091SRichard Lowe		.int L(bkP0Q4)-L(bkPxQx)
3032*5d9d9091SRichard Lowe		.int L(bkP1Q4)-L(bkPxQx)
3033*5d9d9091SRichard Lowe		.int L(bkP2Q4)-L(bkPxQx)
3034*5d9d9091SRichard Lowe		.int L(bkP3Q4)-L(bkPxQx)
3035*5d9d9091SRichard Lowe		.int L(bkP4Q4)-L(bkPxQx)
3036*5d9d9091SRichard Lowe		.int L(bkP5Q4)-L(bkPxQx)
3037*5d9d9091SRichard Lowe		.int L(bkP6Q4)-L(bkPxQx)
3038*5d9d9091SRichard Lowe		.int L(bkP7Q4)-L(bkPxQx)
3039*5d9d9091SRichard Lowe
3040*5d9d9091SRichard Lowe		.int L(bkP0Q5)-L(bkPxQx)
3041*5d9d9091SRichard Lowe		.int L(bkP1Q5)-L(bkPxQx)
3042*5d9d9091SRichard Lowe		.int L(bkP2Q5)-L(bkPxQx)
3043*5d9d9091SRichard Lowe		.int L(bkP3Q5)-L(bkPxQx)
3044*5d9d9091SRichard Lowe		.int L(bkP4Q5)-L(bkPxQx)
3045*5d9d9091SRichard Lowe		.int L(bkP5Q5)-L(bkPxQx)
3046*5d9d9091SRichard Lowe		.int L(bkP6Q5)-L(bkPxQx)
3047*5d9d9091SRichard Lowe		.int L(bkP7Q5)-L(bkPxQx)
3048*5d9d9091SRichard Lowe
3049*5d9d9091SRichard Lowe		.int L(bkP0Q6)-L(bkPxQx)
3050*5d9d9091SRichard Lowe		.int L(bkP1Q6)-L(bkPxQx)
3051*5d9d9091SRichard Lowe		.int L(bkP2Q6)-L(bkPxQx)
3052*5d9d9091SRichard Lowe		.int L(bkP3Q6)-L(bkPxQx)
3053*5d9d9091SRichard Lowe		.int L(bkP4Q6)-L(bkPxQx)
3054*5d9d9091SRichard Lowe		.int L(bkP5Q6)-L(bkPxQx)
3055*5d9d9091SRichard Lowe		.int L(bkP6Q6)-L(bkPxQx)
3056*5d9d9091SRichard Lowe		.int L(bkP7Q6)-L(bkPxQx)
3057*5d9d9091SRichard Lowe
3058*5d9d9091SRichard Lowe		.int L(bkP0Q7)-L(bkPxQx)
3059*5d9d9091SRichard Lowe		.int L(bkP1Q7)-L(bkPxQx)
3060*5d9d9091SRichard Lowe		.int L(bkP2Q7)-L(bkPxQx)
3061*5d9d9091SRichard Lowe		.int L(bkP3Q7)-L(bkPxQx)
3062*5d9d9091SRichard Lowe		.int L(bkP4Q7)-L(bkPxQx)
3063*5d9d9091SRichard Lowe		.int L(bkP5Q7)-L(bkPxQx)
3064*5d9d9091SRichard Lowe		.int L(bkP6Q7)-L(bkPxQx)
3065*5d9d9091SRichard Lowe		.int L(bkP7Q7)-L(bkPxQx)
3066*5d9d9091SRichard Lowe
3067*5d9d9091SRichard Lowe		.int L(bkP0Q8)-L(bkPxQx)
3068*5d9d9091SRichard Lowe		.int L(bkP1Q8)-L(bkPxQx)
3069*5d9d9091SRichard Lowe		.int L(bkP2Q8)-L(bkPxQx)
3070*5d9d9091SRichard Lowe		.int L(bkP3Q8)-L(bkPxQx)
3071*5d9d9091SRichard Lowe		.int L(bkP4Q8)-L(bkPxQx)
3072*5d9d9091SRichard Lowe		.int L(bkP5Q8)-L(bkPxQx)
3073*5d9d9091SRichard Lowe		.int L(bkP6Q8)-L(bkPxQx)
3074*5d9d9091SRichard Lowe		.int L(bkP7Q8)-L(bkPxQx)
3075*5d9d9091SRichard Lowe
3076*5d9d9091SRichard Lowe		.int L(bkP0Q9)-L(bkPxQx)
3077*5d9d9091SRichard Lowe		.int L(bkP1Q9)-L(bkPxQx)
3078*5d9d9091SRichard Lowe		.int L(bkP2Q9)-L(bkPxQx)
3079*5d9d9091SRichard Lowe		.int L(bkP3Q9)-L(bkPxQx)
3080*5d9d9091SRichard Lowe		.int L(bkP4Q9)-L(bkPxQx)
3081*5d9d9091SRichard Lowe		.int L(bkP5Q9)-L(bkPxQx)
3082*5d9d9091SRichard Lowe		.int L(bkP6Q9)-L(bkPxQx)
3083*5d9d9091SRichard Lowe		.int L(bkP7Q9)-L(bkPxQx)
3084*5d9d9091SRichard Lowe
3085*5d9d9091SRichard Lowe		.int L(bkP0QA)-L(bkPxQx)
3086*5d9d9091SRichard Lowe		.int L(bkP1QA)-L(bkPxQx)
3087*5d9d9091SRichard Lowe		.int L(bkP2QA)-L(bkPxQx)
3088*5d9d9091SRichard Lowe		.int L(bkP3QA)-L(bkPxQx)
3089*5d9d9091SRichard Lowe		.int L(bkP4QA)-L(bkPxQx)
3090*5d9d9091SRichard Lowe		.int L(bkP5QA)-L(bkPxQx)
3091*5d9d9091SRichard Lowe		.int L(bkP6QA)-L(bkPxQx)
3092*5d9d9091SRichard Lowe		.int L(bkP7QA)-L(bkPxQx)
3093*5d9d9091SRichard Lowe
3094*5d9d9091SRichard Lowe		.int L(bkP0QB)-L(bkPxQx)
3095*5d9d9091SRichard Lowe		.int L(bkP1QB)-L(bkPxQx)
3096*5d9d9091SRichard Lowe		.int L(bkP2QB)-L(bkPxQx)
3097*5d9d9091SRichard Lowe		.int L(bkP3QB)-L(bkPxQx)
3098*5d9d9091SRichard Lowe		.int L(bkP4QB)-L(bkPxQx)
3099*5d9d9091SRichard Lowe		.int L(bkP5QB)-L(bkPxQx)
3100*5d9d9091SRichard Lowe		.int L(bkP6QB)-L(bkPxQx)
3101*5d9d9091SRichard Lowe		.int L(bkP7QB)-L(bkPxQx)
3102*5d9d9091SRichard Lowe
3103*5d9d9091SRichard Lowe		.int L(bkP0QC)-L(bkPxQx)
3104*5d9d9091SRichard Lowe		.int L(bkP1QC)-L(bkPxQx)
3105*5d9d9091SRichard Lowe		.int L(bkP2QC)-L(bkPxQx)
3106*5d9d9091SRichard Lowe		.int L(bkP3QC)-L(bkPxQx)
3107*5d9d9091SRichard Lowe		.int L(bkP4QC)-L(bkPxQx)
3108*5d9d9091SRichard Lowe		.int L(bkP5QC)-L(bkPxQx)
3109*5d9d9091SRichard Lowe		.int L(bkP6QC)-L(bkPxQx)
3110*5d9d9091SRichard Lowe		.int L(bkP7QC)-L(bkPxQx)
3111*5d9d9091SRichard Lowe
3112*5d9d9091SRichard Lowe		.int L(bkP0QD)-L(bkPxQx)
3113*5d9d9091SRichard Lowe		.int L(bkP1QD)-L(bkPxQx)
3114*5d9d9091SRichard Lowe		.int L(bkP2QD)-L(bkPxQx)
3115*5d9d9091SRichard Lowe		.int L(bkP3QD)-L(bkPxQx)
3116*5d9d9091SRichard Lowe		.int L(bkP4QD)-L(bkPxQx)
3117*5d9d9091SRichard Lowe		.int L(bkP5QD)-L(bkPxQx)
3118*5d9d9091SRichard Lowe		.int L(bkP6QD)-L(bkPxQx)
3119*5d9d9091SRichard Lowe		.int L(bkP7QD)-L(bkPxQx)
3120*5d9d9091SRichard Lowe
3121*5d9d9091SRichard Lowe		.int L(bkP0QE)-L(bkPxQx)
3122*5d9d9091SRichard Lowe		.int L(bkP1QE)-L(bkPxQx)
3123*5d9d9091SRichard Lowe		.int L(bkP2QE)-L(bkPxQx)
3124*5d9d9091SRichard Lowe		.int L(bkP3QE)-L(bkPxQx)
3125*5d9d9091SRichard Lowe		.int L(bkP4QE)-L(bkPxQx)
3126*5d9d9091SRichard Lowe		.int L(bkP5QE)-L(bkPxQx)
3127*5d9d9091SRichard Lowe		.int L(bkP6QE)-L(bkPxQx)
3128*5d9d9091SRichard Lowe		.int L(bkP7QE)-L(bkPxQx)
3129*5d9d9091SRichard Lowe
3130*5d9d9091SRichard Lowe		.int L(bkP0QF)-L(bkPxQx)
3131*5d9d9091SRichard Lowe		.int L(bkP1QF)-L(bkPxQx)
3132*5d9d9091SRichard Lowe		.int L(bkP2QF)-L(bkPxQx)
3133*5d9d9091SRichard Lowe		.int L(bkP3QF)-L(bkPxQx)
3134*5d9d9091SRichard Lowe		.int L(bkP4QF)-L(bkPxQx)
3135*5d9d9091SRichard Lowe		.int L(bkP5QF)-L(bkPxQx)
3136*5d9d9091SRichard Lowe		.int L(bkP6QF)-L(bkPxQx)
3137*5d9d9091SRichard Lowe		.int L(bkP7QF)-L(bkPxQx)
3138*5d9d9091SRichard Lowe
3139*5d9d9091SRichard Lowe		.int L(bkP0QG)-L(bkPxQx)
3140*5d9d9091SRichard Lowe		.int L(bkP1QG)-L(bkPxQx)
3141*5d9d9091SRichard Lowe		.int L(bkP2QG)-L(bkPxQx)
3142*5d9d9091SRichard Lowe		.int L(bkP3QG)-L(bkPxQx)
3143*5d9d9091SRichard Lowe		.int L(bkP4QG)-L(bkPxQx)
3144*5d9d9091SRichard Lowe		.int L(bkP5QG)-L(bkPxQx)
3145*5d9d9091SRichard Lowe		.int L(bkP6QG)-L(bkPxQx)
3146*5d9d9091SRichard Lowe		.int L(bkP7QG)-L(bkPxQx)
3147*5d9d9091SRichard Lowe
3148*5d9d9091SRichard Lowe		.int L(bkP0QH)-L(bkPxQx)
3149*5d9d9091SRichard Lowe		.int L(bkP1QH)-L(bkPxQx)
3150*5d9d9091SRichard Lowe		.int L(bkP2QH)-L(bkPxQx)
3151*5d9d9091SRichard Lowe		.int L(bkP3QH)-L(bkPxQx)
3152*5d9d9091SRichard Lowe		.int L(bkP4QH)-L(bkPxQx)
3153*5d9d9091SRichard Lowe		.int L(bkP5QH)-L(bkPxQx)
3154*5d9d9091SRichard Lowe		.int L(bkP6QH)-L(bkPxQx)
3155*5d9d9091SRichard Lowe		.int L(bkP7QH)-L(bkPxQx)
3156*5d9d9091SRichard Lowe
3157*5d9d9091SRichard Lowe		.int L(bkP0QI)-L(bkPxQx)
3158*5d9d9091SRichard Lowe		.int L(bkP1QI)-L(bkPxQx)
3159*5d9d9091SRichard Lowe		.int L(bkP2QI)-L(bkPxQx)
3160*5d9d9091SRichard Lowe		.int L(bkP3QI)-L(bkPxQx)
3161*5d9d9091SRichard Lowe		.int L(bkP4QI)-L(bkPxQx)
3162*5d9d9091SRichard Lowe		.int L(bkP5QI)-L(bkPxQx)
3163*5d9d9091SRichard Lowe		.int L(bkP6QI)-L(bkPxQx)
3164*5d9d9091SRichard Lowe		.int L(bkP7QI)-L(bkPxQx)
3165*5d9d9091SRichard Lowe
3166*5d9d9091SRichard Lowe	SET_SIZE(memmove)