xref: /titanic_44/usr/src/lib/libc/amd64/gen/memcpy.s (revision fad5204e207119133cdc503293923b09417b233b)
17c478bd9Sstevel@tonic-gate/*
2d0b3732eSbholler * CDDL HEADER START
3d0b3732eSbholler *
4d0b3732eSbholler * The contents of this file are subject to the terms of the
5d0b3732eSbholler * Common Development and Distribution License (the "License").
6d0b3732eSbholler * You may not use this file except in compliance with the License.
7d0b3732eSbholler *
8d0b3732eSbholler * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9d0b3732eSbholler * or http://www.opensolaris.org/os/licensing.
10d0b3732eSbholler * See the License for the specific language governing permissions
11d0b3732eSbholler * and limitations under the License.
12d0b3732eSbholler *
13d0b3732eSbholler * When distributing Covered Code, include this CDDL HEADER in each
14d0b3732eSbholler * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15d0b3732eSbholler * If applicable, add the following below this CDDL HEADER, with the
16d0b3732eSbholler * fields enclosed by brackets "[]" replaced with your own identifying
17d0b3732eSbholler * information: Portions Copyright [yyyy] [name of copyright owner]
18d0b3732eSbholler *
19d0b3732eSbholler * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate
227c478bd9Sstevel@tonic-gate/*
23*fad5204eSbostrovs * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
247257d1b4Sraf * Use is subject to license terms.
257257d1b4Sraf */
267257d1b4Sraf
277257d1b4Sraf/*
28d0b3732eSbholler * Copyright (c) 2008, Intel Corporation
297c478bd9Sstevel@tonic-gate * All rights reserved.
307c478bd9Sstevel@tonic-gate */
317c478bd9Sstevel@tonic-gate
32d0b3732eSbholler/*
33d0b3732eSbholler * memcpy.s - copies one block of memory to another
34d0b3732eSbholler *	Implements memcpy() and memmove() libc primitives.
35d0b3732eSbholler */
367257d1b4Sraf
379a70fc3bSMark J. Nelson	.file	"memcpy.s"
387c478bd9Sstevel@tonic-gate
397c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
407257d1b4Sraf
417c478bd9Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memmove,function)
427c478bd9Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memcpy,function)
437c478bd9Sstevel@tonic-gate
447c478bd9Sstevel@tonic-gate#include "cache.h"
45d0b3732eSbholler#include "proc64_id.h"
467c478bd9Sstevel@tonic-gate
47d0b3732eSbholler#define L(s) .memcpy/**/s
487c478bd9Sstevel@tonic-gate
49d0b3732eSbholler/*
50d0b3732eSbholler * memcpy algorithm overview:
51d0b3732eSbholler *
52d0b3732eSbholler * Thresholds used below were determined experimentally.
53d0b3732eSbholler *
54d0b3732eSbholler * Pseudo code:
55d0b3732eSbholler *
56*fad5204eSbostrovs * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
57*fad5204eSbostrovs * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
58*fad5204eSbostrovs * future AMD processors.
59*fad5204eSbostrovs *
60*fad5204eSbostrovs *
61d0b3732eSbholler * If (size <= 128 bytes) {
62d0b3732eSbholler *	do unrolled code (primarily 8-byte loads/stores) regardless of
63d0b3732eSbholler *	alignment.
64d0b3732eSbholler * } else {
65d0b3732eSbholler *	Align destination to 16-byte boundary
66d0b3732eSbholler *
67d0b3732eSbholler *      if (NO_SSE) {
68d0b3732eSbholler *		If (size > half of the largest level cache) {
69d0b3732eSbholler *			Use 8-byte non-temporal stores (64-bytes/loop)
70d0b3732eSbholler *		} else {
71d0b3732eSbholler *			if (size > 4K && size <= half l1 cache size) {
72d0b3732eSbholler *				Use rep movsq
73d0b3732eSbholler *			} else {
74d0b3732eSbholler *				Use 8-byte loads/stores (64 bytes per loop)
75d0b3732eSbholler *			}
76d0b3732eSbholler *		}
77d0b3732eSbholler *
78d0b3732eSbholler *	} else { **USE SSE**
79d0b3732eSbholler *		If (size > half of the largest level cache) {
80d0b3732eSbholler *			Use 16-byte non-temporal stores (128-bytes per loop)
81d0b3732eSbholler *		} else {
82d0b3732eSbholler *			If (both source and destination are aligned) {
83d0b3732eSbholler *			    Use 16-byte aligned loads and stores (128 bytes/loop)
84d0b3732eSbholler *			} else {
85d0b3732eSbholler *			    use pairs of xmm registers with SSE2 or SSSE3
86d0b3732eSbholler *			    instructions to concatenate and shift appropriately
87d0b3732eSbholler *			    to account for source unalignment. This enables
88d0b3732eSbholler *			    16-byte aligned loads to be done.
89d0b3732eSbholler *			}
90d0b3732eSbholler *		}
91d0b3732eSbholler *	}
92d0b3732eSbholler *
93d0b3732eSbholler *	Finish any remaining bytes via unrolled code above.
94d0b3732eSbholler * }
95d0b3732eSbholler *
96d0b3732eSbholler * memmove overview:
97d0b3732eSbholler *	memmove is the same as memcpy except one case where copy needs to be
98d0b3732eSbholler *	done backwards. The copy backwards code is done in a similar manner.
99d0b3732eSbholler */
1007c478bd9Sstevel@tonic-gate
/*
 * memmove entry point.
 *
 * On entry: %rdi = dst, %rsi = src, %rdx = len.
 *
 * Only selects the copy direction, using unsigned address compares:
 *   - dst <= src:             branch to L(CopyForward) (shared with memcpy)
 *   - src < dst < src + len:  branch to L(CopyBackwards)
 *   - dst >= src + len:       branch to L(CopyForward)
 * When the second compare is reached, %r9 holds src + len.
 */
101d0b3732eSbholler	ENTRY(memmove)
102d0b3732eSbholler	cmp	%rsi,%rdi		# if dst <= src
103d0b3732eSbholler	jbe	L(CopyForward)		# then do copy forward
104d0b3732eSbholler	mov	%rsi,%r9		# move src to r9
105d0b3732eSbholler	add	%rdx,%r9		# add len to get addr of end of src
106d0b3732eSbholler	cmp	%r9,%rdi		# if dst < end of src
107d0b3732eSbholler	jb	L(CopyBackwards)	# then do copy backwards
108d0b3732eSbholler	jmp	L(CopyForward)		# dst is at or past end of src
1097c478bd9Sstevel@tonic-gate
/*
 * memcpy entry point.  L(CopyForward) is also the forward-copy target
 * used by memmove.
 *
 * On entry: %rdi = dst, %rsi = src, %rdx = len.
 *
 * Register roles established here:
 *	%r8  = len
 *	%rcx = dst cursor
 *	%rdx = src cursor
 *	%rax = dst, the value in the return register when the small-copy
 *	       sequences execute ret
 *	%r11 = address of the L(fwdPxQx) jump table
 *
 * Lengths above 128 branch to L(ck_use_sse2).  Otherwise both cursors are
 * advanced by len so that they point one past the end of their buffers,
 * and control is dispatched through L(fwdPxQx)[len].
 *
 * The length test is unsigned (ja).  len is a byte count; with a signed
 * test a length that has its top bit set would fall into the table
 * dispatch with an index outside the 129-entry table.
 */
110d0b3732eSbholler	ENTRY (memcpy)
111d0b3732eSbhollerL(CopyForward):
112d0b3732eSbholler	mov    %rdx,%r8			# r8 = len
113d0b3732eSbholler	mov    %rdi,%rcx		# rcx = dst cursor
114d0b3732eSbholler	mov    %rsi,%rdx		# rdx = src cursor
1157c478bd9Sstevel@tonic-gate	mov    %rdi,%rax		# rax = dst
116d0b3732eSbholler	lea    L(fwdPxQx)(%rip),%r11
117d0b3732eSbholler	cmp    $0x80,%r8		# 128
118d0b3732eSbholler	ja     L(ck_use_sse2)		# unsigned: len > 128
119d0b3732eSbholler	add    %r8,%rcx			# rcx = dst + len
120d0b3732eSbholler	add    %r8,%rdx			# rdx = src + len
1217c478bd9Sstevel@tonic-gate
122d0b3732eSbholler	movslq (%r11,%r8,4),%r10	# r10 = sign-extended fwdPxQx[len]
123d0b3732eSbholler	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
124d0b3732eSbholler	jmpq   *%r11
1257c478bd9Sstevel@tonic-gate
/*
 * Dispatch on the low four bits of the destination cursor (%rcx) through
 * the L(AliPxQx) table.  Index 0 goes straight to L(now_qw_aligned);
 * index k (1..15) goes to the stub that copies 16 - k bytes.
 * Writes %r9, %r10 and %r11.
 */
126d0b3732eSbholler	.balign 16
127d0b3732eSbhollerL(ShrtAlignNew):
128d0b3732eSbholler	lea    L(AliPxQx)(%rip),%r11
129d0b3732eSbholler	mov    %rcx,%r9
130d0b3732eSbholler	and    $0xf,%r9			# r9 = dst & 0xf
1317c478bd9Sstevel@tonic-gate
132d0b3732eSbholler	movslq (%r11,%r9,4),%r10	# r10 = sign-extended AliPxQx[r9]
133d0b3732eSbholler	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
134d0b3732eSbholler	jmpq   *%r11
1357c478bd9Sstevel@tonic-gate
/*
 * Jump table for the small-copy dispatch in L(CopyForward), indexed by
 * length (0..128).  Each entry is a 32-bit offset of its target from
 * L(fwdPxQx).  Entry n refers to L(PxQy) with x = n % 8 and y = n / 8,
 * where y is written as one hex digit and G stands for 16.
 */
136d0b3732eSbholler	.balign 16
137d0b3732eSbhollerL(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
138d0b3732eSbholler           .int        L(P1Q0)-L(fwdPxQx)
139d0b3732eSbholler           .int        L(P2Q0)-L(fwdPxQx)
140d0b3732eSbholler           .int        L(P3Q0)-L(fwdPxQx)
141d0b3732eSbholler           .int        L(P4Q0)-L(fwdPxQx)
142d0b3732eSbholler           .int        L(P5Q0)-L(fwdPxQx)
143d0b3732eSbholler           .int        L(P6Q0)-L(fwdPxQx)
144d0b3732eSbholler           .int        L(P7Q0)-L(fwdPxQx)
1457c478bd9Sstevel@tonic-gate
146d0b3732eSbholler           .int        L(P0Q1)-L(fwdPxQx)
147d0b3732eSbholler           .int        L(P1Q1)-L(fwdPxQx)
148d0b3732eSbholler           .int        L(P2Q1)-L(fwdPxQx)
149d0b3732eSbholler           .int        L(P3Q1)-L(fwdPxQx)
150d0b3732eSbholler           .int        L(P4Q1)-L(fwdPxQx)
151d0b3732eSbholler           .int        L(P5Q1)-L(fwdPxQx)
152d0b3732eSbholler           .int        L(P6Q1)-L(fwdPxQx)
153d0b3732eSbholler           .int        L(P7Q1)-L(fwdPxQx)
1547c478bd9Sstevel@tonic-gate
155d0b3732eSbholler           .int        L(P0Q2)-L(fwdPxQx)
156d0b3732eSbholler           .int        L(P1Q2)-L(fwdPxQx)
157d0b3732eSbholler           .int        L(P2Q2)-L(fwdPxQx)
158d0b3732eSbholler           .int        L(P3Q2)-L(fwdPxQx)
159d0b3732eSbholler           .int        L(P4Q2)-L(fwdPxQx)
160d0b3732eSbholler           .int        L(P5Q2)-L(fwdPxQx)
161d0b3732eSbholler           .int        L(P6Q2)-L(fwdPxQx)
162d0b3732eSbholler           .int        L(P7Q2)-L(fwdPxQx)
1637c478bd9Sstevel@tonic-gate
164d0b3732eSbholler           .int        L(P0Q3)-L(fwdPxQx)
165d0b3732eSbholler           .int        L(P1Q3)-L(fwdPxQx)
166d0b3732eSbholler           .int        L(P2Q3)-L(fwdPxQx)
167d0b3732eSbholler           .int        L(P3Q3)-L(fwdPxQx)
168d0b3732eSbholler           .int        L(P4Q3)-L(fwdPxQx)
169d0b3732eSbholler           .int        L(P5Q3)-L(fwdPxQx)
170d0b3732eSbholler           .int        L(P6Q3)-L(fwdPxQx)
171d0b3732eSbholler           .int        L(P7Q3)-L(fwdPxQx)
1727c478bd9Sstevel@tonic-gate
173d0b3732eSbholler           .int        L(P0Q4)-L(fwdPxQx)
174d0b3732eSbholler           .int        L(P1Q4)-L(fwdPxQx)
175d0b3732eSbholler           .int        L(P2Q4)-L(fwdPxQx)
176d0b3732eSbholler           .int        L(P3Q4)-L(fwdPxQx)
177d0b3732eSbholler           .int        L(P4Q4)-L(fwdPxQx)
178d0b3732eSbholler           .int        L(P5Q4)-L(fwdPxQx)
179d0b3732eSbholler           .int        L(P6Q4)-L(fwdPxQx)
180d0b3732eSbholler           .int        L(P7Q4)-L(fwdPxQx)
1817c478bd9Sstevel@tonic-gate
182d0b3732eSbholler           .int        L(P0Q5)-L(fwdPxQx)
183d0b3732eSbholler           .int        L(P1Q5)-L(fwdPxQx)
184d0b3732eSbholler           .int        L(P2Q5)-L(fwdPxQx)
185d0b3732eSbholler           .int        L(P3Q5)-L(fwdPxQx)
186d0b3732eSbholler           .int        L(P4Q5)-L(fwdPxQx)
187d0b3732eSbholler           .int        L(P5Q5)-L(fwdPxQx)
188d0b3732eSbholler           .int        L(P6Q5)-L(fwdPxQx)
189d0b3732eSbholler           .int        L(P7Q5)-L(fwdPxQx)
1907c478bd9Sstevel@tonic-gate
191d0b3732eSbholler           .int        L(P0Q6)-L(fwdPxQx)
192d0b3732eSbholler           .int        L(P1Q6)-L(fwdPxQx)
193d0b3732eSbholler           .int        L(P2Q6)-L(fwdPxQx)
194d0b3732eSbholler           .int        L(P3Q6)-L(fwdPxQx)
195d0b3732eSbholler           .int        L(P4Q6)-L(fwdPxQx)
196d0b3732eSbholler           .int        L(P5Q6)-L(fwdPxQx)
197d0b3732eSbholler           .int        L(P6Q6)-L(fwdPxQx)
198d0b3732eSbholler           .int        L(P7Q6)-L(fwdPxQx)
1997c478bd9Sstevel@tonic-gate
200d0b3732eSbholler           .int        L(P0Q7)-L(fwdPxQx)
201d0b3732eSbholler           .int        L(P1Q7)-L(fwdPxQx)
202d0b3732eSbholler           .int        L(P2Q7)-L(fwdPxQx)
203d0b3732eSbholler           .int        L(P3Q7)-L(fwdPxQx)
204d0b3732eSbholler           .int        L(P4Q7)-L(fwdPxQx)
205d0b3732eSbholler           .int        L(P5Q7)-L(fwdPxQx)
206d0b3732eSbholler           .int        L(P6Q7)-L(fwdPxQx)
207d0b3732eSbholler           .int        L(P7Q7)-L(fwdPxQx)
2087c478bd9Sstevel@tonic-gate
209d0b3732eSbholler           .int        L(P0Q8)-L(fwdPxQx)
210d0b3732eSbholler           .int        L(P1Q8)-L(fwdPxQx)
211d0b3732eSbholler           .int        L(P2Q8)-L(fwdPxQx)
212d0b3732eSbholler           .int        L(P3Q8)-L(fwdPxQx)
213d0b3732eSbholler           .int        L(P4Q8)-L(fwdPxQx)
214d0b3732eSbholler           .int        L(P5Q8)-L(fwdPxQx)
215d0b3732eSbholler           .int        L(P6Q8)-L(fwdPxQx)
216d0b3732eSbholler           .int        L(P7Q8)-L(fwdPxQx)
2177c478bd9Sstevel@tonic-gate
218d0b3732eSbholler           .int        L(P0Q9)-L(fwdPxQx)
219d0b3732eSbholler           .int        L(P1Q9)-L(fwdPxQx)
220d0b3732eSbholler           .int        L(P2Q9)-L(fwdPxQx)
221d0b3732eSbholler           .int        L(P3Q9)-L(fwdPxQx)
222d0b3732eSbholler           .int        L(P4Q9)-L(fwdPxQx)
223d0b3732eSbholler           .int        L(P5Q9)-L(fwdPxQx)
224d0b3732eSbholler           .int        L(P6Q9)-L(fwdPxQx)
225d0b3732eSbholler           .int        L(P7Q9)-L(fwdPxQx)
2267c478bd9Sstevel@tonic-gate
227d0b3732eSbholler           .int        L(P0QA)-L(fwdPxQx)
228d0b3732eSbholler           .int        L(P1QA)-L(fwdPxQx)
229d0b3732eSbholler           .int        L(P2QA)-L(fwdPxQx)
230d0b3732eSbholler           .int        L(P3QA)-L(fwdPxQx)
231d0b3732eSbholler           .int        L(P4QA)-L(fwdPxQx)
232d0b3732eSbholler           .int        L(P5QA)-L(fwdPxQx)
233d0b3732eSbholler           .int        L(P6QA)-L(fwdPxQx)
234d0b3732eSbholler           .int        L(P7QA)-L(fwdPxQx)
2357c478bd9Sstevel@tonic-gate
236d0b3732eSbholler           .int        L(P0QB)-L(fwdPxQx)
237d0b3732eSbholler           .int        L(P1QB)-L(fwdPxQx)
238d0b3732eSbholler           .int        L(P2QB)-L(fwdPxQx)
239d0b3732eSbholler           .int        L(P3QB)-L(fwdPxQx)
240d0b3732eSbholler           .int        L(P4QB)-L(fwdPxQx)
241d0b3732eSbholler           .int        L(P5QB)-L(fwdPxQx)
242d0b3732eSbholler           .int        L(P6QB)-L(fwdPxQx)
243d0b3732eSbholler           .int        L(P7QB)-L(fwdPxQx)
2447c478bd9Sstevel@tonic-gate
245d0b3732eSbholler           .int        L(P0QC)-L(fwdPxQx)
246d0b3732eSbholler           .int        L(P1QC)-L(fwdPxQx)
247d0b3732eSbholler           .int        L(P2QC)-L(fwdPxQx)
248d0b3732eSbholler           .int        L(P3QC)-L(fwdPxQx)
249d0b3732eSbholler           .int        L(P4QC)-L(fwdPxQx)
250d0b3732eSbholler           .int        L(P5QC)-L(fwdPxQx)
251d0b3732eSbholler           .int        L(P6QC)-L(fwdPxQx)
252d0b3732eSbholler           .int        L(P7QC)-L(fwdPxQx)
2537c478bd9Sstevel@tonic-gate
254d0b3732eSbholler           .int        L(P0QD)-L(fwdPxQx)
255d0b3732eSbholler           .int        L(P1QD)-L(fwdPxQx)
256d0b3732eSbholler           .int        L(P2QD)-L(fwdPxQx)
257d0b3732eSbholler           .int        L(P3QD)-L(fwdPxQx)
258d0b3732eSbholler           .int        L(P4QD)-L(fwdPxQx)
259d0b3732eSbholler           .int        L(P5QD)-L(fwdPxQx)
260d0b3732eSbholler           .int        L(P6QD)-L(fwdPxQx)
261d0b3732eSbholler           .int        L(P7QD)-L(fwdPxQx)
2627c478bd9Sstevel@tonic-gate
263d0b3732eSbholler           .int        L(P0QE)-L(fwdPxQx)
264d0b3732eSbholler           .int        L(P1QE)-L(fwdPxQx)
265d0b3732eSbholler           .int        L(P2QE)-L(fwdPxQx)
266d0b3732eSbholler           .int        L(P3QE)-L(fwdPxQx)
267d0b3732eSbholler           .int        L(P4QE)-L(fwdPxQx)
268d0b3732eSbholler           .int        L(P5QE)-L(fwdPxQx)
269d0b3732eSbholler           .int        L(P6QE)-L(fwdPxQx)
270d0b3732eSbholler           .int        L(P7QE)-L(fwdPxQx)
2717c478bd9Sstevel@tonic-gate
272d0b3732eSbholler           .int        L(P0QF)-L(fwdPxQx)
273d0b3732eSbholler           .int        L(P1QF)-L(fwdPxQx)
274d0b3732eSbholler           .int        L(P2QF)-L(fwdPxQx)
275d0b3732eSbholler           .int        L(P3QF)-L(fwdPxQx)
276d0b3732eSbholler           .int        L(P4QF)-L(fwdPxQx)
277d0b3732eSbholler           .int        L(P5QF)-L(fwdPxQx)
278d0b3732eSbholler           .int        L(P6QF)-L(fwdPxQx)
279d0b3732eSbholler           .int        L(P7QF)-L(fwdPxQx)
280d0b3732eSbholler
281d0b3732eSbholler           .int        L(P0QG)-L(fwdPxQx)	# 0x80
282d0b3732eSbholler
/*
 * Jump table used by L(ShrtAlignNew), indexed by (dst & 0xf).  Each entry
 * is a 32-bit offset of its target from L(AliPxQx).  Entry 0 is
 * L(now_qw_aligned); entry k (1..15) is the stub that copies 16 - k
 * bytes before jumping to L(now_qw_aligned).
 */
283d0b3732eSbholler	   .balign 16
284d0b3732eSbhollerL(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
285d0b3732eSbholler           .int        L(A1Q0)-L(AliPxQx)
286d0b3732eSbholler           .int        L(A2Q0)-L(AliPxQx)
287d0b3732eSbholler           .int        L(A3Q0)-L(AliPxQx)
288d0b3732eSbholler           .int        L(A4Q0)-L(AliPxQx)
289d0b3732eSbholler           .int        L(A5Q0)-L(AliPxQx)
290d0b3732eSbholler           .int        L(A6Q0)-L(AliPxQx)
291d0b3732eSbholler           .int        L(A7Q0)-L(AliPxQx)
292d0b3732eSbholler           .int        L(A0Q1)-L(AliPxQx)
293d0b3732eSbholler           .int        L(A1Q1)-L(AliPxQx)
294d0b3732eSbholler           .int        L(A2Q1)-L(AliPxQx)
295d0b3732eSbholler           .int        L(A3Q1)-L(AliPxQx)
296d0b3732eSbholler           .int        L(A4Q1)-L(AliPxQx)
297d0b3732eSbholler           .int        L(A5Q1)-L(AliPxQx)
298d0b3732eSbholler           .int        L(A6Q1)-L(AliPxQx)
299d0b3732eSbholler           .int        L(A7Q1)-L(AliPxQx)
300d0b3732eSbholler
/*
 * L(AliPxQx) entry 1: copy 15 bytes as 1 + 2 + 4 + 8, advance %rdx and
 * %rcx by 15, reduce %r8 by 15, then continue at L(now_qw_aligned).
 * Uses %r9, %r10 and %r11 as scratch.
 */
301d0b3732eSbholler	.balign 16
302d0b3732eSbhollerL(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
303d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
304d0b3732eSbholler	sub    $0xf,%r8
305d0b3732eSbholler	mov    %r11b,(%rcx)
306d0b3732eSbholler
307d0b3732eSbholler	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
308d0b3732eSbholler	mov    %r10w,0x1(%rcx)
309d0b3732eSbholler
310d0b3732eSbholler	mov    0x3(%rdx),%r9d		# 4 bytes at offset 3
311d0b3732eSbholler	mov    %r9d,0x3(%rcx)
312d0b3732eSbholler
313d0b3732eSbholler	mov    0x7(%rdx),%r11		# 8 bytes at offset 7
314d0b3732eSbholler	add    $0xf,%rdx
315d0b3732eSbholler	mov    %r11,0x7(%rcx)
316d0b3732eSbholler
317d0b3732eSbholler	add    $0xf,%rcx
318d0b3732eSbholler	jmp    L(now_qw_aligned)
319d0b3732eSbholler
/*
 * L(AliPxQx) entry 2: copy 14 bytes as 2 + 4 + 8, advance %rdx and %rcx
 * by 14, reduce %r8 by 14, then continue at L(now_qw_aligned).
 * Uses %r9, %r10 and %r11 as scratch.
 */
320d0b3732eSbholler	.balign 16
321d0b3732eSbhollerL(A2Q0):			# ; need to move 8+ 6=2+4 bytes
322d0b3732eSbholler	movzwq (%rdx),%r10		# 2 bytes at offset 0
323d0b3732eSbholler	sub    $0xe,%r8
324d0b3732eSbholler	mov    %r10w,(%rcx)
325d0b3732eSbholler
326d0b3732eSbholler	mov    0x2(%rdx),%r9d		# 4 bytes at offset 2
327d0b3732eSbholler	mov    %r9d,0x2(%rcx)
328d0b3732eSbholler
329d0b3732eSbholler	mov    0x6(%rdx),%r11		# 8 bytes at offset 6
330d0b3732eSbholler	add    $0xe,%rdx
331d0b3732eSbholler	mov    %r11,0x6(%rcx)
332d0b3732eSbholler	add    $0xe,%rcx
333d0b3732eSbholler	jmp    L(now_qw_aligned)
334d0b3732eSbholler
/*
 * L(AliPxQx) entry 3: copy 13 bytes as 1 + 4 + 8, advance %rdx and %rcx
 * by 13, reduce %r8 by 13, then continue at L(now_qw_aligned).
 * Uses %r9, %r10 and %r11 as scratch.
 */
335d0b3732eSbholler	.balign 16
336d0b3732eSbhollerL(A3Q0):			# ; need to move 8+ 5=1+4 bytes
337d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
338d0b3732eSbholler	sub    $0xd,%r8
339d0b3732eSbholler	mov    %r11b,(%rcx)
340d0b3732eSbholler
341d0b3732eSbholler	mov    0x1(%rdx),%r9d		# 4 bytes at offset 1
342d0b3732eSbholler	mov    %r9d,0x1(%rcx)
343d0b3732eSbholler
344d0b3732eSbholler	mov    0x5(%rdx),%r10		# 8 bytes at offset 5
345d0b3732eSbholler	add    $0xd,%rdx
346d0b3732eSbholler	mov    %r10,0x5(%rcx)
347d0b3732eSbholler
348d0b3732eSbholler	add    $0xd,%rcx
349d0b3732eSbholler	jmp    L(now_qw_aligned)
350d0b3732eSbholler
/*
 * L(AliPxQx) entry 4: copy 12 bytes as 4 + 8, advance %rdx and %rcx by
 * 12, reduce %r8 by 12, then continue at L(now_qw_aligned).
 * Uses %r9 and %r10 as scratch.
 */
351d0b3732eSbholler	.balign 16
352d0b3732eSbhollerL(A4Q0):			# ; need to move 8+4 bytes
353d0b3732eSbholler	mov    (%rdx),%r9d		# 4 bytes at offset 0
354d0b3732eSbholler	sub    $0xc,%r8
355d0b3732eSbholler	mov    %r9d,(%rcx)
356d0b3732eSbholler
357d0b3732eSbholler	mov    0x4(%rdx),%r10		# 8 bytes at offset 4
358d0b3732eSbholler	add    $0xc,%rdx
359d0b3732eSbholler	mov    %r10,0x4(%rcx)
360d0b3732eSbholler
361d0b3732eSbholler	add    $0xc,%rcx
362d0b3732eSbholler	jmp    L(now_qw_aligned)
363d0b3732eSbholler
/*
 * L(AliPxQx) entry 5: copy 11 bytes as 1 + 2 + 8, advance %rdx and %rcx
 * by 11, reduce %r8 by 11, then continue at L(now_qw_aligned).
 * Uses %r9, %r10 and %r11 as scratch.
 */
364d0b3732eSbholler	.balign 16
365d0b3732eSbhollerL(A5Q0):			# ; need to move 8+ 3=1+2 bytes
366d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
367d0b3732eSbholler	sub    $0xb,%r8
368d0b3732eSbholler	mov    %r11b,(%rcx)
369d0b3732eSbholler
370d0b3732eSbholler	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
371d0b3732eSbholler	mov    %r10w,0x1(%rcx)
372d0b3732eSbholler
373d0b3732eSbholler	mov    0x3(%rdx),%r9		# 8 bytes at offset 3
374d0b3732eSbholler	add    $0xb,%rdx
375d0b3732eSbholler	mov    %r9,0x3(%rcx)
376d0b3732eSbholler
377d0b3732eSbholler	add    $0xb,%rcx
378d0b3732eSbholler	jmp    L(now_qw_aligned)
379d0b3732eSbholler
/*
 * L(AliPxQx) entry 6: copy 10 bytes as 2 + 8, advance %rdx and %rcx by
 * 10, reduce %r8 by 10, then continue at L(now_qw_aligned).
 * Uses %r9 and %r10 as scratch.
 */
380d0b3732eSbholler	.balign 16
381d0b3732eSbhollerL(A6Q0):			# ; need to move 8+2 bytes
382d0b3732eSbholler	movzwq (%rdx),%r10		# 2 bytes at offset 0
383d0b3732eSbholler	sub    $0xa,%r8
384d0b3732eSbholler	mov    %r10w,(%rcx)
385d0b3732eSbholler
386d0b3732eSbholler	mov    0x2(%rdx),%r9		# 8 bytes at offset 2
387d0b3732eSbholler	add    $0xa,%rdx
388d0b3732eSbholler	mov    %r9,0x2(%rcx)
389d0b3732eSbholler
390d0b3732eSbholler	add    $0xa,%rcx
391d0b3732eSbholler	jmp    L(now_qw_aligned)
392d0b3732eSbholler
/*
 * L(AliPxQx) entry 7: copy 9 bytes as 1 + 8, advance %rdx and %rcx by 9,
 * reduce %r8 by 9, then continue at L(now_qw_aligned).
 * Uses %r10 and %r11 as scratch.
 */
393d0b3732eSbholler	.balign 16
394d0b3732eSbhollerL(A7Q0):			# ; need to move 8+1 byte
395d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
396d0b3732eSbholler	sub    $0x9,%r8
397d0b3732eSbholler	mov    %r11b,(%rcx)
398d0b3732eSbholler
399d0b3732eSbholler	mov    0x1(%rdx),%r10		# 8 bytes at offset 1
400d0b3732eSbholler	add    $0x9,%rdx
401d0b3732eSbholler	mov    %r10,0x1(%rcx)
402d0b3732eSbholler
403d0b3732eSbholler	add    $0x9,%rcx
404d0b3732eSbholler	jmp    L(now_qw_aligned)
405d0b3732eSbholler
/*
 * L(AliPxQx) entry 8: copy one 8-byte quadword, advance %rdx and %rcx by
 * 8, reduce %r8 by 8, then continue at L(now_qw_aligned).
 * Uses %r10 as scratch.
 */
406d0b3732eSbholler	.balign 16
407d0b3732eSbhollerL(A0Q1):			# ; need to move 8 bytes
408d0b3732eSbholler
409d0b3732eSbholler	mov    (%rdx),%r10
410d0b3732eSbholler	add    $0x8,%rdx
411d0b3732eSbholler	sub    $0x8,%r8
412d0b3732eSbholler	mov    %r10,(%rcx)
413d0b3732eSbholler
414d0b3732eSbholler	add    $0x8,%rcx
415d0b3732eSbholler	jmp    L(now_qw_aligned)
416d0b3732eSbholler
/*
 * L(AliPxQx) entry 9: copy 7 bytes as 1 + 2 + 4, advance %rdx and %rcx
 * by 7, reduce %r8 by 7, then continue at L(now_qw_aligned).
 * Uses %r9, %r10 and %r11 as scratch.
 */
417d0b3732eSbholler	.balign 16
418d0b3732eSbhollerL(A1Q1):			# ; need to move 7=1+2+4 bytes
419d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
420d0b3732eSbholler	sub    $0x7,%r8
421d0b3732eSbholler	mov    %r11b,(%rcx)
422d0b3732eSbholler
423d0b3732eSbholler	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
424d0b3732eSbholler	mov    %r10w,0x1(%rcx)
425d0b3732eSbholler
426d0b3732eSbholler	mov    0x3(%rdx),%r9d		# 4 bytes at offset 3
427d0b3732eSbholler	add    $0x7,%rdx
428d0b3732eSbholler	mov    %r9d,0x3(%rcx)
429d0b3732eSbholler	add    $0x7,%rcx
430d0b3732eSbholler	jmp    L(now_qw_aligned)
431d0b3732eSbholler
/*
 * L(AliPxQx) entry 10: copy 6 bytes as 2 + 4, advance %rdx and %rcx by 6,
 * reduce %r8 by 6, then continue at L(now_qw_aligned).
 * Uses %r9 and %r10 as scratch.
 */
432d0b3732eSbholler	.balign 16
433d0b3732eSbhollerL(A2Q1):			# ; need to move 6=2+4 bytes
434d0b3732eSbholler	movzwq (%rdx),%r10		# 2 bytes at offset 0
435d0b3732eSbholler	sub    $0x6,%r8
436d0b3732eSbholler	mov    %r10w,(%rcx)
437d0b3732eSbholler	mov    0x2(%rdx),%r9d		# 4 bytes at offset 2
438d0b3732eSbholler	add    $0x6,%rdx
439d0b3732eSbholler	mov    %r9d,0x2(%rcx)
440d0b3732eSbholler	add    $0x6,%rcx
441d0b3732eSbholler	jmp    L(now_qw_aligned)
442d0b3732eSbholler
/*
 * L(AliPxQx) entry 11: copy 5 bytes as 1 + 4, advance %rdx and %rcx by 5,
 * reduce %r8 by 5, then continue at L(now_qw_aligned).
 * Uses %r9 and %r11 as scratch.
 */
443d0b3732eSbholler	.balign 16
444d0b3732eSbhollerL(A3Q1):			# ; need to move 5=1+4 bytes
445d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
446d0b3732eSbholler	sub    $0x5,%r8
447d0b3732eSbholler	mov    %r11b,(%rcx)
448d0b3732eSbholler	mov    0x1(%rdx),%r9d		# 4 bytes at offset 1
449d0b3732eSbholler	add    $0x5,%rdx
450d0b3732eSbholler	mov    %r9d,0x1(%rcx)
451d0b3732eSbholler	add    $0x5,%rcx
452d0b3732eSbholler	jmp    L(now_qw_aligned)
453d0b3732eSbholler
/*
 * L(AliPxQx) entry 12: copy one 4-byte word, advance %rdx and %rcx by 4,
 * reduce %r8 by 4, then continue at L(now_qw_aligned).
 * Uses %r9 as scratch.
 */
454d0b3732eSbholler	.balign 16
455d0b3732eSbhollerL(A4Q1):			# ; need to move 4 bytes
456d0b3732eSbholler	mov    (%rdx),%r9d
457d0b3732eSbholler	sub    $0x4,%r8
458d0b3732eSbholler	add    $0x4,%rdx
459d0b3732eSbholler	mov    %r9d,(%rcx)
460d0b3732eSbholler	add    $0x4,%rcx
461d0b3732eSbholler	jmp    L(now_qw_aligned)
462d0b3732eSbholler
/*
 * L(AliPxQx) entry 13: copy 3 bytes as 1 + 2, advance %rdx and %rcx by 3,
 * reduce %r8 by 3, then continue at L(now_qw_aligned).
 * Uses %r10 and %r11 as scratch.
 */
463d0b3732eSbholler	.balign 16
464d0b3732eSbhollerL(A5Q1):			# ; need to move 3=1+2 bytes
465d0b3732eSbholler	movzbq (%rdx),%r11		# 1 byte at offset 0
466d0b3732eSbholler	sub    $0x3,%r8
467d0b3732eSbholler	mov    %r11b,(%rcx)
468d0b3732eSbholler
469d0b3732eSbholler	movzwq 0x1(%rdx),%r10		# 2 bytes at offset 1
470d0b3732eSbholler	add    $0x3,%rdx
471d0b3732eSbholler	mov    %r10w,0x1(%rcx)
472d0b3732eSbholler
473d0b3732eSbholler	add    $0x3,%rcx
474d0b3732eSbholler	jmp    L(now_qw_aligned)
475d0b3732eSbholler
/*
 * L(AliPxQx) entry 14: copy one 2-byte halfword, advance %rdx and %rcx by
 * 2, reduce %r8 by 2, then continue at L(now_qw_aligned).
 * Uses %r10 as scratch.
 */
476d0b3732eSbholler	.balign 16
477d0b3732eSbhollerL(A6Q1):			# ; need to move 2 bytes
478d0b3732eSbholler	movzwq (%rdx),%r10
479d0b3732eSbholler	sub    $0x2,%r8
480d0b3732eSbholler	add    $0x2,%rdx
481d0b3732eSbholler	mov    %r10w,(%rcx)
482d0b3732eSbholler	add    $0x2,%rcx
483d0b3732eSbholler	jmp    L(now_qw_aligned)
484d0b3732eSbholler
/*
 * L(AliPxQx) entry 15: copy a single byte, advance %rdx and %rcx by 1,
 * reduce %r8 by 1, then continue at L(now_qw_aligned).
 * Uses %r11 as scratch.
 */
485d0b3732eSbholler	.balign 16
486d0b3732eSbhollerL(A7Q1):			# ; need to move 1 byte
487d0b3732eSbholler	movzbq (%rdx),%r11
488d0b3732eSbholler	dec    %r8
489d0b3732eSbholler	inc    %rdx
490d0b3732eSbholler	mov    %r11b,(%rcx)
491d0b3732eSbholler	inc    %rcx
492d0b3732eSbholler	jmp    L(now_qw_aligned)
493d0b3732eSbholler
494d0b3732eSbholler
/*
 * Unrolled tail copy for lengths that are a multiple of 8 (0..128 bytes).
 * %rdx and %rcx point one past the end of the source and destination.
 * Entering at L(P0Qn) copies the last n quadwords (G = 16) in ascending
 * address order using negative offsets, alternating %r9 and %r10 as
 * scratch, and falls through to the ret at L(P0Q0).
 */
495d0b3732eSbholler	.balign 16
496d0b3732eSbhollerL(P0QG):
497d0b3732eSbholler	mov    -0x80(%rdx),%r9
498d0b3732eSbholler	mov    %r9,-0x80(%rcx)
499d0b3732eSbhollerL(P0QF):
500d0b3732eSbholler	mov    -0x78(%rdx),%r10
501d0b3732eSbholler	mov    %r10,-0x78(%rcx)
502d0b3732eSbhollerL(P0QE):
503d0b3732eSbholler	mov    -0x70(%rdx),%r9
504d0b3732eSbholler	mov    %r9,-0x70(%rcx)
505d0b3732eSbhollerL(P0QD):
506d0b3732eSbholler	mov    -0x68(%rdx),%r10
507d0b3732eSbholler	mov    %r10,-0x68(%rcx)
508d0b3732eSbhollerL(P0QC):
509d0b3732eSbholler	mov    -0x60(%rdx),%r9
510d0b3732eSbholler	mov    %r9,-0x60(%rcx)
511d0b3732eSbhollerL(P0QB):
512d0b3732eSbholler	mov    -0x58(%rdx),%r10
513d0b3732eSbholler	mov    %r10,-0x58(%rcx)
514d0b3732eSbhollerL(P0QA):
515d0b3732eSbholler	mov    -0x50(%rdx),%r9
516d0b3732eSbholler	mov    %r9,-0x50(%rcx)
517d0b3732eSbhollerL(P0Q9):
518d0b3732eSbholler	mov    -0x48(%rdx),%r10
519d0b3732eSbholler	mov    %r10,-0x48(%rcx)
520d0b3732eSbhollerL(P0Q8):
521d0b3732eSbholler	mov    -0x40(%rdx),%r9
522d0b3732eSbholler	mov    %r9,-0x40(%rcx)
523d0b3732eSbhollerL(P0Q7):
524d0b3732eSbholler	mov    -0x38(%rdx),%r10
525d0b3732eSbholler	mov    %r10,-0x38(%rcx)
526d0b3732eSbhollerL(P0Q6):
527d0b3732eSbholler	mov    -0x30(%rdx),%r9
528d0b3732eSbholler	mov    %r9,-0x30(%rcx)
529d0b3732eSbhollerL(P0Q5):
530d0b3732eSbholler	mov    -0x28(%rdx),%r10
531d0b3732eSbholler	mov    %r10,-0x28(%rcx)
532d0b3732eSbhollerL(P0Q4):
533d0b3732eSbholler	mov    -0x20(%rdx),%r9
534d0b3732eSbholler	mov    %r9,-0x20(%rcx)
535d0b3732eSbhollerL(P0Q3):
536d0b3732eSbholler	mov    -0x18(%rdx),%r10
537d0b3732eSbholler	mov    %r10,-0x18(%rcx)
538d0b3732eSbhollerL(P0Q2):
539d0b3732eSbholler	mov    -0x10(%rdx),%r9
540d0b3732eSbholler	mov    %r9,-0x10(%rcx)
541d0b3732eSbhollerL(P0Q1):
542d0b3732eSbholler	mov    -0x8(%rdx),%r10
543d0b3732eSbholler	mov    %r10,-0x8(%rcx)
544d0b3732eSbhollerL(P0Q0):
545d0b3732eSbholler	ret
546d0b3732eSbholler
/*
 * Unrolled tail copy for lengths of the form 8*n + 1 (1..121 bytes).
 * %rdx and %rcx point one past the end of the source and destination.
 * Entering at L(P1Qn) copies n quadwords in ascending address order
 * using negative offsets, rotating through %r9, %r11 and %r10, then
 * falls into L(P1Q0), which copies the final byte and returns.
 */
547d0b3732eSbholler	.balign 16
548d0b3732eSbhollerL(P1QF):
549d0b3732eSbholler	mov    -0x79(%rdx),%r9
550d0b3732eSbholler	mov    %r9,-0x79(%rcx)
551d0b3732eSbhollerL(P1QE):
552d0b3732eSbholler	mov    -0x71(%rdx),%r11
553d0b3732eSbholler	mov    %r11,-0x71(%rcx)
554d0b3732eSbhollerL(P1QD):
555d0b3732eSbholler	mov    -0x69(%rdx),%r10
556d0b3732eSbholler	mov    %r10,-0x69(%rcx)
557d0b3732eSbhollerL(P1QC):
558d0b3732eSbholler	mov    -0x61(%rdx),%r9
559d0b3732eSbholler	mov    %r9,-0x61(%rcx)
560d0b3732eSbhollerL(P1QB):
561d0b3732eSbholler	mov    -0x59(%rdx),%r11
562d0b3732eSbholler	mov    %r11,-0x59(%rcx)
563d0b3732eSbhollerL(P1QA):
564d0b3732eSbholler	mov    -0x51(%rdx),%r10
565d0b3732eSbholler	mov    %r10,-0x51(%rcx)
566d0b3732eSbhollerL(P1Q9):
567d0b3732eSbholler	mov    -0x49(%rdx),%r9
568d0b3732eSbholler	mov    %r9,-0x49(%rcx)
569d0b3732eSbhollerL(P1Q8):
570d0b3732eSbholler	mov    -0x41(%rdx),%r11
571d0b3732eSbholler	mov    %r11,-0x41(%rcx)
572d0b3732eSbhollerL(P1Q7):
573d0b3732eSbholler	mov    -0x39(%rdx),%r10
574d0b3732eSbholler	mov    %r10,-0x39(%rcx)
575d0b3732eSbhollerL(P1Q6):
576d0b3732eSbholler	mov    -0x31(%rdx),%r9
577d0b3732eSbholler	mov    %r9,-0x31(%rcx)
578d0b3732eSbhollerL(P1Q5):
579d0b3732eSbholler	mov    -0x29(%rdx),%r11
580d0b3732eSbholler	mov    %r11,-0x29(%rcx)
581d0b3732eSbhollerL(P1Q4):
582d0b3732eSbholler	mov    -0x21(%rdx),%r10
583d0b3732eSbholler	mov    %r10,-0x21(%rcx)
584d0b3732eSbhollerL(P1Q3):
585d0b3732eSbholler	mov    -0x19(%rdx),%r9
586d0b3732eSbholler	mov    %r9,-0x19(%rcx)
587d0b3732eSbhollerL(P1Q2):
588d0b3732eSbholler	mov    -0x11(%rdx),%r11
589d0b3732eSbholler	mov    %r11,-0x11(%rcx)
590d0b3732eSbhollerL(P1Q1):
591d0b3732eSbholler	mov    -0x9(%rdx),%r10
592d0b3732eSbholler	mov    %r10,-0x9(%rcx)
593d0b3732eSbhollerL(P1Q0):
594d0b3732eSbholler	movzbq -0x1(%rdx),%r9
595d0b3732eSbholler	mov    %r9b,-0x1(%rcx)
596d0b3732eSbholler	ret
597d0b3732eSbholler
/*
 * Unrolled tail copy for lengths of the form 8*n + 2 (2..122 bytes).
 * %rdx and %rcx point one past the end of the source and destination.
 * Entering at L(P2Qn) copies n quadwords in ascending address order
 * using negative offsets, rotating through %r9, %r11 and %r10, then
 * falls into L(P2Q0), which copies the final 2 bytes and returns.
 */
598d0b3732eSbholler	.balign 16
599d0b3732eSbhollerL(P2QF):
600d0b3732eSbholler	mov    -0x7a(%rdx),%r9
601d0b3732eSbholler	mov    %r9,-0x7a(%rcx)
602d0b3732eSbhollerL(P2QE):
603d0b3732eSbholler	mov    -0x72(%rdx),%r11
604d0b3732eSbholler	mov    %r11,-0x72(%rcx)
605d0b3732eSbhollerL(P2QD):
606d0b3732eSbholler	mov    -0x6a(%rdx),%r10
607d0b3732eSbholler	mov    %r10,-0x6a(%rcx)
608d0b3732eSbhollerL(P2QC):
609d0b3732eSbholler	mov    -0x62(%rdx),%r9
610d0b3732eSbholler	mov    %r9,-0x62(%rcx)
611d0b3732eSbhollerL(P2QB):
612d0b3732eSbholler	mov    -0x5a(%rdx),%r11
613d0b3732eSbholler	mov    %r11,-0x5a(%rcx)
614d0b3732eSbhollerL(P2QA):
615d0b3732eSbholler	mov    -0x52(%rdx),%r10
616d0b3732eSbholler	mov    %r10,-0x52(%rcx)
617d0b3732eSbhollerL(P2Q9):
618d0b3732eSbholler	mov    -0x4a(%rdx),%r9
619d0b3732eSbholler	mov    %r9,-0x4a(%rcx)
620d0b3732eSbhollerL(P2Q8):
621d0b3732eSbholler	mov    -0x42(%rdx),%r11
622d0b3732eSbholler	mov    %r11,-0x42(%rcx)
623d0b3732eSbhollerL(P2Q7):
624d0b3732eSbholler	mov    -0x3a(%rdx),%r10
625d0b3732eSbholler	mov    %r10,-0x3a(%rcx)
626d0b3732eSbhollerL(P2Q6):
627d0b3732eSbholler	mov    -0x32(%rdx),%r9
628d0b3732eSbholler	mov    %r9,-0x32(%rcx)
629d0b3732eSbhollerL(P2Q5):
630d0b3732eSbholler	mov    -0x2a(%rdx),%r11
631d0b3732eSbholler	mov    %r11,-0x2a(%rcx)
632d0b3732eSbhollerL(P2Q4):
633d0b3732eSbholler	mov    -0x22(%rdx),%r10
634d0b3732eSbholler	mov    %r10,-0x22(%rcx)
635d0b3732eSbhollerL(P2Q3):
636d0b3732eSbholler	mov    -0x1a(%rdx),%r9
637d0b3732eSbholler	mov    %r9,-0x1a(%rcx)
638d0b3732eSbhollerL(P2Q2):
639d0b3732eSbholler	mov    -0x12(%rdx),%r11
640d0b3732eSbholler	mov    %r11,-0x12(%rcx)
641d0b3732eSbhollerL(P2Q1):
642d0b3732eSbholler	mov    -0xa(%rdx),%r10
643d0b3732eSbholler	mov    %r10,-0xa(%rcx)
644d0b3732eSbhollerL(P2Q0):
645d0b3732eSbholler	movzwq -0x2(%rdx),%r9
646d0b3732eSbholler	mov    %r9w,-0x2(%rcx)
647d0b3732eSbholler	ret
648d0b3732eSbholler
/*
 * Unrolled tail copy for lengths of the form 8*n + 3 (3..123 bytes).
 * %rdx and %rcx point one past the end of the source and destination.
 * Entering at L(P3Qn) copies n quadwords in ascending address order
 * using negative offsets, rotating through %r9, %r11 and %r10, then
 * falls into L(P3Q0), which copies the final 3 bytes as 2 + 1 and
 * returns.
 */
649d0b3732eSbholler	.balign 16
650d0b3732eSbhollerL(P3QF):
651d0b3732eSbholler	mov    -0x7b(%rdx),%r9
652d0b3732eSbholler	mov    %r9,-0x7b(%rcx)
653d0b3732eSbhollerL(P3QE):
654d0b3732eSbholler	mov    -0x73(%rdx),%r11
655d0b3732eSbholler	mov    %r11,-0x73(%rcx)
656d0b3732eSbhollerL(P3QD):
657d0b3732eSbholler	mov    -0x6b(%rdx),%r10
658d0b3732eSbholler	mov    %r10,-0x6b(%rcx)
659d0b3732eSbhollerL(P3QC):
660d0b3732eSbholler	mov    -0x63(%rdx),%r9
661d0b3732eSbholler	mov    %r9,-0x63(%rcx)
662d0b3732eSbhollerL(P3QB):
663d0b3732eSbholler	mov    -0x5b(%rdx),%r11
664d0b3732eSbholler	mov    %r11,-0x5b(%rcx)
665d0b3732eSbhollerL(P3QA):
666d0b3732eSbholler	mov    -0x53(%rdx),%r10
667d0b3732eSbholler	mov    %r10,-0x53(%rcx)
668d0b3732eSbhollerL(P3Q9):
669d0b3732eSbholler	mov    -0x4b(%rdx),%r9
670d0b3732eSbholler	mov    %r9,-0x4b(%rcx)
671d0b3732eSbhollerL(P3Q8):
672d0b3732eSbholler	mov    -0x43(%rdx),%r11
673d0b3732eSbholler	mov    %r11,-0x43(%rcx)
674d0b3732eSbhollerL(P3Q7):
675d0b3732eSbholler	mov    -0x3b(%rdx),%r10
676d0b3732eSbholler	mov    %r10,-0x3b(%rcx)
677d0b3732eSbhollerL(P3Q6):
678d0b3732eSbholler	mov    -0x33(%rdx),%r9
679d0b3732eSbholler	mov    %r9,-0x33(%rcx)
680d0b3732eSbhollerL(P3Q5):
681d0b3732eSbholler	mov    -0x2b(%rdx),%r11
682d0b3732eSbholler	mov    %r11,-0x2b(%rcx)
683d0b3732eSbhollerL(P3Q4):
684d0b3732eSbholler	mov    -0x23(%rdx),%r10
685d0b3732eSbholler	mov    %r10,-0x23(%rcx)
686d0b3732eSbhollerL(P3Q3):
687d0b3732eSbholler	mov    -0x1b(%rdx),%r9
688d0b3732eSbholler	mov    %r9,-0x1b(%rcx)
689d0b3732eSbhollerL(P3Q2):
690d0b3732eSbholler	mov    -0x13(%rdx),%r11
691d0b3732eSbholler	mov    %r11,-0x13(%rcx)
692d0b3732eSbhollerL(P3Q1):
693d0b3732eSbholler	mov    -0xb(%rdx),%r10
694d0b3732eSbholler	mov    %r10,-0xb(%rcx)
695d0b3732eSbholler	/*
696d0b3732eSbholler	 * These trailing loads/stores have to do all their loads 1st,
697d0b3732eSbholler	 * then do the stores.
698d0b3732eSbholler	 */
699d0b3732eSbhollerL(P3Q0):
700d0b3732eSbholler	movzwq -0x3(%rdx),%r9
701d0b3732eSbholler	movzbq -0x1(%rdx),%r10
702d0b3732eSbholler	mov    %r9w,-0x3(%rcx)
703d0b3732eSbholler	mov    %r10b,-0x1(%rcx)
704d0b3732eSbholler	ret
705d0b3732eSbholler
/*
 * Unrolled tail copy for lengths of the form 8*n + 4 (4..124 bytes).
 * %rdx and %rcx point one past the end of the source and destination.
 * Entering at L(P4Qn) copies n quadwords in ascending address order
 * using negative offsets, rotating through %r9, %r11 and %r10, then
 * falls into L(P4Q0), which copies the final 4 bytes and returns.
 */
706d0b3732eSbholler	.balign 16
707d0b3732eSbhollerL(P4QF):
708d0b3732eSbholler	mov    -0x7c(%rdx),%r9
709d0b3732eSbholler	mov    %r9,-0x7c(%rcx)
710d0b3732eSbhollerL(P4QE):
711d0b3732eSbholler	mov    -0x74(%rdx),%r11
712d0b3732eSbholler	mov    %r11,-0x74(%rcx)
713d0b3732eSbhollerL(P4QD):
714d0b3732eSbholler	mov    -0x6c(%rdx),%r10
715d0b3732eSbholler	mov    %r10,-0x6c(%rcx)
716d0b3732eSbhollerL(P4QC):
717d0b3732eSbholler	mov    -0x64(%rdx),%r9
718d0b3732eSbholler	mov    %r9,-0x64(%rcx)
719d0b3732eSbhollerL(P4QB):
720d0b3732eSbholler	mov    -0x5c(%rdx),%r11
721d0b3732eSbholler	mov    %r11,-0x5c(%rcx)
722d0b3732eSbhollerL(P4QA):
723d0b3732eSbholler	mov    -0x54(%rdx),%r10
724d0b3732eSbholler	mov    %r10,-0x54(%rcx)
725d0b3732eSbhollerL(P4Q9):
726d0b3732eSbholler	mov    -0x4c(%rdx),%r9
727d0b3732eSbholler	mov    %r9,-0x4c(%rcx)
728d0b3732eSbhollerL(P4Q8):
729d0b3732eSbholler	mov    -0x44(%rdx),%r11
730d0b3732eSbholler	mov    %r11,-0x44(%rcx)
731d0b3732eSbhollerL(P4Q7):
732d0b3732eSbholler	mov    -0x3c(%rdx),%r10
733d0b3732eSbholler	mov    %r10,-0x3c(%rcx)
734d0b3732eSbhollerL(P4Q6):
735d0b3732eSbholler	mov    -0x34(%rdx),%r9
736d0b3732eSbholler	mov    %r9,-0x34(%rcx)
737d0b3732eSbhollerL(P4Q5):
738d0b3732eSbholler	mov    -0x2c(%rdx),%r11
739d0b3732eSbholler	mov    %r11,-0x2c(%rcx)
740d0b3732eSbhollerL(P4Q4):
741d0b3732eSbholler	mov    -0x24(%rdx),%r10
742d0b3732eSbholler	mov    %r10,-0x24(%rcx)
743d0b3732eSbhollerL(P4Q3):
744d0b3732eSbholler	mov    -0x1c(%rdx),%r9
745d0b3732eSbholler	mov    %r9,-0x1c(%rcx)
746d0b3732eSbhollerL(P4Q2):
747d0b3732eSbholler	mov    -0x14(%rdx),%r11
748d0b3732eSbholler	mov    %r11,-0x14(%rcx)
749d0b3732eSbhollerL(P4Q1):
750d0b3732eSbholler	mov    -0xc(%rdx),%r10
751d0b3732eSbholler	mov    %r10,-0xc(%rcx)
752d0b3732eSbhollerL(P4Q0):
753d0b3732eSbholler	mov    -0x4(%rdx),%r9d
754d0b3732eSbholler	mov    %r9d,-0x4(%rcx)
755d0b3732eSbholler	ret
756d0b3732eSbholler
757d0b3732eSbholler	.balign 16
	/*
	 * L(P5QF) .. L(P5Q0): fall-through ladder for a copy whose length is
	 * (k * 8) + 5 bytes.  Entering at L(P5Qk) (k is the hex digit in the
	 * label) copies k 8-byte words and then a 5-byte tail.
	 * Every access uses a negative displacement from %rdx (source) and
	 * %rcx (destination), so both registers must point one byte past the
	 * end of their regions on entry.
	 * Scratch registers: %r9, %r10, %r11.
	 */
758d0b3732eSbhollerL(P5QF):
759d0b3732eSbholler	mov    -0x7d(%rdx),%r9
760d0b3732eSbholler	mov    %r9,-0x7d(%rcx)
761d0b3732eSbhollerL(P5QE):
762d0b3732eSbholler	mov    -0x75(%rdx),%r11
763d0b3732eSbholler	mov    %r11,-0x75(%rcx)
764d0b3732eSbhollerL(P5QD):
765d0b3732eSbholler	mov    -0x6d(%rdx),%r10
766d0b3732eSbholler	mov    %r10,-0x6d(%rcx)
767d0b3732eSbhollerL(P5QC):
768d0b3732eSbholler	mov    -0x65(%rdx),%r9
769d0b3732eSbholler	mov    %r9,-0x65(%rcx)
770d0b3732eSbhollerL(P5QB):
771d0b3732eSbholler	mov    -0x5d(%rdx),%r11
772d0b3732eSbholler	mov    %r11,-0x5d(%rcx)
773d0b3732eSbhollerL(P5QA):
774d0b3732eSbholler	mov    -0x55(%rdx),%r10
775d0b3732eSbholler	mov    %r10,-0x55(%rcx)
776d0b3732eSbhollerL(P5Q9):
777d0b3732eSbholler	mov    -0x4d(%rdx),%r9
778d0b3732eSbholler	mov    %r9,-0x4d(%rcx)
779d0b3732eSbhollerL(P5Q8):
780d0b3732eSbholler	mov    -0x45(%rdx),%r11
781d0b3732eSbholler	mov    %r11,-0x45(%rcx)
782d0b3732eSbhollerL(P5Q7):
783d0b3732eSbholler	mov    -0x3d(%rdx),%r10
784d0b3732eSbholler	mov    %r10,-0x3d(%rcx)
785d0b3732eSbhollerL(P5Q6):
786d0b3732eSbholler	mov    -0x35(%rdx),%r9
787d0b3732eSbholler	mov    %r9,-0x35(%rcx)
788d0b3732eSbhollerL(P5Q5):
789d0b3732eSbholler	mov    -0x2d(%rdx),%r11
790d0b3732eSbholler	mov    %r11,-0x2d(%rcx)
791d0b3732eSbhollerL(P5Q4):
792d0b3732eSbholler	mov    -0x25(%rdx),%r10
793d0b3732eSbholler	mov    %r10,-0x25(%rcx)
794d0b3732eSbhollerL(P5Q3):
795d0b3732eSbholler	mov    -0x1d(%rdx),%r9
796d0b3732eSbholler	mov    %r9,-0x1d(%rcx)
797d0b3732eSbhollerL(P5Q2):
798d0b3732eSbholler	mov    -0x15(%rdx),%r11
799d0b3732eSbholler	mov    %r11,-0x15(%rcx)
800d0b3732eSbhollerL(P5Q1):
801d0b3732eSbholler	mov    -0xd(%rdx),%r10
802d0b3732eSbholler	mov    %r10,-0xd(%rcx)
803d0b3732eSbholler	/*
804d0b3732eSbholler	 * The 5-byte tail is split into a 4-byte and a 1-byte piece.  Both
805d0b3732eSbholler	 * pieces must be loaded before either one is stored.
	 * NOTE(review): presumably this keeps the source bytes intact when
	 * the regions overlap (this file also implements memmove) -- confirm.
806d0b3732eSbholler	 */
807d0b3732eSbhollerL(P5Q0):
808d0b3732eSbholler	mov    -0x5(%rdx),%r9d
809d0b3732eSbholler	movzbq -0x1(%rdx),%r10
810d0b3732eSbholler	mov    %r9d,-0x5(%rcx)
811d0b3732eSbholler	mov    %r10b,-0x1(%rcx)
812d0b3732eSbholler	ret
813d0b3732eSbholler
814d0b3732eSbholler	.balign 16
	/*
	 * L(P6QF) .. L(P6Q0): fall-through ladder for a copy whose length is
	 * (k * 8) + 6 bytes.  Entering at L(P6Qk) (k is the hex digit in the
	 * label) copies k 8-byte words and then a 6-byte tail.
	 * Every access uses a negative displacement from %rdx (source) and
	 * %rcx (destination), so both registers must point one byte past the
	 * end of their regions on entry.
	 * Scratch registers: %r9, %r10, %r11.
	 */
815d0b3732eSbhollerL(P6QF):
816d0b3732eSbholler	mov    -0x7e(%rdx),%r9
817d0b3732eSbholler	mov    %r9,-0x7e(%rcx)
818d0b3732eSbhollerL(P6QE):
819d0b3732eSbholler	mov    -0x76(%rdx),%r11
820d0b3732eSbholler	mov    %r11,-0x76(%rcx)
821d0b3732eSbhollerL(P6QD):
822d0b3732eSbholler	mov    -0x6e(%rdx),%r10
823d0b3732eSbholler	mov    %r10,-0x6e(%rcx)
824d0b3732eSbhollerL(P6QC):
825d0b3732eSbholler	mov    -0x66(%rdx),%r9
826d0b3732eSbholler	mov    %r9,-0x66(%rcx)
827d0b3732eSbhollerL(P6QB):
828d0b3732eSbholler	mov    -0x5e(%rdx),%r11
829d0b3732eSbholler	mov    %r11,-0x5e(%rcx)
830d0b3732eSbhollerL(P6QA):
831d0b3732eSbholler	mov    -0x56(%rdx),%r10
832d0b3732eSbholler	mov    %r10,-0x56(%rcx)
833d0b3732eSbhollerL(P6Q9):
834d0b3732eSbholler	mov    -0x4e(%rdx),%r9
835d0b3732eSbholler	mov    %r9,-0x4e(%rcx)
836d0b3732eSbhollerL(P6Q8):
837d0b3732eSbholler	mov    -0x46(%rdx),%r11
838d0b3732eSbholler	mov    %r11,-0x46(%rcx)
839d0b3732eSbhollerL(P6Q7):
840d0b3732eSbholler	mov    -0x3e(%rdx),%r10
841d0b3732eSbholler	mov    %r10,-0x3e(%rcx)
842d0b3732eSbhollerL(P6Q6):
843d0b3732eSbholler	mov    -0x36(%rdx),%r9
844d0b3732eSbholler	mov    %r9,-0x36(%rcx)
845d0b3732eSbhollerL(P6Q5):
846d0b3732eSbholler	mov    -0x2e(%rdx),%r11
847d0b3732eSbholler	mov    %r11,-0x2e(%rcx)
848d0b3732eSbhollerL(P6Q4):
849d0b3732eSbholler	mov    -0x26(%rdx),%r10
850d0b3732eSbholler	mov    %r10,-0x26(%rcx)
851d0b3732eSbhollerL(P6Q3):
852d0b3732eSbholler	mov    -0x1e(%rdx),%r9
853d0b3732eSbholler	mov    %r9,-0x1e(%rcx)
854d0b3732eSbhollerL(P6Q2):
855d0b3732eSbholler	mov    -0x16(%rdx),%r11
856d0b3732eSbholler	mov    %r11,-0x16(%rcx)
857d0b3732eSbhollerL(P6Q1):
858d0b3732eSbholler	mov    -0xe(%rdx),%r10
859d0b3732eSbholler	mov    %r10,-0xe(%rcx)
860d0b3732eSbholler	/*
861d0b3732eSbholler	 * The 6-byte tail is split into a 4-byte and a 2-byte piece.  Both
862d0b3732eSbholler	 * pieces must be loaded before either one is stored.
	 * NOTE(review): presumably this keeps the source bytes intact when
	 * the regions overlap (this file also implements memmove) -- confirm.
863d0b3732eSbholler	 */
864d0b3732eSbhollerL(P6Q0):
865d0b3732eSbholler	mov    -0x6(%rdx),%r9d
866d0b3732eSbholler	movzwq -0x2(%rdx),%r10
867d0b3732eSbholler	mov    %r9d,-0x6(%rcx)
868d0b3732eSbholler	mov    %r10w,-0x2(%rcx)
869d0b3732eSbholler	ret
870d0b3732eSbholler
871d0b3732eSbholler	.balign 16
	/*
	 * L(P7QF) .. L(P7Q0): fall-through ladder for a copy whose length is
	 * (k * 8) + 7 bytes.  Entering at L(P7Qk) (k is the hex digit in the
	 * label) copies k 8-byte words and then a 7-byte tail.
	 * Every access uses a negative displacement from %rdx (source) and
	 * %rcx (destination), so both registers must point one byte past the
	 * end of their regions on entry.
	 * Scratch registers: %r9, %r10, %r11.
	 */
872d0b3732eSbhollerL(P7QF):
873d0b3732eSbholler	mov    -0x7f(%rdx),%r9
874d0b3732eSbholler	mov    %r9,-0x7f(%rcx)
875d0b3732eSbhollerL(P7QE):
876d0b3732eSbholler	mov    -0x77(%rdx),%r11
877d0b3732eSbholler	mov    %r11,-0x77(%rcx)
878d0b3732eSbhollerL(P7QD):
879d0b3732eSbholler	mov    -0x6f(%rdx),%r10
880d0b3732eSbholler	mov    %r10,-0x6f(%rcx)
881d0b3732eSbhollerL(P7QC):
882d0b3732eSbholler	mov    -0x67(%rdx),%r9
883d0b3732eSbholler	mov    %r9,-0x67(%rcx)
884d0b3732eSbhollerL(P7QB):
885d0b3732eSbholler	mov    -0x5f(%rdx),%r11
886d0b3732eSbholler	mov    %r11,-0x5f(%rcx)
887d0b3732eSbhollerL(P7QA):
888d0b3732eSbholler	mov    -0x57(%rdx),%r10
889d0b3732eSbholler	mov    %r10,-0x57(%rcx)
890d0b3732eSbhollerL(P7Q9):
891d0b3732eSbholler	mov    -0x4f(%rdx),%r9
892d0b3732eSbholler	mov    %r9,-0x4f(%rcx)
893d0b3732eSbhollerL(P7Q8):
894d0b3732eSbholler	mov    -0x47(%rdx),%r11
895d0b3732eSbholler	mov    %r11,-0x47(%rcx)
896d0b3732eSbhollerL(P7Q7):
897d0b3732eSbholler	mov    -0x3f(%rdx),%r10
898d0b3732eSbholler	mov    %r10,-0x3f(%rcx)
899d0b3732eSbhollerL(P7Q6):
900d0b3732eSbholler	mov    -0x37(%rdx),%r9
901d0b3732eSbholler	mov    %r9,-0x37(%rcx)
902d0b3732eSbhollerL(P7Q5):
903d0b3732eSbholler	mov    -0x2f(%rdx),%r11
904d0b3732eSbholler	mov    %r11,-0x2f(%rcx)
905d0b3732eSbhollerL(P7Q4):
906d0b3732eSbholler	mov    -0x27(%rdx),%r10
907d0b3732eSbholler	mov    %r10,-0x27(%rcx)
908d0b3732eSbhollerL(P7Q3):
909d0b3732eSbholler	mov    -0x1f(%rdx),%r9
910d0b3732eSbholler	mov    %r9,-0x1f(%rcx)
911d0b3732eSbhollerL(P7Q2):
912d0b3732eSbholler	mov    -0x17(%rdx),%r11
913d0b3732eSbholler	mov    %r11,-0x17(%rcx)
914d0b3732eSbhollerL(P7Q1):
915d0b3732eSbholler	mov    -0xf(%rdx),%r10
916d0b3732eSbholler	mov    %r10,-0xf(%rcx)
917d0b3732eSbholler	/*
918d0b3732eSbholler	 * The 7-byte tail is split into 4-, 2- and 1-byte pieces.  All three
919d0b3732eSbholler	 * pieces must be loaded before any of them is stored.
	 * NOTE(review): presumably this keeps the source bytes intact when
	 * the regions overlap (this file also implements memmove) -- confirm.
920d0b3732eSbholler	 */
921d0b3732eSbhollerL(P7Q0):
922d0b3732eSbholler	mov    -0x7(%rdx),%r9d
923d0b3732eSbholler	movzwq -0x3(%rdx),%r10
924d0b3732eSbholler	movzbq -0x1(%rdx),%r11
925d0b3732eSbholler	mov    %r9d,-0x7(%rcx)
926d0b3732eSbholler	mov    %r10w,-0x3(%rcx)
927d0b3732eSbholler	mov    %r11b,-0x1(%rcx)
928d0b3732eSbholler	ret
929d0b3732eSbholler
930d0b3732eSbholler	.balign 16
	/*
	 * L(ck_use_sse2): choose the bulk-copy strategy.
	 * Registers as used below: %rcx = destination, %rdx = source,
	 * %r8 = number of bytes still to copy.
	 *
	 *   destination not 16-byte aligned      -> L(ShrtAlignNew)
	 *   .memops_method == NO_SSE             -> L(Loop8byte_pre)
	 *   %r8 > .largest_level_cache_size / 2  -> L(sse2_nt_move)
	 *   source also 16-byte aligned          -> L(pre_both_aligned)
	 *   otherwise: copy 16 bytes, round the source down to a 16-byte
	 *   boundary and jump, through L(SSE_src) or L(SSSE3_src), to the
	 *   loop that matches the source misalignment held in %r11.
	 *
	 * NOTE(review): L(now_qw_aligned) is presumably re-entered from
	 * L(ShrtAlignNew) once the destination is aligned -- confirm there.
	 */
931d0b3732eSbhollerL(ck_use_sse2):
932d0b3732eSbholler	/*
933d0b3732eSbholler	 * Align dest to 16 byte boundary.
934d0b3732eSbholler	 */
935d0b3732eSbholler	test   $0xf,%rcx
936d0b3732eSbholler	jnz    L(ShrtAlignNew)
937d0b3732eSbholler
938d0b3732eSbhollerL(now_qw_aligned):
939d0b3732eSbholler	cmpl   $NO_SSE,.memops_method(%rip)
940d0b3732eSbholler	je     L(Loop8byte_pre)
941d0b3732eSbholler
942d0b3732eSbholler	/*
943d0b3732eSbholler	 * The fall-through path is to do SSE2 16-byte load/stores
944d0b3732eSbholler	 */
945d0b3732eSbholler
946d0b3732eSbholler	/*
947d0b3732eSbholler	 * If current move size is larger than half of the highest level cache
948d0b3732eSbholler	 * size, then do non-temporal moves.
	 * The 32-bit load zero-extends the cache size into %r9.
	 * NOTE(review): the branch below is the signed form (jg) although
	 * the operands are sizes; the two forms only differ for a count
	 * with bit 63 set -- confirm whether ja was intended.
949d0b3732eSbholler	 */
950d0b3732eSbholler	mov    .largest_level_cache_size(%rip),%r9d
951d0b3732eSbholler	shr    %r9		# take half of it
952d0b3732eSbholler	cmp    %r9,%r8
953d0b3732eSbholler	jg     L(sse2_nt_move)
954d0b3732eSbholler
955d0b3732eSbholler	/*
956d0b3732eSbholler	 * If both the source and dest are aligned, then use the both aligned
957d0b3732eSbholler	 * logic. Well aligned data should reap the rewards.
958d0b3732eSbholler	 */
959d0b3732eSbholler	test   $0xf,%rdx
960d0b3732eSbholler	jz     L(pre_both_aligned)
961d0b3732eSbholler
	/*
	 * Pick the jump table: L(SSE_src) unless the USE_SSSE3 bit is set
	 * in .memops_method, in which case L(SSSE3_src) replaces it.
	 */
962d0b3732eSbholler	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
963d0b3732eSbholler	testl  $USE_SSSE3,.memops_method(%rip)
964d0b3732eSbholler	jz     1f
965d0b3732eSbholler	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
966d0b3732eSbholler
967d0b3732eSbholler1:
968d0b3732eSbholler	/*
969d0b3732eSbholler	 * if the src is not 16 byte aligned...
	 *
	 * %r11 = source misalignment (src & 0xf, 1..15 on this path).
	 * Copy the first 16 bytes with an unaligned load and an aligned
	 * store, then set %rdx = src + 16 - %r11, which is 16-byte aligned,
	 * advance %rcx by 16 and reduce %r8 by 16.  %xmm1 is preloaded with
	 * the aligned source block at the new %rdx; the selected loop
	 * combines it with the blocks that follow.
970d0b3732eSbholler	 */
971d0b3732eSbholler	mov    %rdx,%r11
972d0b3732eSbholler	and    $0xf,%r11
973d0b3732eSbholler	movdqu (%rdx),%xmm0
974d0b3732eSbholler	movdqa %xmm0,(%rcx)
975d0b3732eSbholler	add    $0x10,%rdx
976d0b3732eSbholler	sub    %r11,%rdx
977d0b3732eSbholler	add    $0x10,%rcx
978d0b3732eSbholler	sub    $0x10,%r8
979d0b3732eSbholler	movdqa (%rdx),%xmm1
980d0b3732eSbholler
	/*
	 * Table dispatch: entry %r11 of the table at %r10 is a signed 32-bit
	 * offset relative to the table base; add the base and jump.
	 */
981d0b3732eSbholler	movslq (%r10,%r11,4),%r9
982d0b3732eSbholler	lea    (%r9,%r10,1),%r10
983d0b3732eSbholler	jmpq   *%r10
984d0b3732eSbholler
985d0b3732eSbholler	    .balign 16
	/*
	 * Jump tables for the misaligned-source copy loops.  Each table has
	 * 16 entries, one per value of (source address & 0xf); every entry is
	 * a 32-bit offset from the start of its own table to the target label.
	 * Entry 0 of both tables points at L(pre_both_aligned).
	 * Entry 8 of L(SSSE3_src) points at L(movdqa8), the same loop the
	 * L(SSE_src) table uses for that misalignment.
	 */
986d0b3732eSbhollerL(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
987d0b3732eSbholler	    .int        L(mov3dqa1) -L(SSSE3_src)
988d0b3732eSbholler	    .int        L(mov3dqa2) -L(SSSE3_src)
989d0b3732eSbholler	    .int        L(mov3dqa3) -L(SSSE3_src)
990d0b3732eSbholler	    .int        L(mov3dqa4) -L(SSSE3_src)
991d0b3732eSbholler	    .int        L(mov3dqa5) -L(SSSE3_src)
992d0b3732eSbholler	    .int        L(mov3dqa6) -L(SSSE3_src)
993d0b3732eSbholler	    .int        L(mov3dqa7) -L(SSSE3_src)
994d0b3732eSbholler	    .int        L(movdqa8)  -L(SSSE3_src)
995d0b3732eSbholler	    .int        L(mov3dqa9) -L(SSSE3_src)
996d0b3732eSbholler	    .int        L(mov3dqa10)-L(SSSE3_src)
997d0b3732eSbholler	    .int        L(mov3dqa11)-L(SSSE3_src)
998d0b3732eSbholler	    .int        L(mov3dqa12)-L(SSSE3_src)
999d0b3732eSbholler	    .int        L(mov3dqa13)-L(SSSE3_src)
1000d0b3732eSbholler	    .int        L(mov3dqa14)-L(SSSE3_src)
1001d0b3732eSbholler	    .int        L(mov3dqa15)-L(SSSE3_src)
1002d0b3732eSbhollerL(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
1003d0b3732eSbholler	    .int        L(movdqa1) -L(SSE_src)
1004d0b3732eSbholler	    .int        L(movdqa2) -L(SSE_src)
1005d0b3732eSbholler	    .int        L(movdqa3) -L(SSE_src)
1006d0b3732eSbholler	    .int        L(movdqa4) -L(SSE_src)
1007d0b3732eSbholler	    .int        L(movdqa5) -L(SSE_src)
1008d0b3732eSbholler	    .int        L(movdqa6) -L(SSE_src)
1009d0b3732eSbholler	    .int        L(movdqa7) -L(SSE_src)
1010d0b3732eSbholler	    .int        L(movdqa8) -L(SSE_src)
1011d0b3732eSbholler	    .int        L(movdqa9) -L(SSE_src)
1012d0b3732eSbholler	    .int        L(movdqa10)-L(SSE_src)
1013d0b3732eSbholler	    .int        L(movdqa11)-L(SSE_src)
1014d0b3732eSbholler	    .int        L(movdqa12)-L(SSE_src)
1015d0b3732eSbholler	    .int        L(movdqa13)-L(SSE_src)
1016d0b3732eSbholler	    .int        L(movdqa14)-L(SSE_src)
1017d0b3732eSbholler	    .int        L(movdqa15)-L(SSE_src)
1018d0b3732eSbholler
1019d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa1): SSE2 copy loop for a source that is 1 byte past a
	 * 16-byte boundary (entry 1 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the destination (stored to with movdqa, so it must be
	 * 16-byte aligned), %r8 the bytes left and %xmm1 the aligned source
	 * block at (%rdx).
	 * Each pass loads the next two aligned blocks, forms two destination
	 * blocks as (previous >> 1 byte) | (next << 15 bytes) and stores
	 * 32 bytes.  %xmm1 carries the last loaded block into the next pass.
	 * The pointer and count updates use lea, and the cmp sits in the
	 * middle of the body, so the flags it sets are still valid at the
	 * jge (signed) that closes the loop.
	 */
1020d0b3732eSbhollerL(movdqa1):
1021d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
1022d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
1023d0b3732eSbholler	lea    0x20(%rdx),%rdx
1024d0b3732eSbholler	lea    -0x20(%r8),%r8
1025d0b3732eSbholler
1026d0b3732eSbholler	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
1027d0b3732eSbholler	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
1028d0b3732eSbholler	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
1029d0b3732eSbholler	por    %xmm1,%xmm3 # OR them together
1030d0b3732eSbholler	cmp    $0x20,%r8
1031d0b3732eSbholler
1032d0b3732eSbholler	psrldq $0x1,%xmm2  # shift right the 1st block of this iteration (copy saved in xmm2)
1033d0b3732eSbholler	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
1034d0b3732eSbholler	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
1035d0b3732eSbholler	por    %xmm2,%xmm0 # OR them together
1036d0b3732eSbholler	movdqa %xmm3,(%rcx)     # store it
1037d0b3732eSbholler	movdqa %xmm0,0x10(%rcx) # store it
1038d0b3732eSbholler	lea    0x20(%rcx),%rcx
1039d0b3732eSbholler
1040d0b3732eSbholler	jge    L(movdqa1)
1041d0b3732eSbholler	jmp    L(movdqa_epi)
1042d0b3732eSbholler
1043d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa2): SSE2 copy loop for a source that is 2 bytes past a
	 * 16-byte boundary (entry 2 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 2 bytes) | (next << 14 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1044d0b3732eSbhollerL(movdqa2):
1045d0b3732eSbholler	sub    $0x20,%r8
1046d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1047d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1048d0b3732eSbholler	add    $0x20,%rdx
1049d0b3732eSbholler
1050d0b3732eSbholler	psrldq $0x2,%xmm1
1051d0b3732eSbholler	movdqa %xmm3,%xmm2
1052d0b3732eSbholler	pslldq $0xe,%xmm3
1053d0b3732eSbholler	por    %xmm1,%xmm3
1054d0b3732eSbholler
1055d0b3732eSbholler	psrldq $0x2,%xmm2
1056d0b3732eSbholler	movdqa %xmm0,%xmm1
1057d0b3732eSbholler	pslldq $0xe,%xmm0
1058d0b3732eSbholler	por    %xmm2,%xmm0
1059d0b3732eSbholler	movdqa %xmm3,(%rcx)
1060d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1061d0b3732eSbholler
1062d0b3732eSbholler	add    $0x20,%rcx
1063d0b3732eSbholler	cmp    $0x20,%r8
1064d0b3732eSbholler	jge    L(movdqa2)
1065d0b3732eSbholler	jmp    L(movdqa_epi)
1066d0b3732eSbholler
1067d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa3): SSE2 copy loop for a source that is 3 bytes past a
	 * 16-byte boundary (entry 3 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 3 bytes) | (next << 13 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1068d0b3732eSbhollerL(movdqa3):
1069d0b3732eSbholler	sub    $0x20,%r8
1070d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1071d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1072d0b3732eSbholler	add    $0x20,%rdx
1073d0b3732eSbholler
1074d0b3732eSbholler	psrldq $0x3,%xmm1
1075d0b3732eSbholler	movdqa %xmm3,%xmm2
1076d0b3732eSbholler	pslldq $0xd,%xmm3
1077d0b3732eSbholler	por    %xmm1,%xmm3
1078d0b3732eSbholler
1079d0b3732eSbholler	psrldq $0x3,%xmm2
1080d0b3732eSbholler	movdqa %xmm0,%xmm1
1081d0b3732eSbholler	pslldq $0xd,%xmm0
1082d0b3732eSbholler	por    %xmm2,%xmm0
1083d0b3732eSbholler	movdqa %xmm3,(%rcx)
1084d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1085d0b3732eSbholler
1086d0b3732eSbholler	add    $0x20,%rcx
1087d0b3732eSbholler	cmp    $0x20,%r8
1088d0b3732eSbholler	jge    L(movdqa3)
1089d0b3732eSbholler	jmp    L(movdqa_epi)
1090d0b3732eSbholler
1091d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa4): SSE2 copy loop for a source that is 4 bytes past a
	 * 16-byte boundary (entry 4 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 4 bytes) | (next << 12 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1092d0b3732eSbhollerL(movdqa4):
1093d0b3732eSbholler	sub    $0x20,%r8
1094d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1095d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1096d0b3732eSbholler	add    $0x20,%rdx
1097d0b3732eSbholler
1098d0b3732eSbholler	psrldq $0x4,%xmm1
1099d0b3732eSbholler	movdqa %xmm3,%xmm2
1100d0b3732eSbholler	pslldq $0xc,%xmm3
1101d0b3732eSbholler	por    %xmm1,%xmm3
1102d0b3732eSbholler
1103d0b3732eSbholler	psrldq $0x4,%xmm2
1104d0b3732eSbholler	movdqa %xmm0,%xmm1
1105d0b3732eSbholler	pslldq $0xc,%xmm0
1106d0b3732eSbholler	por    %xmm2,%xmm0
1107d0b3732eSbholler
1108d0b3732eSbholler	movdqa %xmm3,(%rcx)
1109d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1110d0b3732eSbholler
1111d0b3732eSbholler	add    $0x20,%rcx
1112d0b3732eSbholler	cmp    $0x20,%r8
1113d0b3732eSbholler	jge    L(movdqa4)
1114d0b3732eSbholler	jmp    L(movdqa_epi)
1115d0b3732eSbholler
1116d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa5): SSE2 copy loop for a source that is 5 bytes past a
	 * 16-byte boundary (entry 5 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 5 bytes) | (next << 11 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1117d0b3732eSbhollerL(movdqa5):
1118d0b3732eSbholler	sub    $0x20,%r8
1119d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1120d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1121d0b3732eSbholler	add    $0x20,%rdx
1122d0b3732eSbholler
1123d0b3732eSbholler	psrldq $0x5,%xmm1
1124d0b3732eSbholler	movdqa %xmm3,%xmm2
1125d0b3732eSbholler	pslldq $0xb,%xmm3
1126d0b3732eSbholler	por    %xmm1,%xmm3
1127d0b3732eSbholler
1128d0b3732eSbholler	psrldq $0x5,%xmm2
1129d0b3732eSbholler	movdqa %xmm0,%xmm1
1130d0b3732eSbholler	pslldq $0xb,%xmm0
1131d0b3732eSbholler	por    %xmm2,%xmm0
1132d0b3732eSbholler
1133d0b3732eSbholler	movdqa %xmm3,(%rcx)
1134d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1135d0b3732eSbholler
1136d0b3732eSbholler	add    $0x20,%rcx
1137d0b3732eSbholler	cmp    $0x20,%r8
1138d0b3732eSbholler	jge    L(movdqa5)
1139d0b3732eSbholler	jmp    L(movdqa_epi)
1140d0b3732eSbholler
1141d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa6): SSE2 copy loop for a source that is 6 bytes past a
	 * 16-byte boundary (entry 6 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 6 bytes) | (next << 10 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1142d0b3732eSbhollerL(movdqa6):
1143d0b3732eSbholler	sub    $0x20,%r8
1144d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1145d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1146d0b3732eSbholler	add    $0x20,%rdx
1147d0b3732eSbholler
1148d0b3732eSbholler	psrldq $0x6,%xmm1
1149d0b3732eSbholler	movdqa %xmm3,%xmm2
1150d0b3732eSbholler	pslldq $0xa,%xmm3
1151d0b3732eSbholler	por    %xmm1,%xmm3
1152d0b3732eSbholler
1153d0b3732eSbholler	psrldq $0x6,%xmm2
1154d0b3732eSbholler	movdqa %xmm0,%xmm1
1155d0b3732eSbholler	pslldq $0xa,%xmm0
1156d0b3732eSbholler	por    %xmm2,%xmm0
1157d0b3732eSbholler	movdqa %xmm3,(%rcx)
1158d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1159d0b3732eSbholler
1160d0b3732eSbholler	add    $0x20,%rcx
1161d0b3732eSbholler	cmp    $0x20,%r8
1162d0b3732eSbholler	jge    L(movdqa6)
1163d0b3732eSbholler	jmp    L(movdqa_epi)
1164d0b3732eSbholler
1165d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa7): SSE2 copy loop for a source that is 7 bytes past a
	 * 16-byte boundary (entry 7 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 7 bytes) | (next << 9 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1166d0b3732eSbhollerL(movdqa7):
1167d0b3732eSbholler	sub    $0x20,%r8
1168d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1169d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1170d0b3732eSbholler	add    $0x20,%rdx
1171d0b3732eSbholler
1172d0b3732eSbholler	psrldq $0x7,%xmm1
1173d0b3732eSbholler	movdqa %xmm3,%xmm2
1174d0b3732eSbholler	pslldq $0x9,%xmm3
1175d0b3732eSbholler	por    %xmm1,%xmm3
1176d0b3732eSbholler
1177d0b3732eSbholler	psrldq $0x7,%xmm2
1178d0b3732eSbholler	movdqa %xmm0,%xmm1
1179d0b3732eSbholler	pslldq $0x9,%xmm0
1180d0b3732eSbholler	por    %xmm2,%xmm0
1181d0b3732eSbholler	movdqa %xmm3,(%rcx)
1182d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1183d0b3732eSbholler
1184d0b3732eSbholler	add    $0x20,%rcx
1185d0b3732eSbholler	cmp    $0x20,%r8
1186d0b3732eSbholler	jge    L(movdqa7)
1187d0b3732eSbholler	jmp    L(movdqa_epi)
1188d0b3732eSbholler
1189d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa8): copy loop for a source that is 8 bytes past a 16-byte
	 * boundary (entry 8 of both L(SSE_src) and L(SSSE3_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads three more aligned blocks and stores 48 bytes.
	 * shufpd $0x1 builds each destination block from the high quadword
	 * of one source block and the low quadword of the next one.
	 * %xmm1 carries the last loaded block into the next pass.
	 * The cmp sits in the middle of the body; the instructions between
	 * it and the jge (shufpd, movdqa, lea) leave the flags untouched.
	 * Repeats while %r8 >= 0x30 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1190d0b3732eSbhollerL(movdqa8):
1191d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1192d0b3732eSbholler	sub    $0x30,%r8
1193d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1194d0b3732eSbholler	movdqa 0x30(%rdx),%xmm5
1195d0b3732eSbholler	lea    0x30(%rdx),%rdx
1196d0b3732eSbholler
1197d0b3732eSbholler	shufpd $0x1,%xmm3,%xmm1
1198d0b3732eSbholler	movdqa %xmm1,(%rcx)
1199d0b3732eSbholler
1200d0b3732eSbholler	cmp    $0x30,%r8
1201d0b3732eSbholler
1202d0b3732eSbholler	shufpd $0x1,%xmm0,%xmm3
1203d0b3732eSbholler	movdqa %xmm3,0x10(%rcx)
1204d0b3732eSbholler
1205d0b3732eSbholler	movdqa %xmm5,%xmm1
1206d0b3732eSbholler	shufpd $0x1,%xmm5,%xmm0
1207d0b3732eSbholler	movdqa %xmm0,0x20(%rcx)
1208d0b3732eSbholler
1209d0b3732eSbholler	lea    0x30(%rcx),%rcx
1210d0b3732eSbholler
1211d0b3732eSbholler	jge    L(movdqa8)
1212d0b3732eSbholler	jmp    L(movdqa_epi)
1213d0b3732eSbholler
1214d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa9): SSE2 copy loop for a source that is 9 bytes past a
	 * 16-byte boundary (entry 9 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 9 bytes) | (next << 7 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1215d0b3732eSbhollerL(movdqa9):
1216d0b3732eSbholler	sub    $0x20,%r8
1217d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1218d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1219d0b3732eSbholler	add    $0x20,%rdx
1220d0b3732eSbholler
1221d0b3732eSbholler	psrldq $0x9,%xmm1
1222d0b3732eSbholler	movdqa %xmm3,%xmm2
1223d0b3732eSbholler	pslldq $0x7,%xmm3
1224d0b3732eSbholler	por    %xmm1,%xmm3
1225d0b3732eSbholler
1226d0b3732eSbholler	psrldq $0x9,%xmm2
1227d0b3732eSbholler	movdqa %xmm0,%xmm1
1228d0b3732eSbholler	pslldq $0x7,%xmm0
1229d0b3732eSbholler	por    %xmm2,%xmm0
1230d0b3732eSbholler	movdqa %xmm3,(%rcx)
1231d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1232d0b3732eSbholler
1233d0b3732eSbholler	add    $0x20,%rcx
1234d0b3732eSbholler	cmp    $0x20,%r8
1235d0b3732eSbholler	jge    L(movdqa9)
1236d0b3732eSbholler	jmp    L(movdqa_epi)
1237d0b3732eSbholler
1238d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa10): SSE2 copy loop for a source that is 10 bytes past a
	 * 16-byte boundary (entry 10 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 10 bytes) | (next << 6 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1239d0b3732eSbhollerL(movdqa10):
1240d0b3732eSbholler	sub    $0x20,%r8
1241d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1242d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1243d0b3732eSbholler	add    $0x20,%rdx
1244d0b3732eSbholler
1245d0b3732eSbholler	psrldq $0xa,%xmm1
1246d0b3732eSbholler	movdqa %xmm3,%xmm2
1247d0b3732eSbholler	pslldq $0x6,%xmm3
1248d0b3732eSbholler	por    %xmm1,%xmm3
1249d0b3732eSbholler
1250d0b3732eSbholler	psrldq $0xa,%xmm2
1251d0b3732eSbholler	movdqa %xmm0,%xmm1
1252d0b3732eSbholler	pslldq $0x6,%xmm0
1253d0b3732eSbholler	por    %xmm2,%xmm0
1254d0b3732eSbholler	movdqa %xmm3,(%rcx)
1255d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1256d0b3732eSbholler
1257d0b3732eSbholler	add    $0x20,%rcx
1258d0b3732eSbholler	cmp    $0x20,%r8
1259d0b3732eSbholler	jge    L(movdqa10)
1260d0b3732eSbholler	jmp    L(movdqa_epi)
1261d0b3732eSbholler
1262d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa11): SSE2 copy loop for a source that is 11 bytes past a
	 * 16-byte boundary (entry 11 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 11 bytes) | (next << 5 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1263d0b3732eSbhollerL(movdqa11):
1264d0b3732eSbholler	sub    $0x20,%r8
1265d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1266d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1267d0b3732eSbholler	add    $0x20,%rdx
1268d0b3732eSbholler
1269d0b3732eSbholler	psrldq $0xb,%xmm1
1270d0b3732eSbholler	movdqa %xmm3,%xmm2
1271d0b3732eSbholler	pslldq $0x5,%xmm3
1272d0b3732eSbholler	por    %xmm1,%xmm3
1273d0b3732eSbholler
1274d0b3732eSbholler	psrldq $0xb,%xmm2
1275d0b3732eSbholler	movdqa %xmm0,%xmm1
1276d0b3732eSbholler	pslldq $0x5,%xmm0
1277d0b3732eSbholler	por    %xmm2,%xmm0
1278d0b3732eSbholler	movdqa %xmm3,(%rcx)
1279d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1280d0b3732eSbholler
1281d0b3732eSbholler	add    $0x20,%rcx
1282d0b3732eSbholler	cmp    $0x20,%r8
1283d0b3732eSbholler	jge    L(movdqa11)
1284d0b3732eSbholler	jmp    L(movdqa_epi)
1285d0b3732eSbholler
1286d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa12): SSE2 copy loop for a source that is 12 bytes past a
	 * 16-byte boundary (entry 12 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 12 bytes) | (next << 4 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1287d0b3732eSbhollerL(movdqa12):
1288d0b3732eSbholler	sub    $0x20,%r8
1289d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1290d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1291d0b3732eSbholler	add    $0x20,%rdx
1292d0b3732eSbholler
1293d0b3732eSbholler	psrldq $0xc,%xmm1
1294d0b3732eSbholler	movdqa %xmm3,%xmm2
1295d0b3732eSbholler	pslldq $0x4,%xmm3
1296d0b3732eSbholler	por    %xmm1,%xmm3
1297d0b3732eSbholler
1298d0b3732eSbholler	psrldq $0xc,%xmm2
1299d0b3732eSbholler	movdqa %xmm0,%xmm1
1300d0b3732eSbholler	pslldq $0x4,%xmm0
1301d0b3732eSbholler	por    %xmm2,%xmm0
1302d0b3732eSbholler	movdqa %xmm3,(%rcx)
1303d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1304d0b3732eSbholler
1305d0b3732eSbholler	add    $0x20,%rcx
1306d0b3732eSbholler	cmp    $0x20,%r8
1307d0b3732eSbholler	jge    L(movdqa12)
1308d0b3732eSbholler	jmp    L(movdqa_epi)
1309d0b3732eSbholler
1310d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa13): SSE2 copy loop for a source that is 13 bytes past a
	 * 16-byte boundary (entry 13 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 13 bytes) | (next << 3 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1311d0b3732eSbhollerL(movdqa13):
1312d0b3732eSbholler	sub    $0x20,%r8
1313d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1314d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1315d0b3732eSbholler	add    $0x20,%rdx
1316d0b3732eSbholler
1317d0b3732eSbholler	psrldq $0xd,%xmm1
1318d0b3732eSbholler	movdqa %xmm3,%xmm2
1319d0b3732eSbholler	pslldq $0x3,%xmm3
1320d0b3732eSbholler	por    %xmm1,%xmm3
1321d0b3732eSbholler
1322d0b3732eSbholler	psrldq $0xd,%xmm2
1323d0b3732eSbholler	movdqa %xmm0,%xmm1
1324d0b3732eSbholler	pslldq $0x3,%xmm0
1325d0b3732eSbholler	por    %xmm2,%xmm0
1326d0b3732eSbholler	movdqa %xmm3,(%rcx)
1327d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1328d0b3732eSbholler
1329d0b3732eSbholler	add    $0x20,%rcx
1330d0b3732eSbholler	cmp    $0x20,%r8
1331d0b3732eSbholler	jge    L(movdqa13)
1332d0b3732eSbholler	jmp    L(movdqa_epi)
1333d0b3732eSbholler
1334d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa14): SSE2 copy loop for a source that is 14 bytes past a
	 * 16-byte boundary (entry 14 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 14 bytes) | (next << 2 bytes) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare), then goes to
	 * L(movdqa_epi).
	 */
1335d0b3732eSbhollerL(movdqa14):
1336d0b3732eSbholler	sub    $0x20,%r8
1337d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1338d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1339d0b3732eSbholler	add    $0x20,%rdx
1340d0b3732eSbholler
1341d0b3732eSbholler	psrldq $0xe,%xmm1
1342d0b3732eSbholler	movdqa %xmm3,%xmm2
1343d0b3732eSbholler	pslldq $0x2,%xmm3
1344d0b3732eSbholler	por    %xmm1,%xmm3
1345d0b3732eSbholler
1346d0b3732eSbholler	psrldq $0xe,%xmm2
1347d0b3732eSbholler	movdqa %xmm0,%xmm1
1348d0b3732eSbholler	pslldq $0x2,%xmm0
1349d0b3732eSbholler	por    %xmm2,%xmm0
1350d0b3732eSbholler	movdqa %xmm3,(%rcx)
1351d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1352d0b3732eSbholler
1353d0b3732eSbholler	add    $0x20,%rcx
1354d0b3732eSbholler	cmp    $0x20,%r8
1355d0b3732eSbholler	jge    L(movdqa14)
1356d0b3732eSbholler	jmp    L(movdqa_epi)
1357d0b3732eSbholler
1358d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa15): SSE2 copy loop for a source that is 15 bytes past a
	 * 16-byte boundary (entry 15 of L(SSE_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Each pass loads two more aligned blocks, forms two destination
	 * blocks as (previous >> 15 bytes) | (next << 1 byte) and stores
	 * 32 bytes; %xmm1 carries the last loaded block into the next pass.
	 * Repeats while %r8 >= 0x20 (signed compare).  The closing jmp is
	 * commented out: on loop exit execution falls through the alignment
	 * padding into L(movdqa_epi), which follows directly.
	 */
1359d0b3732eSbhollerL(movdqa15):
1360d0b3732eSbholler	sub    $0x20,%r8
1361d0b3732eSbholler	movdqa 0x10(%rdx),%xmm3
1362d0b3732eSbholler	movdqa 0x20(%rdx),%xmm0
1363d0b3732eSbholler	add    $0x20,%rdx
1364d0b3732eSbholler
1365d0b3732eSbholler	psrldq $0xf,%xmm1
1366d0b3732eSbholler	movdqa %xmm3,%xmm2
1367d0b3732eSbholler	pslldq $0x1,%xmm3
1368d0b3732eSbholler	por    %xmm1,%xmm3
1369d0b3732eSbholler
1370d0b3732eSbholler	psrldq $0xf,%xmm2
1371d0b3732eSbholler	movdqa %xmm0,%xmm1
1372d0b3732eSbholler	pslldq $0x1,%xmm0
1373d0b3732eSbholler	por    %xmm2,%xmm0
1374d0b3732eSbholler	movdqa %xmm3,(%rcx)
1375d0b3732eSbholler	movdqa %xmm0,0x10(%rcx)
1376d0b3732eSbholler
1377d0b3732eSbholler	add    $0x20,%rcx
1378d0b3732eSbholler	cmp    $0x20,%r8
1379d0b3732eSbholler	jge    L(movdqa15)
1380d0b3732eSbholler	#jmp   L(movdqa_epi)
1381d0b3732eSbholler
1382d0b3732eSbholler	.balign 16
	/*
	 * L(movdqa_epi): common exit of the misaligned-source copy loops.
	 * The loops run with %rdx rounded down to a 16-byte boundary, so
	 * %r11 (the source misalignment) is added back first.  Both %rcx
	 * and %rdx are then advanced by the remaining count in %r8, which
	 * leaves them pointing one byte past the end of their regions, and
	 * control is dispatched through the L(fwdPxQx) offset table using
	 * %r8 as the index.
	 * NOTE(review): L(fwdPxQx) is defined outside this part of the file;
	 * presumably entry n selects the L(PxQy) ladder that copies the
	 * last n bytes -- confirm against the table.
	 */
1383d0b3732eSbhollerL(movdqa_epi):
1384d0b3732eSbholler	lea    L(fwdPxQx)(%rip),%r10
1385d0b3732eSbholler	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
1386d0b3732eSbholler	add    %r8,%rcx
1387d0b3732eSbholler	add    %r8,%rdx
1388d0b3732eSbholler
1389d0b3732eSbholler	movslq (%r10,%r8,4),%r9
1390d0b3732eSbholler	lea    (%r9,%r10,1),%r10
1391d0b3732eSbholler	jmpq   *%r10
1392d0b3732eSbholler
1393d0b3732eSbholler	.balign 16
	/*
	 * L(mov3dqa1): SSSE3 copy loop for a source that is 1 byte past a
	 * 16-byte boundary (entry 1 of L(SSSE3_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Main loop: each pass loads three more aligned blocks and stores
	 * 48 bytes; every destination block is produced by palignr $0x1 from
	 * a source block and the block before it.  palignr is emitted as raw
	 * .byte sequences, with the intended instruction given in the
	 * comment above each sequence.  %xmm1 carries the last loaded block
	 * into the next pass.  The cmp is issued early; the instructions
	 * between it and the jge (movdqa, palignr, lea) leave the flags
	 * untouched.  The loop repeats while %r8 >= 0x30 (signed compare).
	 * Tail: up to two further 16-byte blocks are copied the same way
	 * while %r8 >= 0x10, then control goes to L(movdqa_epi).
	 */
1394d0b3732eSbhollerL(mov3dqa1):
1395d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
1396d0b3732eSbholler	sub	$0x30,%r8
1397d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
1398d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
1399d0b3732eSbholler	lea	0x30(%rdx),%rdx
1400d0b3732eSbholler	cmp	$0x30,%r8
1401d0b3732eSbholler
1402d0b3732eSbholler	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
1403d0b3732eSbholler	#palignr	$0x1,%xmm1,%xmm3
1404d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1405d0b3732eSbholler	.byte	0xd9,0x01
1406d0b3732eSbholler	movdqa	%xmm3,(%rcx)      # store it
1407d0b3732eSbholler
1408d0b3732eSbholler	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
1409d0b3732eSbholler	#palignr	$0x1,%xmm2,%xmm0
1410d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1411d0b3732eSbholler	.byte	0xc2,0x01
1412d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)  # store it
1413d0b3732eSbholler
1414d0b3732eSbholler	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
1415d0b3732eSbholler	#palignr	$0x1,%xmm4,%xmm5
1416d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1417d0b3732eSbholler	.byte	0xec,0x01
1418d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)  # store it
1419d0b3732eSbholler
1420d0b3732eSbholler	lea	0x30(%rcx),%rcx
1421d0b3732eSbholler	jge	L(mov3dqa1)
1422d0b3732eSbholler
	/* Tail: fewer than 0x30 bytes left; copy one more block if possible. */
1423d0b3732eSbholler	cmp	$0x10,%r8
1424d0b3732eSbholler	jl	L(movdqa_epi)
1425d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1426d0b3732eSbholler	sub	$0x10,%r8
1427d0b3732eSbholler	lea	0x10(%rdx),%rdx
1428d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1429d0b3732eSbholler	#palignr	$0x1,%xmm1,%xmm3
1430d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1431d0b3732eSbholler	.byte	0xd9,0x01
1432d0b3732eSbholler
1433d0b3732eSbholler	cmp	$0x10,%r8
1434d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1435d0b3732eSbholler	lea	0x10(%rcx),%rcx
1436d0b3732eSbholler	jl	L(movdqa_epi)
1437d0b3732eSbholler
	/* Second and last tail block. */
1438d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1439d0b3732eSbholler	sub	$0x10,%r8
1440d0b3732eSbholler	lea	0x10(%rdx),%rdx
1441d0b3732eSbholler	#palignr	$0x1,%xmm2,%xmm0
1442d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1443d0b3732eSbholler	.byte	0xc2,0x01
1444d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1445d0b3732eSbholler	lea	0x10(%rcx),%rcx
1446d0b3732eSbholler	jmp	L(movdqa_epi)
1447d0b3732eSbholler
1448d0b3732eSbholler	.balign 16
	/*
	 * L(mov3dqa2): SSSE3 copy loop for a source that is 2 bytes past a
	 * 16-byte boundary (entry 2 of L(SSSE3_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Main loop: each pass loads three more aligned blocks and stores
	 * 48 bytes; every destination block is produced by palignr $0x2 from
	 * a source block and the block before it.  palignr is emitted as raw
	 * .byte sequences, with the intended instruction given in the
	 * comment above each sequence.  %xmm1 carries the last loaded block
	 * into the next pass.  The cmp is issued early; the instructions
	 * between it and the jge (movdqa, palignr, lea) leave the flags
	 * untouched.  The loop repeats while %r8 >= 0x30 (signed compare).
	 * Tail: up to two further 16-byte blocks are copied the same way
	 * while %r8 >= 0x10, then control goes to L(movdqa_epi).
	 */
1449d0b3732eSbhollerL(mov3dqa2):
1450d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1451d0b3732eSbholler	sub	$0x30,%r8
1452d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1453d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1454d0b3732eSbholler	lea	0x30(%rdx),%rdx
1455d0b3732eSbholler	cmp	$0x30,%r8
1456d0b3732eSbholler
1457d0b3732eSbholler	movdqa	%xmm3,%xmm2
1458d0b3732eSbholler	#palignr	$0x2,%xmm1,%xmm3
1459d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1460d0b3732eSbholler	.byte	0xd9,0x02
1461d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1462d0b3732eSbholler
1463d0b3732eSbholler	movdqa	%xmm0,%xmm4
1464d0b3732eSbholler	#palignr	$0x2,%xmm2,%xmm0
1465d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1466d0b3732eSbholler	.byte	0xc2,0x02
1467d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1468d0b3732eSbholler
1469d0b3732eSbholler	movdqa	%xmm5,%xmm1
1470d0b3732eSbholler	#palignr	$0x2,%xmm4,%xmm5
1471d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1472d0b3732eSbholler	.byte	0xec,0x02
1473d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1474d0b3732eSbholler
1475d0b3732eSbholler	lea	0x30(%rcx),%rcx
1476d0b3732eSbholler	jge	L(mov3dqa2)
1477d0b3732eSbholler
	/* Tail: fewer than 0x30 bytes left; copy one more block if possible. */
1478d0b3732eSbholler	cmp	$0x10,%r8
1479d0b3732eSbholler	jl	L(movdqa_epi)
1480d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1481d0b3732eSbholler	sub	$0x10,%r8
1482d0b3732eSbholler	lea	0x10(%rdx),%rdx
1483d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1484d0b3732eSbholler	#palignr	$0x2,%xmm1,%xmm3
1485d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1486d0b3732eSbholler	.byte	0xd9,0x02
1487d0b3732eSbholler
1488d0b3732eSbholler	cmp	$0x10,%r8
1489d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1490d0b3732eSbholler	lea	0x10(%rcx),%rcx
1491d0b3732eSbholler	jl	L(movdqa_epi)
1492d0b3732eSbholler
	/* Second and last tail block. */
1493d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1494d0b3732eSbholler	sub	$0x10,%r8
1495d0b3732eSbholler	lea	0x10(%rdx),%rdx
1496d0b3732eSbholler	#palignr	$0x2,%xmm2,%xmm0
1497d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1498d0b3732eSbholler	.byte	0xc2,0x02
1499d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1500d0b3732eSbholler	lea	0x10(%rcx),%rcx
1501d0b3732eSbholler	jmp	L(movdqa_epi)
1502d0b3732eSbholler
1503d0b3732eSbholler	.balign 16
	/*
	 * L(mov3dqa3): SSSE3 copy loop for a source that is 3 bytes past a
	 * 16-byte boundary (entry 3 of L(SSSE3_src)).
	 * On entry %rdx is the source rounded down to a 16-byte boundary,
	 * %rcx the 16-byte aligned destination, %r8 the bytes left and
	 * %xmm1 the aligned source block at (%rdx).
	 * Main loop: each pass loads three more aligned blocks and stores
	 * 48 bytes; every destination block is produced by palignr $0x3 from
	 * a source block and the block before it.  palignr is emitted as raw
	 * .byte sequences, with the intended instruction given in the
	 * comment above each sequence.  %xmm1 carries the last loaded block
	 * into the next pass.  The cmp is issued early; the instructions
	 * between it and the jge (movdqa, palignr, lea) leave the flags
	 * untouched.  The loop repeats while %r8 >= 0x30 (signed compare).
	 * Tail: up to two further 16-byte blocks are copied the same way
	 * while %r8 >= 0x10, then control goes to L(movdqa_epi).
	 */
1504d0b3732eSbhollerL(mov3dqa3):
1505d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1506d0b3732eSbholler	sub	$0x30,%r8
1507d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1508d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1509d0b3732eSbholler	lea	0x30(%rdx),%rdx
1510d0b3732eSbholler	cmp	$0x30,%r8
1511d0b3732eSbholler
1512d0b3732eSbholler	movdqa	%xmm3,%xmm2
1513d0b3732eSbholler	#palignr	$0x3,%xmm1,%xmm3
1514d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1515d0b3732eSbholler	.byte	0xd9,0x03
1516d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1517d0b3732eSbholler
1518d0b3732eSbholler	movdqa	%xmm0,%xmm4
1519d0b3732eSbholler	#palignr	$0x3,%xmm2,%xmm0
1520d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1521d0b3732eSbholler	.byte	0xc2,0x03
1522d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1523d0b3732eSbholler
1524d0b3732eSbholler	movdqa	%xmm5,%xmm1
1525d0b3732eSbholler	#palignr	$0x3,%xmm4,%xmm5
1526d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1527d0b3732eSbholler	.byte	0xec,0x03
1528d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1529d0b3732eSbholler
1530d0b3732eSbholler	lea	0x30(%rcx),%rcx
1531d0b3732eSbholler	jge	L(mov3dqa3)
1532d0b3732eSbholler
	/* Tail: fewer than 0x30 bytes left; copy one more block if possible. */
1533d0b3732eSbholler	cmp	$0x10,%r8
1534d0b3732eSbholler	jl	L(movdqa_epi)
1535d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1536d0b3732eSbholler	sub	$0x10,%r8
1537d0b3732eSbholler	lea	0x10(%rdx),%rdx
1538d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1539d0b3732eSbholler	#palignr	$0x3,%xmm1,%xmm3
1540d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1541d0b3732eSbholler	.byte	0xd9,0x03
1542d0b3732eSbholler
1543d0b3732eSbholler	cmp	$0x10,%r8
1544d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1545d0b3732eSbholler	lea	0x10(%rcx),%rcx
1546d0b3732eSbholler	jl	L(movdqa_epi)
1547d0b3732eSbholler
	/* Second and last tail block. */
1548d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1549d0b3732eSbholler	sub	$0x10,%r8
1550d0b3732eSbholler	lea	0x10(%rdx),%rdx
1551d0b3732eSbholler	#palignr	$0x3,%xmm2,%xmm0
1552d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1553d0b3732eSbholler	.byte	0xc2,0x03
1554d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1555d0b3732eSbholler	lea	0x10(%rcx),%rcx
1556d0b3732eSbholler	jmp	L(movdqa_epi)
1557d0b3732eSbholler
1558d0b3732eSbholler	.balign 16
1559d0b3732eSbhollerL(mov3dqa4):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 4 (palignr $0x4).
	 * Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx) with
	 * movdqa; each is paired with the chunk read just before it and
	 * palignr shifts the pair right by 4 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1560d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1561d0b3732eSbholler	sub	$0x30,%r8
1562d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1563d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1564d0b3732eSbholler	lea	0x30(%rdx),%rdx
1565d0b3732eSbholler	cmp	$0x30,%r8
1566d0b3732eSbholler
1567d0b3732eSbholler	movdqa	%xmm3,%xmm2
1568d0b3732eSbholler	#palignr	$0x4,%xmm1,%xmm3
1569d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1570d0b3732eSbholler	.byte	0xd9,0x04
1571d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1572d0b3732eSbholler
1573d0b3732eSbholler	movdqa	%xmm0,%xmm4
1574d0b3732eSbholler	#palignr	$0x4,%xmm2,%xmm0
1575d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1576d0b3732eSbholler	.byte	0xc2,0x04
1577d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1578d0b3732eSbholler
1579d0b3732eSbholler	movdqa	%xmm5,%xmm1
1580d0b3732eSbholler	#palignr	$0x4,%xmm4,%xmm5
1581d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1582d0b3732eSbholler	.byte	0xec,0x04
1583d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1584d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1585d0b3732eSbholler	lea	0x30(%rcx),%rcx
1586d0b3732eSbholler	jge	L(mov3dqa4)
1587d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1588d0b3732eSbholler	cmp	$0x10,%r8
1589d0b3732eSbholler	jl	L(movdqa_epi)
1590d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1591d0b3732eSbholler	sub	$0x10,%r8
1592d0b3732eSbholler	lea	0x10(%rdx),%rdx
1593d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1594d0b3732eSbholler	#palignr	$0x4,%xmm1,%xmm3
1595d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1596d0b3732eSbholler	.byte	0xd9,0x04
1597d0b3732eSbholler
1598d0b3732eSbholler	cmp	$0x10,%r8
1599d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1600d0b3732eSbholler	lea	0x10(%rcx),%rcx
1601d0b3732eSbholler	jl	L(movdqa_epi)
1602d0b3732eSbholler
1603d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1604d0b3732eSbholler	sub	$0x10,%r8
1605d0b3732eSbholler	lea	0x10(%rdx),%rdx
1606d0b3732eSbholler	#palignr	$0x4,%xmm2,%xmm0
1607d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1608d0b3732eSbholler	.byte	0xc2,0x04
1609d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1610d0b3732eSbholler	lea	0x10(%rcx),%rcx
1611d0b3732eSbholler	jmp	L(movdqa_epi)
1612d0b3732eSbholler
1613d0b3732eSbholler	.balign 16
1614d0b3732eSbhollerL(mov3dqa5):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 5 (palignr $0x5).
	 * Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx) with
	 * movdqa; each is paired with the chunk read just before it and
	 * palignr shifts the pair right by 5 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1615d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1616d0b3732eSbholler	sub	$0x30,%r8
1617d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1618d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1619d0b3732eSbholler	lea	0x30(%rdx),%rdx
1620d0b3732eSbholler	cmp	$0x30,%r8
1621d0b3732eSbholler
1622d0b3732eSbholler	movdqa	%xmm3,%xmm2
1623d0b3732eSbholler	#palignr	$0x5,%xmm1,%xmm3
1624d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1625d0b3732eSbholler	.byte	0xd9,0x05
1626d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1627d0b3732eSbholler
1628d0b3732eSbholler	movdqa	%xmm0,%xmm4
1629d0b3732eSbholler	#palignr	$0x5,%xmm2,%xmm0
1630d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1631d0b3732eSbholler	.byte	0xc2,0x05
1632d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1633d0b3732eSbholler
1634d0b3732eSbholler	movdqa	%xmm5,%xmm1
1635d0b3732eSbholler	#palignr	$0x5,%xmm4,%xmm5
1636d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1637d0b3732eSbholler	.byte	0xec,0x05
1638d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1639d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1640d0b3732eSbholler	lea	0x30(%rcx),%rcx
1641d0b3732eSbholler	jge	L(mov3dqa5)
1642d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1643d0b3732eSbholler	cmp	$0x10,%r8
1644d0b3732eSbholler	jl	L(movdqa_epi)
1645d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1646d0b3732eSbholler	sub	$0x10,%r8
1647d0b3732eSbholler	lea	0x10(%rdx),%rdx
1648d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1649d0b3732eSbholler	#palignr	$0x5,%xmm1,%xmm3
1650d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1651d0b3732eSbholler	.byte	0xd9,0x05
1652d0b3732eSbholler
1653d0b3732eSbholler	cmp	$0x10,%r8
1654d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1655d0b3732eSbholler	lea	0x10(%rcx),%rcx
1656d0b3732eSbholler	jl	L(movdqa_epi)
1657d0b3732eSbholler
1658d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1659d0b3732eSbholler	sub	$0x10,%r8
1660d0b3732eSbholler	lea	0x10(%rdx),%rdx
1661d0b3732eSbholler	#palignr	$0x5,%xmm2,%xmm0
1662d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1663d0b3732eSbholler	.byte	0xc2,0x05
1664d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1665d0b3732eSbholler	lea	0x10(%rcx),%rcx
1666d0b3732eSbholler	jmp	L(movdqa_epi)
1667d0b3732eSbholler
1668d0b3732eSbholler	.balign 16
1669d0b3732eSbhollerL(mov3dqa6):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 6 (palignr $0x6).
	 * Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx) with
	 * movdqa; each is paired with the chunk read just before it and
	 * palignr shifts the pair right by 6 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1670d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1671d0b3732eSbholler	sub	$0x30,%r8
1672d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1673d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1674d0b3732eSbholler	lea	0x30(%rdx),%rdx
1675d0b3732eSbholler	cmp	$0x30,%r8
1676d0b3732eSbholler
1677d0b3732eSbholler	movdqa	%xmm3,%xmm2
1678d0b3732eSbholler	#palignr	$0x6,%xmm1,%xmm3
1679d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1680d0b3732eSbholler	.byte	0xd9,0x06
1681d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1682d0b3732eSbholler
1683d0b3732eSbholler	movdqa	%xmm0,%xmm4
1684d0b3732eSbholler	#palignr	$0x6,%xmm2,%xmm0
1685d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1686d0b3732eSbholler	.byte	0xc2,0x06
1687d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1688d0b3732eSbholler
1689d0b3732eSbholler	movdqa	%xmm5,%xmm1
1690d0b3732eSbholler	#palignr	$0x6,%xmm4,%xmm5
1691d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1692d0b3732eSbholler	.byte	0xec,0x06
1693d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1694d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1695d0b3732eSbholler	lea	0x30(%rcx),%rcx
1696d0b3732eSbholler	jge	L(mov3dqa6)
1697d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1698d0b3732eSbholler	cmp	$0x10,%r8
1699d0b3732eSbholler	jl	L(movdqa_epi)
1700d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1701d0b3732eSbholler	sub	$0x10,%r8
1702d0b3732eSbholler	lea	0x10(%rdx),%rdx
1703d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1704d0b3732eSbholler	#palignr	$0x6,%xmm1,%xmm3
1705d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1706d0b3732eSbholler	.byte	0xd9,0x06
1707d0b3732eSbholler
1708d0b3732eSbholler	cmp	$0x10,%r8
1709d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1710d0b3732eSbholler	lea	0x10(%rcx),%rcx
1711d0b3732eSbholler	jl	L(movdqa_epi)
1712d0b3732eSbholler
1713d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1714d0b3732eSbholler	sub	$0x10,%r8
1715d0b3732eSbholler	lea	0x10(%rdx),%rdx
1716d0b3732eSbholler	#palignr	$0x6,%xmm2,%xmm0
1717d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1718d0b3732eSbholler	.byte	0xc2,0x06
1719d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1720d0b3732eSbholler	lea	0x10(%rcx),%rcx
1721d0b3732eSbholler	jmp	L(movdqa_epi)
1722d0b3732eSbholler
1723d0b3732eSbholler	.balign 16
1724d0b3732eSbhollerL(mov3dqa7):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 7 (palignr $0x7).
	 * Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx) with
	 * movdqa; each is paired with the chunk read just before it and
	 * palignr shifts the pair right by 7 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1725d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1726d0b3732eSbholler	sub	$0x30,%r8
1727d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1728d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1729d0b3732eSbholler	lea	0x30(%rdx),%rdx
1730d0b3732eSbholler	cmp	$0x30,%r8
1731d0b3732eSbholler
1732d0b3732eSbholler	movdqa	%xmm3,%xmm2
1733d0b3732eSbholler	#palignr	$0x7,%xmm1,%xmm3
1734d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1735d0b3732eSbholler	.byte	0xd9,0x07
1736d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1737d0b3732eSbholler
1738d0b3732eSbholler	movdqa	%xmm0,%xmm4
1739d0b3732eSbholler	#palignr	$0x7,%xmm2,%xmm0
1740d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1741d0b3732eSbholler	.byte	0xc2,0x07
1742d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1743d0b3732eSbholler
1744d0b3732eSbholler	movdqa	%xmm5,%xmm1
1745d0b3732eSbholler	#palignr	$0x7,%xmm4,%xmm5
1746d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1747d0b3732eSbholler	.byte	0xec,0x07
1748d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1749d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1750d0b3732eSbholler	lea	0x30(%rcx),%rcx
1751d0b3732eSbholler	jge	L(mov3dqa7)
1752d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1753d0b3732eSbholler	cmp	$0x10,%r8
1754d0b3732eSbholler	jl	L(movdqa_epi)
1755d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1756d0b3732eSbholler	sub	$0x10,%r8
1757d0b3732eSbholler	lea	0x10(%rdx),%rdx
1758d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1759d0b3732eSbholler	#palignr	$0x7,%xmm1,%xmm3
1760d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1761d0b3732eSbholler	.byte	0xd9,0x07
1762d0b3732eSbholler
1763d0b3732eSbholler	cmp	$0x10,%r8
1764d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1765d0b3732eSbholler	lea	0x10(%rcx),%rcx
1766d0b3732eSbholler	jl	L(movdqa_epi)
1767d0b3732eSbholler
1768d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1769d0b3732eSbholler	sub	$0x10,%r8
1770d0b3732eSbholler	lea	0x10(%rdx),%rdx
1771d0b3732eSbholler	#palignr	$0x7,%xmm2,%xmm0
1772d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1773d0b3732eSbholler	.byte	0xc2,0x07
1774d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1775d0b3732eSbholler	lea	0x10(%rcx),%rcx
1776d0b3732eSbholler	jmp	L(movdqa_epi)
1777d0b3732eSbholler
1778d0b3732eSbholler	.balign 16
1779d0b3732eSbhollerL(mov3dqa9):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 9 (palignr $0x9).
	 * Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx) with
	 * movdqa; each is paired with the chunk read just before it and
	 * palignr shifts the pair right by 9 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1780d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1781d0b3732eSbholler	sub	$0x30,%r8
1782d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1783d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1784d0b3732eSbholler	lea	0x30(%rdx),%rdx
1785d0b3732eSbholler	cmp	$0x30,%r8
1786d0b3732eSbholler
1787d0b3732eSbholler	movdqa	%xmm3,%xmm2
1788d0b3732eSbholler	#palignr	$0x9,%xmm1,%xmm3
1789d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1790d0b3732eSbholler	.byte	0xd9,0x09
1791d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1792d0b3732eSbholler
1793d0b3732eSbholler	movdqa	%xmm0,%xmm4
1794d0b3732eSbholler	#palignr	$0x9,%xmm2,%xmm0
1795d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1796d0b3732eSbholler	.byte	0xc2,0x09
1797d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1798d0b3732eSbholler
1799d0b3732eSbholler	movdqa	%xmm5,%xmm1
1800d0b3732eSbholler	#palignr	$0x9,%xmm4,%xmm5
1801d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1802d0b3732eSbholler	.byte	0xec,0x09
1803d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1804d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1805d0b3732eSbholler	lea	0x30(%rcx),%rcx
1806d0b3732eSbholler	jge	L(mov3dqa9)
1807d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1808d0b3732eSbholler	cmp	$0x10,%r8
1809d0b3732eSbholler	jl	L(movdqa_epi)
1810d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1811d0b3732eSbholler	sub	$0x10,%r8
1812d0b3732eSbholler	lea	0x10(%rdx),%rdx
1813d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1814d0b3732eSbholler	#palignr	$0x9,%xmm1,%xmm3
1815d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1816d0b3732eSbholler	.byte	0xd9,0x09
1817d0b3732eSbholler
1818d0b3732eSbholler	cmp	$0x10,%r8
1819d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1820d0b3732eSbholler	lea	0x10(%rcx),%rcx
1821d0b3732eSbholler	jl	L(movdqa_epi)
1822d0b3732eSbholler
1823d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1824d0b3732eSbholler	sub	$0x10,%r8
1825d0b3732eSbholler	lea	0x10(%rdx),%rdx
1826d0b3732eSbholler	#palignr	$0x9,%xmm2,%xmm0
1827d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1828d0b3732eSbholler	.byte	0xc2,0x09
1829d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1830d0b3732eSbholler	lea	0x10(%rcx),%rcx
1831d0b3732eSbholler	jmp	L(movdqa_epi)
1832d0b3732eSbholler
1833d0b3732eSbholler	.balign 16
1834d0b3732eSbhollerL(mov3dqa10):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 10 (palignr
	 * $0xa).  Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx)
	 * with movdqa; each is paired with the chunk read just before it
	 * and palignr shifts the pair right by 10 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1835d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1836d0b3732eSbholler	sub	$0x30,%r8
1837d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1838d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1839d0b3732eSbholler	lea	0x30(%rdx),%rdx
1840d0b3732eSbholler	cmp	$0x30,%r8
1841d0b3732eSbholler
1842d0b3732eSbholler	movdqa	%xmm3,%xmm2
1843d0b3732eSbholler	#palignr	$0xa,%xmm1,%xmm3
1844d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1845d0b3732eSbholler	.byte	0xd9,0x0a
1846d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1847d0b3732eSbholler
1848d0b3732eSbholler	movdqa	%xmm0,%xmm4
1849d0b3732eSbholler	#palignr	$0xa,%xmm2,%xmm0
1850d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1851d0b3732eSbholler	.byte	0xc2,0x0a
1852d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1853d0b3732eSbholler
1854d0b3732eSbholler	movdqa	%xmm5,%xmm1
1855d0b3732eSbholler	#palignr	$0xa,%xmm4,%xmm5
1856d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1857d0b3732eSbholler	.byte	0xec,0x0a
1858d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1859d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1860d0b3732eSbholler	lea	0x30(%rcx),%rcx
1861d0b3732eSbholler	jge	L(mov3dqa10)
1862d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1863d0b3732eSbholler	cmp	$0x10,%r8
1864d0b3732eSbholler	jl	L(movdqa_epi)
1865d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1866d0b3732eSbholler	sub	$0x10,%r8
1867d0b3732eSbholler	lea	0x10(%rdx),%rdx
1868d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1869d0b3732eSbholler	#palignr	$0xa,%xmm1,%xmm3
1870d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1871d0b3732eSbholler	.byte	0xd9,0x0a
1872d0b3732eSbholler
1873d0b3732eSbholler	cmp	$0x10,%r8
1874d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1875d0b3732eSbholler	lea	0x10(%rcx),%rcx
1876d0b3732eSbholler	jl	L(movdqa_epi)
1877d0b3732eSbholler
1878d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1879d0b3732eSbholler	sub	$0x10,%r8
1880d0b3732eSbholler	lea	0x10(%rdx),%rdx
1881d0b3732eSbholler	#palignr	$0xa,%xmm2,%xmm0
1882d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1883d0b3732eSbholler	.byte	0xc2,0x0a
1884d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1885d0b3732eSbholler	lea	0x10(%rcx),%rcx
1886d0b3732eSbholler	jmp	L(movdqa_epi)
1887d0b3732eSbholler
1888d0b3732eSbholler	.balign 16
1889d0b3732eSbhollerL(mov3dqa11):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 11 (palignr
	 * $0xb).  Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx)
	 * with movdqa; each is paired with the chunk read just before it
	 * and palignr shifts the pair right by 11 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1890d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1891d0b3732eSbholler	sub	$0x30,%r8
1892d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1893d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1894d0b3732eSbholler	lea	0x30(%rdx),%rdx
1895d0b3732eSbholler	cmp	$0x30,%r8
1896d0b3732eSbholler
1897d0b3732eSbholler	movdqa	%xmm3,%xmm2
1898d0b3732eSbholler	#palignr	$0xb,%xmm1,%xmm3
1899d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1900d0b3732eSbholler	.byte	0xd9,0x0b
1901d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1902d0b3732eSbholler
1903d0b3732eSbholler	movdqa	%xmm0,%xmm4
1904d0b3732eSbholler	#palignr	$0xb,%xmm2,%xmm0
1905d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1906d0b3732eSbholler	.byte	0xc2,0x0b
1907d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1908d0b3732eSbholler
1909d0b3732eSbholler	movdqa	%xmm5,%xmm1
1910d0b3732eSbholler	#palignr	$0xb,%xmm4,%xmm5
1911d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1912d0b3732eSbholler	.byte	0xec,0x0b
1913d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1914d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1915d0b3732eSbholler	lea	0x30(%rcx),%rcx
1916d0b3732eSbholler	jge	L(mov3dqa11)
1917d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1918d0b3732eSbholler	cmp	$0x10,%r8
1919d0b3732eSbholler	jl	L(movdqa_epi)
1920d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1921d0b3732eSbholler	sub	$0x10,%r8
1922d0b3732eSbholler	lea	0x10(%rdx),%rdx
1923d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1924d0b3732eSbholler	#palignr	$0xb,%xmm1,%xmm3
1925d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1926d0b3732eSbholler	.byte	0xd9,0x0b
1927d0b3732eSbholler
1928d0b3732eSbholler	cmp	$0x10,%r8
1929d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1930d0b3732eSbholler	lea	0x10(%rcx),%rcx
1931d0b3732eSbholler	jl	L(movdqa_epi)
1932d0b3732eSbholler
1933d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1934d0b3732eSbholler	sub	$0x10,%r8
1935d0b3732eSbholler	lea	0x10(%rdx),%rdx
1936d0b3732eSbholler	#palignr	$0xb,%xmm2,%xmm0
1937d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1938d0b3732eSbholler	.byte	0xc2,0x0b
1939d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1940d0b3732eSbholler	lea	0x10(%rcx),%rcx
1941d0b3732eSbholler	jmp	L(movdqa_epi)
1942d0b3732eSbholler
1943d0b3732eSbholler	.balign 16
1944d0b3732eSbhollerL(mov3dqa12):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 12 (palignr
	 * $0xc).  Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx)
	 * with movdqa; each is paired with the chunk read just before it
	 * and palignr shifts the pair right by 12 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
1945d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
1946d0b3732eSbholler	sub	$0x30,%r8
1947d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
1948d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
1949d0b3732eSbholler	lea	0x30(%rdx),%rdx
1950d0b3732eSbholler	cmp	$0x30,%r8
1951d0b3732eSbholler
1952d0b3732eSbholler	movdqa	%xmm3,%xmm2
1953d0b3732eSbholler	#palignr	$0xc,%xmm1,%xmm3
1954d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1955d0b3732eSbholler	.byte	0xd9,0x0c
1956d0b3732eSbholler	movdqa	%xmm3,(%rcx)
1957d0b3732eSbholler
1958d0b3732eSbholler	movdqa	%xmm0,%xmm4
1959d0b3732eSbholler	#palignr	$0xc,%xmm2,%xmm0
1960d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1961d0b3732eSbholler	.byte	0xc2,0x0c
1962d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
1963d0b3732eSbholler
1964d0b3732eSbholler	movdqa	%xmm5,%xmm1
1965d0b3732eSbholler	#palignr	$0xc,%xmm4,%xmm5
1966d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1967d0b3732eSbholler	.byte	0xec,0x0c
1968d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
1969d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
1970d0b3732eSbholler	lea	0x30(%rcx),%rcx
1971d0b3732eSbholler	jge	L(mov3dqa12)
1972d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
1973d0b3732eSbholler	cmp	$0x10,%r8
1974d0b3732eSbholler	jl	L(movdqa_epi)
1975d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1976d0b3732eSbholler	sub	$0x10,%r8
1977d0b3732eSbholler	lea	0x10(%rdx),%rdx
1978d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
1979d0b3732eSbholler	#palignr	$0xc,%xmm1,%xmm3
1980d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1981d0b3732eSbholler	.byte	0xd9,0x0c
1982d0b3732eSbholler
1983d0b3732eSbholler	cmp	$0x10,%r8
1984d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
1985d0b3732eSbholler	lea	0x10(%rcx),%rcx
1986d0b3732eSbholler	jl	L(movdqa_epi)
1987d0b3732eSbholler
1988d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1989d0b3732eSbholler	sub	$0x10,%r8
1990d0b3732eSbholler	lea	0x10(%rdx),%rdx
1991d0b3732eSbholler	#palignr	$0xc,%xmm2,%xmm0
1992d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
1993d0b3732eSbholler	.byte	0xc2,0x0c
1994d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
1995d0b3732eSbholler	lea	0x10(%rcx),%rcx
1996d0b3732eSbholler	jmp	L(movdqa_epi)
1997d0b3732eSbholler
1998d0b3732eSbholler	.balign 16
1999d0b3732eSbhollerL(mov3dqa13):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 13 (palignr
	 * $0xd).  Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx)
	 * with movdqa; each is paired with the chunk read just before it
	 * and palignr shifts the pair right by 13 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
2000d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
2001d0b3732eSbholler	sub	$0x30,%r8
2002d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
2003d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
2004d0b3732eSbholler	lea	0x30(%rdx),%rdx
2005d0b3732eSbholler	cmp	$0x30,%r8
2006d0b3732eSbholler
2007d0b3732eSbholler	movdqa	%xmm3,%xmm2
2008d0b3732eSbholler	#palignr	$0xd,%xmm1,%xmm3
2009d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2010d0b3732eSbholler	.byte	0xd9,0x0d
2011d0b3732eSbholler	movdqa	%xmm3,(%rcx)
2012d0b3732eSbholler
2013d0b3732eSbholler	movdqa	%xmm0,%xmm4
2014d0b3732eSbholler	#palignr	$0xd,%xmm2,%xmm0
2015d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2016d0b3732eSbholler	.byte	0xc2,0x0d
2017d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
2018d0b3732eSbholler
2019d0b3732eSbholler	movdqa	%xmm5,%xmm1
2020d0b3732eSbholler	#palignr	$0xd,%xmm4,%xmm5
2021d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2022d0b3732eSbholler	.byte	0xec,0x0d
2023d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
2024d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
2025d0b3732eSbholler	lea	0x30(%rcx),%rcx
2026d0b3732eSbholler	jge	L(mov3dqa13)
2027d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
2028d0b3732eSbholler	cmp	$0x10,%r8
2029d0b3732eSbholler	jl	L(movdqa_epi)
2030d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2031d0b3732eSbholler	sub	$0x10,%r8
2032d0b3732eSbholler	lea	0x10(%rdx),%rdx
2033d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
2034d0b3732eSbholler	#palignr	$0xd,%xmm1,%xmm3
2035d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2036d0b3732eSbholler	.byte	0xd9,0x0d
2037d0b3732eSbholler
2038d0b3732eSbholler	cmp	$0x10,%r8
2039d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
2040d0b3732eSbholler	lea	0x10(%rcx),%rcx
2041d0b3732eSbholler	jl	L(movdqa_epi)
2042d0b3732eSbholler
2043d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2044d0b3732eSbholler	sub	$0x10,%r8
2045d0b3732eSbholler	lea	0x10(%rdx),%rdx
2046d0b3732eSbholler	#palignr	$0xd,%xmm2,%xmm0
2047d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2048d0b3732eSbholler	.byte	0xc2,0x0d
2049d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
2050d0b3732eSbholler	lea	0x10(%rcx),%rcx
2051d0b3732eSbholler	jmp	L(movdqa_epi)
2052d0b3732eSbholler
2053d0b3732eSbholler	.balign 16
2054d0b3732eSbhollerL(mov3dqa14):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 14 (palignr
	 * $0xe).  Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx)
	 * with movdqa; each is paired with the chunk read just before it
	 * and palignr shifts the pair right by 14 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
2055d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
2056d0b3732eSbholler	sub	$0x30,%r8
2057d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
2058d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
2059d0b3732eSbholler	lea	0x30(%rdx),%rdx
2060d0b3732eSbholler	cmp	$0x30,%r8
2061d0b3732eSbholler
2062d0b3732eSbholler	movdqa	%xmm3,%xmm2
2063d0b3732eSbholler	#palignr	$0xe,%xmm1,%xmm3
2064d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2065d0b3732eSbholler	.byte	0xd9,0x0e
2066d0b3732eSbholler	movdqa	%xmm3,(%rcx)
2067d0b3732eSbholler
2068d0b3732eSbholler	movdqa	%xmm0,%xmm4
2069d0b3732eSbholler	#palignr	$0xe,%xmm2,%xmm0
2070d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2071d0b3732eSbholler	.byte	0xc2,0x0e
2072d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
2073d0b3732eSbholler
2074d0b3732eSbholler	movdqa	%xmm5,%xmm1
2075d0b3732eSbholler	#palignr	$0xe,%xmm4,%xmm5
2076d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2077d0b3732eSbholler	.byte	0xec,0x0e
2078d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
2079d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
2080d0b3732eSbholler	lea	0x30(%rcx),%rcx
2081d0b3732eSbholler	jge	L(mov3dqa14)
2082d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
2083d0b3732eSbholler	cmp	$0x10,%r8
2084d0b3732eSbholler	jl	L(movdqa_epi)
2085d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2086d0b3732eSbholler	sub	$0x10,%r8
2087d0b3732eSbholler	lea	0x10(%rdx),%rdx
2088d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
2089d0b3732eSbholler	#palignr	$0xe,%xmm1,%xmm3
2090d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2091d0b3732eSbholler	.byte	0xd9,0x0e
2092d0b3732eSbholler
2093d0b3732eSbholler	cmp	$0x10,%r8
2094d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
2095d0b3732eSbholler	lea	0x10(%rcx),%rcx
2096d0b3732eSbholler	jl	L(movdqa_epi)
2097d0b3732eSbholler
2098d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2099d0b3732eSbholler	sub	$0x10,%r8
2100d0b3732eSbholler	lea	0x10(%rdx),%rdx
2101d0b3732eSbholler	#palignr	$0xe,%xmm2,%xmm0
2102d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2103d0b3732eSbholler	.byte	0xc2,0x0e
2104d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
2105d0b3732eSbholler	lea	0x10(%rcx),%rcx
2106d0b3732eSbholler	jmp	L(movdqa_epi)
2107d0b3732eSbholler
2108d0b3732eSbholler	.balign 16
2109d0b3732eSbhollerL(mov3dqa15):
	/*
	 * 48-bytes-per-pass copy loop using a byte shift of 15 (palignr
	 * $0xf).  Three 16-byte chunks are read from 0x10/0x20/0x30(%rdx)
	 * with movdqa; each is paired with the chunk read just before it
	 * and palignr shifts the pair right by 15 bytes to form one 16-byte
	 * store to %rcx.  %xmm1 holds the most recently read chunk between
	 * passes; it is presumably preloaded by whoever jumps here, since
	 * nothing in this block sets it before the first palignr.
	 * Every load and store is movdqa, so %rdx and %rcx are expected to
	 * be 16-byte aligned.  %r8 counts the bytes left to copy.
	 *
	 * palignr is emitted as raw .byte data; the '#palignr' comment above
	 * each pair of .byte lines names the instruction being encoded.
	 */
2110d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3
2111d0b3732eSbholler	sub	$0x30,%r8
2112d0b3732eSbholler	movdqa	0x20(%rdx),%xmm0
2113d0b3732eSbholler	movdqa	0x30(%rdx),%xmm5
2114d0b3732eSbholler	lea	0x30(%rdx),%rdx
2115d0b3732eSbholler	cmp	$0x30,%r8
2116d0b3732eSbholler
2117d0b3732eSbholler	movdqa	%xmm3,%xmm2
2118d0b3732eSbholler	#palignr	$0xf,%xmm1,%xmm3
2119d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2120d0b3732eSbholler	.byte	0xd9,0x0f
2121d0b3732eSbholler	movdqa	%xmm3,(%rcx)
2122d0b3732eSbholler
2123d0b3732eSbholler	movdqa	%xmm0,%xmm4
2124d0b3732eSbholler	#palignr	$0xf,%xmm2,%xmm0
2125d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2126d0b3732eSbholler	.byte	0xc2,0x0f
2127d0b3732eSbholler	movdqa	%xmm0,0x10(%rcx)
2128d0b3732eSbholler
2129d0b3732eSbholler	movdqa	%xmm5,%xmm1
2130d0b3732eSbholler	#palignr	$0xf,%xmm4,%xmm5
2131d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2132d0b3732eSbholler	.byte	0xec,0x0f
2133d0b3732eSbholler	movdqa	%xmm5,0x20(%rcx)
2134d0b3732eSbholler
	/* lea leaves flags alone; jge tests the cmp $0x30,%r8 above */
2135d0b3732eSbholler	lea	0x30(%rcx),%rcx
2136d0b3732eSbholler	jge	L(mov3dqa15)
2137d0b3732eSbholler
	/*
	 * Under 0x30 bytes left: move at most two more 16-byte chunks the
	 * same way, then hand whatever remains to L(movdqa_epi).  The
	 * second cmp $0x10,%r8 is issued ahead of the store; movdqa and
	 * lea do not write flags, so the jl after them still sees it.
	 */
2138d0b3732eSbholler	cmp	$0x10,%r8
2139d0b3732eSbholler	jl	L(movdqa_epi)
2140d0b3732eSbholler	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2141d0b3732eSbholler	sub	$0x10,%r8
2142d0b3732eSbholler	lea	0x10(%rdx),%rdx
2143d0b3732eSbholler	movdqa	%xmm3,%xmm2		# save for use next concat
2144d0b3732eSbholler	#palignr	$0xf,%xmm1,%xmm3
2145d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2146d0b3732eSbholler	.byte	0xd9,0x0f
2147d0b3732eSbholler
2148d0b3732eSbholler	cmp	$0x10,%r8
2149d0b3732eSbholler	movdqa	%xmm3,(%rcx)      	# store it
2150d0b3732eSbholler	lea	0x10(%rcx),%rcx
2151d0b3732eSbholler	jl	L(movdqa_epi)
2152d0b3732eSbholler
2153d0b3732eSbholler	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2154d0b3732eSbholler	sub	$0x10,%r8
2155d0b3732eSbholler	lea	0x10(%rdx),%rdx
2156d0b3732eSbholler	#palignr	$0xf,%xmm2,%xmm0
2157d0b3732eSbholler	.byte	0x66,0x0f,0x3a,0x0f
2158d0b3732eSbholler	.byte	0xc2,0x0f
2159d0b3732eSbholler	movdqa	%xmm0,(%rcx)      	# store it
2160d0b3732eSbholler	lea	0x10(%rcx),%rcx
2161d0b3732eSbholler	jmp	L(movdqa_epi)
2162d0b3732eSbholler
2163d0b3732eSbholler	.balign 16
2164d0b3732eSbhollerL(sse2_nt_move):
	/*
	 * Streaming copy, 64 bytes per pass: unaligned loads (movdqu) from
	 * the source and non-temporal stores (movntdq) to the destination.
	 * movntdq needs a 16-byte aligned address, so %rcx is expected to
	 * be aligned on entry.  Both pointers are advanced first and the
	 * data is then addressed with negative offsets; %r8 (bytes left)
	 * is reduced with lea.
	 */
2165d0b3732eSbholler	lea	0x40(%rcx),%rcx
2166d0b3732eSbholler	lea	0x40(%rdx),%rdx
2167d0b3732eSbholler	lea	-0x40(%r8),%r8
2168d0b3732eSbholler
2169d0b3732eSbholler	/*
2170d0b3732eSbholler	 * doesn't matter if source is aligned for stuff out of cache.
2171d0b3732eSbholler	 * the mis-aligned penalty is masked by the slowness of main memory.
2172d0b3732eSbholler	 */
2173d0b3732eSbholler	prefetchnta 0x180(%rdx)
2174d0b3732eSbholler	movdqu	-0x40(%rdx),%xmm0
2175d0b3732eSbholler	movdqu	-0x30(%rdx),%xmm1
2176d0b3732eSbholler
2177d0b3732eSbholler	cmp	$0x40,%r8
2178d0b3732eSbholler	movntdq	%xmm0,-0x40(%rcx)
2179d0b3732eSbholler	movntdq	%xmm1,-0x30(%rcx)
2180d0b3732eSbholler
2181d0b3732eSbholler	movdqu	-0x20(%rdx),%xmm2
2182d0b3732eSbholler	movdqu	-0x10(%rdx),%xmm3
2183d0b3732eSbholler
2184d0b3732eSbholler	movntdq	%xmm2,-0x20(%rcx)
2185d0b3732eSbholler	movntdq	%xmm3,-0x10(%rcx)
2186d0b3732eSbholler
	/* flags are from cmp $0x40,%r8 above; the SSE moves preserve them */
2187d0b3732eSbholler	jge	L(sse2_nt_move)
2188d0b3732eSbholler
	/*
	 * Fewer than 0x40 bytes left.  Split %r8 into the bytes that form
	 * whole 16-byte chunks (%r9) and a sub-16 remainder (left in %r8),
	 * step both pointers past the whole chunks, and turn %r9 into a
	 * chunk count to index L(Fix16EndTable), whose entries are 32-bit
	 * offsets from the table itself.  sfence orders the non-temporal
	 * stores issued above ahead of the ordinary stores that follow.
	 */
2189d0b3732eSbholler	lea	L(Fix16EndTable)(%rip),%r10
2190d0b3732eSbholler	mov	%r8,%r9
2191d0b3732eSbholler	and	$0xFFFFFFFFFFFFFFF0,%r9
2192d0b3732eSbholler	add	%r9,%rcx
2193d0b3732eSbholler	add	%r9,%rdx
2194d0b3732eSbholler	sub	%r9,%r8
2195d0b3732eSbholler	shr	$0x4,%r9
2196d0b3732eSbholler	sfence
2197d0b3732eSbholler
2198d0b3732eSbholler	movslq	(%r10,%r9,4),%r11
2199d0b3732eSbholler	lea	(%r11,%r10,1),%r10
2200d0b3732eSbholler	jmpq	*%r10
2201d0b3732eSbholler
2202d0b3732eSbholler	.balign 16
2203d0b3732eSbhollerL(Fix16EndTable):
	/*
	 * Jump table used at the end of L(sse2_nt_move): entry i is the
	 * offset of L(fix16_i) from the table base, i being the number of
	 * whole 16-byte chunks still to copy.  The dispatch code
	 * sign-extends the 32-bit entry (movslq) and adds the table address.
	 */
2204d0b3732eSbholler	.int    L(fix16_0)-L(Fix16EndTable)
2205d0b3732eSbholler	.int    L(fix16_1)-L(Fix16EndTable)
2206d0b3732eSbholler	.int    L(fix16_2)-L(Fix16EndTable)
2207d0b3732eSbholler	.int    L(fix16_3)-L(Fix16EndTable)
2208d0b3732eSbholler
2209d0b3732eSbholler	.balign 16
2210d0b3732eSbhollerL(fix16_3):
	/*
	 * Fall-through ladder reached through L(Fix16EndTable).  The
	 * dispatching code has already stepped %rdx/%rcx past the whole
	 * 16-byte chunks, so each rung copies one chunk at a negative
	 * offset: unaligned load, aligned store.  Entering at L(fix16_N)
	 * copies N chunks.
	 */
2211d0b3732eSbholler	movdqu -0x30(%rdx),%xmm1
2212d0b3732eSbholler	movdqa %xmm1,-0x30(%rcx)
2213d0b3732eSbhollerL(fix16_2):
2214d0b3732eSbholler	movdqu -0x20(%rdx),%xmm2
2215d0b3732eSbholler	movdqa %xmm2,-0x20(%rcx)
2216d0b3732eSbhollerL(fix16_1):
2217d0b3732eSbholler	movdqu -0x10(%rdx),%xmm3
2218d0b3732eSbholler	movdqa %xmm3,-0x10(%rcx)
2219d0b3732eSbhollerL(fix16_0):
	/*
	 * %r8 is the sub-16-byte remainder.  Point %rdx/%rcx at the end of
	 * the data and dispatch on %r8 through L(fwdPxQx) (defined outside
	 * this part of the file; presumably one entry per residual
	 * length -- confirm against the table).
	 */
2220d0b3732eSbholler	lea    L(fwdPxQx)(%rip),%r10
2221d0b3732eSbholler	add    %r8,%rdx
2222d0b3732eSbholler	add    %r8,%rcx
2223d0b3732eSbholler
2224d0b3732eSbholler	movslq (%r10,%r8,4),%r9
2225d0b3732eSbholler	lea    (%r9,%r10,1),%r10
2226d0b3732eSbholler	jmpq   *%r10
2227d0b3732eSbholler
2228d0b3732eSbholler	.balign 16
2229d0b3732eSbhollerL(pre_both_aligned):
	/*
	 * Entry for the case where source and destination can both be
	 * accessed with movdqa (every load and store below is movdqa, so
	 * %rdx and %rcx are expected to be 16-byte aligned).  Lengths
	 * under 0x80 skip the loop and go straight to L(fix_16b).
	 */
2230d0b3732eSbholler	cmp    $0x80,%r8
2231d0b3732eSbholler	jl     L(fix_16b)
2232d0b3732eSbholler
2233d0b3732eSbholler	.balign 16
2234d0b3732eSbhollerL(both_aligned):
	/*
	 * 128 bytes per pass.  %r8 is decremented with lea, which does not
	 * write flags, and the cmp $0x80,%r8 in mid-loop feeds the jge at
	 * the bottom; the movdqa and lea instructions in between leave the
	 * flags untouched.  On exit the code falls into L(fix_16b).
	 */
2235d0b3732eSbholler
2236d0b3732eSbholler	/*
2237d0b3732eSbholler	 * this 'paired' load/load/store/store seems to do best.
2238d0b3732eSbholler	 */
2239d0b3732eSbholler	movdqa (%rdx),%xmm0
2240d0b3732eSbholler	movdqa 0x10(%rdx),%xmm1
2241d0b3732eSbholler
2242d0b3732eSbholler	movdqa %xmm0,(%rcx)
2243d0b3732eSbholler	movdqa %xmm1,0x10(%rcx)
2244d0b3732eSbholler	lea    -0x80(%r8),%r8
2245d0b3732eSbholler
2246d0b3732eSbholler	movdqa 0x20(%rdx),%xmm2
2247d0b3732eSbholler	movdqa 0x30(%rdx),%xmm3
2248d0b3732eSbholler
2249d0b3732eSbholler	movdqa %xmm2,0x20(%rcx)
2250d0b3732eSbholler	movdqa %xmm3,0x30(%rcx)
2251d0b3732eSbholler
2252d0b3732eSbholler	movdqa 0x40(%rdx),%xmm0
2253d0b3732eSbholler	movdqa 0x50(%rdx),%xmm1
2254d0b3732eSbholler	cmp    $0x80,%r8
2255d0b3732eSbholler
2256d0b3732eSbholler	movdqa %xmm0,0x40(%rcx)
2257d0b3732eSbholler	movdqa %xmm1,0x50(%rcx)
2258d0b3732eSbholler
2259d0b3732eSbholler	movdqa 0x60(%rdx),%xmm2
2260d0b3732eSbholler	movdqa 0x70(%rdx),%xmm3
2261d0b3732eSbholler	lea    0x80(%rdx),%rdx
2262d0b3732eSbholler	movdqa %xmm2,0x60(%rcx)
2263d0b3732eSbholler	movdqa %xmm3,0x70(%rcx)
2264d0b3732eSbholler	lea    0x80(%rcx),%rcx
2265d0b3732eSbholler	jge    L(both_aligned)
2266d0b3732eSbholler
2267d0b3732eSbhollerL(fix_16b):
	/*
	 * Finish a forward copy: advance both pointers by the %r8 bytes
	 * that remain and jump through L(fwdPxQx), indexed by %r8.  Table
	 * entries are read as 32-bit values, sign-extended and added to the
	 * table address.  The table itself is outside this part of the
	 * file; its targets presumably copy relative to the end pointers
	 * set up here -- confirm.
	 */
2268d0b3732eSbholler	add    %r8,%rcx
2269d0b3732eSbholler	lea    L(fwdPxQx)(%rip),%r10
2270d0b3732eSbholler	add    %r8,%rdx
2271d0b3732eSbholler
2272d0b3732eSbholler	movslq (%r10,%r8,4),%r9
2273d0b3732eSbholler	lea    (%r9,%r10,1),%r10
2274d0b3732eSbholler	jmpq   *%r10
2275d0b3732eSbholler
2276d0b3732eSbholler	.balign 16
2277d0b3732eSbhollerL(Loop8byte_pre):
	/*
	 * Pick one of three 8-byte-move strategies from the length in %r8:
	 *   - at least half of .largest_level_cache_size: non-temporal
	 *     stores, L(byte8_nt_top);
	 *   - more than 4096 bytes but no more than .amd64cache1half:
	 *     rep movsq, L(use_rep);
	 *   - anything else: the unrolled loop at L(byte8_top), which is
	 *     also the fall-through target.
	 * Both cache values are loaded as 32-bit quantities into %r9d,
	 * which zero-extends into %r9 before the 64-bit compare.
	 * NOTE(review): jge/jle treat %r8 as signed; presumably harmless
	 * for any realistic length -- confirm.
	 */
2277d0b3732eSbholler	# Use 8-byte moves
2278d0b3732eSbholler	mov    .largest_level_cache_size(%rip),%r9d
2279d0b3732eSbholler	shr    %r9		# take half of it
2280d0b3732eSbholler	cmp    %r9,%r8
2281*fad5204eSbostrovs	jge    L(byte8_nt_top)
2283d0b3732eSbholler	# Find out whether to use rep movsq
2284d0b3732eSbholler	cmp    $4096,%r8
2285d0b3732eSbholler	jle    L(byte8_top)
2286d0b3732eSbholler	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2287d0b3732eSbholler	cmp    %r9,%r8
2288d0b3732eSbholler	jle    L(use_rep)
2289d0b3732eSbholler
2290d0b3732eSbholler	.balign     16
2291d0b3732eSbhollerL(byte8_top):
	/*
	 * Unrolled copy of 64 bytes per pass using 8-byte integer moves
	 * staged through %r9/%r10/%r11.  The loop repeats while more than
	 * 0x40 bytes remain (jg) and then falls into L(byte8_end).  The
	 * flags set by cmp $0x40,%r8 survive to the jg because only mov and
	 * lea execute in between.
	 */
2292d0b3732eSbholler	mov    (%rdx),%r9
2293d0b3732eSbholler	mov    0x8(%rdx),%r10
2294d0b3732eSbholler	lea    -0x40(%r8),%r8
2295d0b3732eSbholler	mov    %r9,(%rcx)
2296d0b3732eSbholler	mov    %r10,0x8(%rcx)
2297d0b3732eSbholler	mov    0x10(%rdx),%r11
2298d0b3732eSbholler	mov    0x18(%rdx),%r9
2299d0b3732eSbholler	mov    %r11,0x10(%rcx)
2300d0b3732eSbholler	mov    %r9,0x18(%rcx)
2301d0b3732eSbholler
2302d0b3732eSbholler	cmp    $0x40,%r8
2303d0b3732eSbholler	mov    0x20(%rdx),%r10
2304d0b3732eSbholler	mov    0x28(%rdx),%r11
2305d0b3732eSbholler	mov    %r10,0x20(%rcx)
2306d0b3732eSbholler	mov    %r11,0x28(%rcx)
2307d0b3732eSbholler	mov    0x30(%rdx),%r9
2308d0b3732eSbholler	mov    0x38(%rdx),%r10
2309d0b3732eSbholler	lea    0x40(%rdx),%rdx
2310d0b3732eSbholler	mov    %r9,0x30(%rcx)
2311d0b3732eSbholler	mov    %r10,0x38(%rcx)
2312d0b3732eSbholler	lea    0x40(%rcx),%rcx
2313d0b3732eSbholler	jg     L(byte8_top)
2314d0b3732eSbholler
2315d0b3732eSbhollerL(byte8_end):
	/*
	 * Common exit for the 8-byte copy paths: move both pointers to the
	 * end of the %r8 bytes that remain and dispatch on %r8 through
	 * L(fwdPxQx).  Each table entry is a 32-bit value that is
	 * sign-extended and added to the table address to get the target.
	 */
2316d0b3732eSbholler	lea    L(fwdPxQx)(%rip),%r10
2317d0b3732eSbholler	lea    (%rdx,%r8,1),%rdx
2318d0b3732eSbholler	lea    (%rcx,%r8,1),%rcx
2319d0b3732eSbholler
2320d0b3732eSbholler	movslq (%r10,%r8,4),%r9
2321d0b3732eSbholler	lea    (%r9,%r10,1),%r10
2322d0b3732eSbholler	jmpq   *%r10
2323d0b3732eSbholler
2324d0b3732eSbholler	.balign	16
2325d0b3732eSbhollerL(use_rep):
	/*
	 * Copy with rep movsq.  The working registers are moved into the
	 * ones the string instruction uses (%rsi, %rdi, %rcx), %r8 >> 3
	 * quadwords are copied, and the advanced pointers are moved back
	 * into %rdx/%rcx.  A remainder of 1..7 bytes is finished through
	 * L(byte8_end); with no remainder the routine returns directly
	 * (%rax is not touched here, so it presumably already holds the
	 * return value).  Nothing here issues cld: the forward direction
	 * relies on the direction flag being clear, as the ABI requires on
	 * function entry.
	 */
2326d0b3732eSbholler	mov    %rdx,%rsi		# %rsi = source
2327d0b3732eSbholler	mov    %rcx,%rdi		# %rdi = destination
2328d0b3732eSbholler	mov    %r8,%rcx			# %rcx = count
2329d0b3732eSbholler	shrq   $3,%rcx			# 8-byte word count
23307c478bd9Sstevel@tonic-gate	rep
23317c478bd9Sstevel@tonic-gate	  movsq
2332d0b3732eSbholler	mov    %rsi,%rdx		# source
2333d0b3732eSbholler	mov    %rdi,%rcx		# destination
2334d0b3732eSbholler	andq   $7,%r8			# remainder
2335d0b3732eSbholler	jnz    L(byte8_end)
23367c478bd9Sstevel@tonic-gate	ret
23377c478bd9Sstevel@tonic-gate
2338d0b3732eSbholler	.balign 16
2339d0b3732eSbhollerL(byte8_nt_top):
	/*
	 * Non-temporal variant of the 8-byte loop: 64 bytes per pass, each
	 * quadword loaded with a plain mov and stored with movnti, plus a
	 * prefetchnta 0x180 bytes ahead of the source pointer.  The loop
	 * repeats while at least 0x40 bytes remain, then sfence orders the
	 * non-temporal stores before the tail is finished in L(byte8_end).
	 */
2340d0b3732eSbholler	sub    $0x40,%r8
2341d0b3732eSbholler	prefetchnta 0x180(%rdx)
2342d0b3732eSbholler	mov    (%rdx),%r9
2343d0b3732eSbholler	movnti %r9,(%rcx)
2344d0b3732eSbholler	mov    0x8(%rdx),%r10
2345d0b3732eSbholler	movnti %r10,0x8(%rcx)
2346d0b3732eSbholler	mov    0x10(%rdx),%r11
2347d0b3732eSbholler	movnti %r11,0x10(%rcx)
2348d0b3732eSbholler	mov    0x18(%rdx),%r9
2349d0b3732eSbholler	movnti %r9,0x18(%rcx)
2350d0b3732eSbholler	mov    0x20(%rdx),%r10
2351d0b3732eSbholler	movnti %r10,0x20(%rcx)
2352d0b3732eSbholler	mov    0x28(%rdx),%r11
2353d0b3732eSbholler	movnti %r11,0x28(%rcx)
2354d0b3732eSbholler	mov    0x30(%rdx),%r9
2355d0b3732eSbholler	movnti %r9,0x30(%rcx)
2356d0b3732eSbholler	mov    0x38(%rdx),%r10
2357d0b3732eSbholler	movnti %r10,0x38(%rcx)
23587c478bd9Sstevel@tonic-gate
2359d0b3732eSbholler	lea    0x40(%rdx),%rdx
2360d0b3732eSbholler	lea    0x40(%rcx),%rcx
2361d0b3732eSbholler	cmp    $0x40,%r8
2362d0b3732eSbholler	jge    L(byte8_nt_top)
2363d0b3732eSbholler	sfence
2364d0b3732eSbholler	jmp    L(byte8_end)
23657c478bd9Sstevel@tonic-gate
	/*
	 * End of memcpy.  SET_SIZE is not defined in this file; it
	 * presumably comes from <sys/asm_linkage.h> and records the
	 * symbol's size -- confirm.
	 */
2366d0b3732eSbholler	SET_SIZE(memcpy)
23677c478bd9Sstevel@tonic-gate
/*
 * L(CopyBackwards): set up a high-to-low copy.
 *
 * In:   %rdi = destination, %rsi = source, %rdx = byte count.
 * Out:  %rax = original destination (the return value),
 *       %r8  = byte count,
 *       %rcx = destination + count (one past the last byte),
 *       %rdx = source + count.
 * If the destination end is not 8-byte aligned, control goes to
 * L(bk_align); otherwise it falls into L(bk_qw_aligned).
 */
2368d0b3732eSbholler	.balign 16
2369d0b3732eSbhollerL(CopyBackwards):
2370d0b3732eSbholler	mov    %rdx,%r8
2371d0b3732eSbholler	mov    %rdi,%rcx
2372d0b3732eSbholler	mov    %rsi,%rdx
2373d0b3732eSbholler	mov    %rdi,%rax		# return value
23747c478bd9Sstevel@tonic-gate
2375d0b3732eSbholler	# ck alignment of last byte
2376d0b3732eSbholler	lea    (%rcx,%r8,1),%rcx
2377d0b3732eSbholler	test   $0x7,%rcx
	/* lea does not modify flags, so jne still sees the test result */
2378d0b3732eSbholler	lea    (%rdx,%r8,1),%rdx
2379d0b3732eSbholler	jne    L(bk_align)
23807c478bd9Sstevel@tonic-gate
/*
 * L(bk_qw_aligned): backward copy, size dispatch.
 *
 * In:   %rcx / %rdx = one past the end of destination / source,
 *       %r8 = bytes remaining.
 * Counts above 0x90 go to L(bk_ck_sse2_alignment).  Otherwise both
 * pointers are moved back by %r8 to the start of the remaining block
 * and the whole remainder is copied by the L(bkPxQx) entry selected by
 * %r8.
 * Also reached from L(bk_align) when the count is 8 or less, in which
 * case the destination end need not be aligned.
 * NOTE(review): the size test is a signed compare (jg).
 */
2381d0b3732eSbhollerL(bk_qw_aligned):
2382d0b3732eSbholler	lea    L(bkPxQx)(%rip),%r10
23837c478bd9Sstevel@tonic-gate
2384d0b3732eSbholler	cmp    $0x90,%r8		# 144
2385d0b3732eSbholler	jg     L(bk_ck_sse2_alignment)
23867c478bd9Sstevel@tonic-gate
2387d0b3732eSbholler	sub    %r8,%rcx
23887c478bd9Sstevel@tonic-gate	sub    %r8,%rdx
23897c478bd9Sstevel@tonic-gate
	/* target = table address + sign-extended 32-bit entry [%r8] */
2390d0b3732eSbholler	movslq (%r10,%r8,4),%r9
2391d0b3732eSbholler	lea    (%r9,%r10,1),%r10
2392d0b3732eSbholler	jmpq   *%r10
23937c478bd9Sstevel@tonic-gate
/*
 * L(bk_align): bring the destination end pointer (%rcx) down to an
 * 8-byte boundary for a backward copy.
 *
 * Bits 0, 1 and 2 of %rcx are tested in turn; for each bit that is
 * set, 1, 2 or 4 bytes are copied from just below the source end to
 * just below the destination end, and %rcx, %rdx and %r8 are reduced
 * by that amount.  Control then continues at L(bk_qw_aligned).
 * Counts of 8 or less skip the alignment entirely.
 */
2394d0b3732eSbholler	.balign 16
2395d0b3732eSbhollerL(bk_align):
2396d0b3732eSbholler	# only align if len > 8
2397d0b3732eSbholler	cmp    $8,%r8
2398d0b3732eSbholler	jle    L(bk_qw_aligned)
2399d0b3732eSbholler	test   $0x1,%rcx
2400d0b3732eSbholler	je     L(bk_tst2)
	/* odd end address: move one byte */
24017c478bd9Sstevel@tonic-gate	dec    %rcx
2402d0b3732eSbholler	dec    %rdx
2403d0b3732eSbholler	dec    %r8
2404d0b3732eSbholler	mov    (%rdx),%r9b
2405d0b3732eSbholler	mov    %r9b,(%rcx)
24067c478bd9Sstevel@tonic-gate
2407d0b3732eSbhollerL(bk_tst2):
2408d0b3732eSbholler	test   $0x2,%rcx
2409d0b3732eSbholler	je     L(bk_tst3)
24107c478bd9Sstevel@tonic-gate
	/* bit 1 set: move two bytes */
2411d0b3732eSbhollerL(bk_got2):
2412d0b3732eSbholler	sub    $0x2,%rcx
2413d0b3732eSbholler	sub    $0x2,%rdx
2414d0b3732eSbholler	sub    $0x2,%r8
2415d0b3732eSbholler	movzwq (%rdx),%r9
2416d0b3732eSbholler	mov    %r9w,(%rcx)
24177c478bd9Sstevel@tonic-gate
2418d0b3732eSbhollerL(bk_tst3):
2419d0b3732eSbholler	test   $0x4,%rcx
2420d0b3732eSbholler	je     L(bk_qw_aligned)
24217c478bd9Sstevel@tonic-gate
	/* bit 2 set: move four bytes */
2422d0b3732eSbhollerL(bk_got3):
2423d0b3732eSbholler	sub    $0x4,%rcx
2424d0b3732eSbholler	sub    $0x4,%rdx
2425d0b3732eSbholler	sub    $0x4,%r8
2426d0b3732eSbholler	mov    (%rdx),%r9d
2427d0b3732eSbholler	mov    %r9d,(%rcx)
2428d0b3732eSbholler	jmp    L(bk_qw_aligned)
24297c478bd9Sstevel@tonic-gate
/*
 * L(bk_ck_sse2_alignment): choose the bulk method for a backward copy
 * of more than 0x90 bytes.
 *
 * If .memops_method equals NO_SSE (both defined outside this excerpt)
 * the copy is done by L(bk_use_rep).  Otherwise the destination end
 * (%rcx) is tested for 16-byte alignment.  When it is not aligned,
 * L(bk_sse2_align) copies one quadword downward and reduces %rcx,
 * %rdx and %r8 by 8.  Either way execution continues in
 * L(bk_sse2_cpy); the explicit jmp is commented out, so the aligning
 * path reaches it by falling through.
 */
2430d0b3732eSbholler	.balign 16
2431d0b3732eSbhollerL(bk_ck_sse2_alignment):
2432d0b3732eSbholler	cmpl   $NO_SSE,.memops_method(%rip)
2433d0b3732eSbholler	je     L(bk_use_rep)
2434d0b3732eSbholler	# check alignment of last byte
2435d0b3732eSbholler	test   $0xf,%rcx
2436d0b3732eSbholler	jz     L(bk_sse2_cpy)
24377c478bd9Sstevel@tonic-gate
2438d0b3732eSbhollerL(bk_sse2_align):
2439d0b3732eSbholler	# only here if already aligned on at least a qword bndry
2440d0b3732eSbholler	sub    $0x8,%rcx
2441d0b3732eSbholler	sub    $0x8,%rdx
2442d0b3732eSbholler	sub    $0x8,%r8
2443d0b3732eSbholler	mov    (%rdx),%r9
2444d0b3732eSbholler	mov    %r9,(%rcx)
2445d0b3732eSbholler	#jmp   L(bk_sse2_cpy)
24467c478bd9Sstevel@tonic-gate
/*
 * L(bk_sse2_cpy): backward bulk copy, 0x80 bytes per pass.
 *
 * In:   %rcx / %rdx = one past the end of the destination / source
 *       bytes still to copy, %r8 = bytes remaining.
 * Each pass lowers both pointers by 0x80 and copies eight 16-byte
 * chunks from offset 0x70 down to 0, using unaligned loads (movdqu)
 * and aligned stores (movdqa).  The sub and cmp on %r8 are placed
 * between the SSE moves; those moves do not modify flags, so the jge
 * at the bottom tests "cmp $0x80,%r8".
 *
 * L(bk_sse2_cpy_end): once fewer than 0x80 bytes remain, both pointers
 * are moved back by %r8 to the start of the block and the L(bkPxQx)
 * entry for %r8 copies the rest.
 *
 * Uses %xmm0-%xmm3, %r9 and %r10 as scratch.
 */
2447d0b3732eSbholler	.balign 16
2448d0b3732eSbhollerL(bk_sse2_cpy):
2449d0b3732eSbholler	sub    $0x80,%rcx		# 128
2450d0b3732eSbholler	sub    $0x80,%rdx
2451d0b3732eSbholler	movdqu 0x70(%rdx),%xmm3
2452d0b3732eSbholler	movdqu 0x60(%rdx),%xmm2
2453d0b3732eSbholler	movdqa %xmm3,0x70(%rcx)
2454d0b3732eSbholler	movdqa %xmm2,0x60(%rcx)
2455d0b3732eSbholler	sub    $0x80,%r8
2456d0b3732eSbholler	movdqu 0x50(%rdx),%xmm1
2457d0b3732eSbholler	movdqu 0x40(%rdx),%xmm0
2458d0b3732eSbholler	movdqa %xmm1,0x50(%rcx)
2459d0b3732eSbholler	movdqa %xmm0,0x40(%rcx)
24607c478bd9Sstevel@tonic-gate
2461d0b3732eSbholler	cmp    $0x80,%r8
2462d0b3732eSbholler	movdqu 0x30(%rdx),%xmm3
2463d0b3732eSbholler	movdqu 0x20(%rdx),%xmm2
2464d0b3732eSbholler	movdqa %xmm3,0x30(%rcx)
2465d0b3732eSbholler	movdqa %xmm2,0x20(%rcx)
2466d0b3732eSbholler	movdqu 0x10(%rdx),%xmm1
2467d0b3732eSbholler	movdqu (%rdx),%xmm0
2468d0b3732eSbholler	movdqa %xmm1,0x10(%rcx)
2469d0b3732eSbholler	movdqa %xmm0,(%rcx)
2470d0b3732eSbholler	jge    L(bk_sse2_cpy)
24717c478bd9Sstevel@tonic-gate
2472d0b3732eSbhollerL(bk_sse2_cpy_end):
2473d0b3732eSbholler	lea    L(bkPxQx)(%rip),%r10
2474d0b3732eSbholler	sub    %r8,%rdx
2475d0b3732eSbholler	sub    %r8,%rcx
2476d0b3732eSbholler	movslq (%r10,%r8,4),%r9
2477d0b3732eSbholler	lea    (%r9,%r10,1),%r10
2478d0b3732eSbholler	jmpq   *%r10
24797c478bd9Sstevel@tonic-gate
/*
 * L(bk_use_rep): backward copy with "std; rep movsq".
 *
 * In:   %rcx / %rdx = one past the end of destination / source,
 *       %r8 = byte count.
 * The destination end is parked in %r9 while %rcx serves as the rep
 * counter.  %rsi / %rdi start at the last quadword (end - 8) and
 * %r8 / 8 quadwords are copied downward with the direction flag set;
 * the flag is cleared again right after the copy.
 *
 * rep leaves %rcx at zero, so the second xchg puts the destination end
 * back in %rcx.  Both pointers are then moved back by the full count
 * to the start of the block, where the uncopied %r8 & 7 bytes live,
 * and that remainder is copied through L(bkPxQx).  A zero remainder
 * returns directly.
 * Overwrites %rsi, %rdi, %r9 and %r10.
 */
2480d0b3732eSbholler	.balign 16
2481d0b3732eSbhollerL(bk_use_rep):
2482d0b3732eSbholler	xchg   %rcx,%r9
2483d0b3732eSbholler	mov    %rdx,%rsi		# source
2484d0b3732eSbholler	mov    %r9,%rdi			# destination
2485d0b3732eSbholler	mov    %r8,%rcx			# count
2486d0b3732eSbholler	sub    $8,%rsi
2487d0b3732eSbholler	sub    $8,%rdi
2488d0b3732eSbholler	shr    $3,%rcx
2489d0b3732eSbholler	std				# reverse direction
2490d0b3732eSbholler	rep
2491d0b3732eSbholler	  movsq
2492d0b3732eSbholler	cld				# reset direction flag
2493d0b3732eSbholler
2494d0b3732eSbholler	xchg   %rcx,%r9
2495d0b3732eSbholler	lea    L(bkPxQx)(%rip),%r10
2496d0b3732eSbholler	sub    %r8,%rdx
2497d0b3732eSbholler	sub    %r8,%rcx
2498d0b3732eSbholler	andq   $7,%r8			# remainder
	/* ZF from the andq above: nothing left when the count was 8*n */
2499d0b3732eSbholler	jz     2f
2500d0b3732eSbholler	movslq (%r10,%r8,4),%r9
2501d0b3732eSbholler	lea    (%r9,%r10,1),%r10
2502d0b3732eSbholler	jmpq   *%r10
2503d0b3732eSbholler2:
25047c478bd9Sstevel@tonic-gate	ret
25057c478bd9Sstevel@tonic-gate
/*
 * L(bkP0QI) .. L(bkP0Q0): fall-through tail for residual counts that
 * are an exact multiple of 8.
 *
 * Entering at L(bkP0Q<q>) copies q quadwords (q is written 0-9, then
 * A-I for 10-18) from offsets 8*(q-1) down to 0, highest offset first,
 * and returns.  %rdx / %rcx are expected to address the lowest source
 * / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2506d0b3732eSbholler	.balign 16
2507d0b3732eSbhollerL(bkP0QI):
2508d0b3732eSbholler	mov    0x88(%rdx),%r10
2509d0b3732eSbholler	mov    %r10,0x88(%rcx)
2510d0b3732eSbhollerL(bkP0QH):
2511d0b3732eSbholler	mov    0x80(%rdx),%r10
2512d0b3732eSbholler	mov    %r10,0x80(%rcx)
2513d0b3732eSbhollerL(bkP0QG):
2514d0b3732eSbholler	mov    0x78(%rdx),%r9
2515d0b3732eSbholler	mov    %r9,0x78(%rcx)
2516d0b3732eSbhollerL(bkP0QF):
2517d0b3732eSbholler	mov    0x70(%rdx),%r11
2518d0b3732eSbholler	mov    %r11,0x70(%rcx)
2519d0b3732eSbhollerL(bkP0QE):
2520d0b3732eSbholler	mov    0x68(%rdx),%r10
2521d0b3732eSbholler	mov    %r10,0x68(%rcx)
2522d0b3732eSbhollerL(bkP0QD):
2523d0b3732eSbholler	mov    0x60(%rdx),%r9
2524d0b3732eSbholler	mov    %r9,0x60(%rcx)
2525d0b3732eSbhollerL(bkP0QC):
2526d0b3732eSbholler	mov    0x58(%rdx),%r11
2527d0b3732eSbholler	mov    %r11,0x58(%rcx)
2528d0b3732eSbhollerL(bkP0QB):
2529d0b3732eSbholler	mov    0x50(%rdx),%r10
2530d0b3732eSbholler	mov    %r10,0x50(%rcx)
2531d0b3732eSbhollerL(bkP0QA):
2532d0b3732eSbholler	mov    0x48(%rdx),%r9
2533d0b3732eSbholler	mov    %r9,0x48(%rcx)
2534d0b3732eSbhollerL(bkP0Q9):
2535d0b3732eSbholler	mov    0x40(%rdx),%r11
2536d0b3732eSbholler	mov    %r11,0x40(%rcx)
2537d0b3732eSbhollerL(bkP0Q8):
2538d0b3732eSbholler	mov    0x38(%rdx),%r10
2539d0b3732eSbholler	mov    %r10,0x38(%rcx)
2540d0b3732eSbhollerL(bkP0Q7):
2541d0b3732eSbholler	mov    0x30(%rdx),%r9
2542d0b3732eSbholler	mov    %r9,0x30(%rcx)
2543d0b3732eSbhollerL(bkP0Q6):
2544d0b3732eSbholler	mov    0x28(%rdx),%r11
2545d0b3732eSbholler	mov    %r11,0x28(%rcx)
2546d0b3732eSbhollerL(bkP0Q5):
2547d0b3732eSbholler	mov    0x20(%rdx),%r10
2548d0b3732eSbholler	mov    %r10,0x20(%rcx)
2549d0b3732eSbhollerL(bkP0Q4):
2550d0b3732eSbholler	mov    0x18(%rdx),%r9
2551d0b3732eSbholler	mov    %r9,0x18(%rcx)
2552d0b3732eSbhollerL(bkP0Q3):
2553d0b3732eSbholler	mov    0x10(%rdx),%r11
2554d0b3732eSbholler	mov    %r11,0x10(%rcx)
2555d0b3732eSbhollerL(bkP0Q2):
2556d0b3732eSbholler	mov    0x8(%rdx),%r10
2557d0b3732eSbholler	mov    %r10,0x8(%rcx)
2558d0b3732eSbhollerL(bkP0Q1):
2559d0b3732eSbholler	mov    (%rdx),%r9
2560d0b3732eSbholler	mov    %r9,(%rcx)
2561d0b3732eSbhollerL(bkP0Q0):
2562d0b3732eSbholler	ret
25637c478bd9Sstevel@tonic-gate
/*
 * L(bkP1QI) .. L(bkP1Q0): fall-through tail for residual counts of the
 * form 8*q + 1.
 *
 * Entering at L(bkP1Q<q>) copies q quadwords from offsets 8*q - 7 down
 * to 1, highest offset first; L(bkP1Q0) then copies the single byte at
 * offset 0 and returns.  %rdx / %rcx are expected to address the
 * lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2564d0b3732eSbholler	.balign 16
2565d0b3732eSbhollerL(bkP1QI):
2566d0b3732eSbholler	mov    0x89(%rdx),%r10
2567d0b3732eSbholler	mov    %r10,0x89(%rcx)
2568d0b3732eSbhollerL(bkP1QH):
2569d0b3732eSbholler	mov    0x81(%rdx),%r11
2570d0b3732eSbholler	mov    %r11,0x81(%rcx)
2571d0b3732eSbhollerL(bkP1QG):
2572d0b3732eSbholler	mov    0x79(%rdx),%r10
2573d0b3732eSbholler	mov    %r10,0x79(%rcx)
2574d0b3732eSbhollerL(bkP1QF):
2575d0b3732eSbholler	mov    0x71(%rdx),%r9
2576d0b3732eSbholler	mov    %r9,0x71(%rcx)
2577d0b3732eSbhollerL(bkP1QE):
2578d0b3732eSbholler	mov    0x69(%rdx),%r11
2579d0b3732eSbholler	mov    %r11,0x69(%rcx)
2580d0b3732eSbhollerL(bkP1QD):
2581d0b3732eSbholler	mov    0x61(%rdx),%r10
2582d0b3732eSbholler	mov    %r10,0x61(%rcx)
2583d0b3732eSbhollerL(bkP1QC):
2584d0b3732eSbholler	mov    0x59(%rdx),%r9
2585d0b3732eSbholler	mov    %r9,0x59(%rcx)
2586d0b3732eSbhollerL(bkP1QB):
2587d0b3732eSbholler	mov    0x51(%rdx),%r11
2588d0b3732eSbholler	mov    %r11,0x51(%rcx)
2589d0b3732eSbhollerL(bkP1QA):
2590d0b3732eSbholler	mov    0x49(%rdx),%r10
2591d0b3732eSbholler	mov    %r10,0x49(%rcx)
2592d0b3732eSbhollerL(bkP1Q9):
2593d0b3732eSbholler	mov    0x41(%rdx),%r9
2594d0b3732eSbholler	mov    %r9,0x41(%rcx)
2595d0b3732eSbhollerL(bkP1Q8):
2596d0b3732eSbholler	mov    0x39(%rdx),%r11
2597d0b3732eSbholler	mov    %r11,0x39(%rcx)
2598d0b3732eSbhollerL(bkP1Q7):
2599d0b3732eSbholler	mov    0x31(%rdx),%r10
2600d0b3732eSbholler	mov    %r10,0x31(%rcx)
2601d0b3732eSbhollerL(bkP1Q6):
2602d0b3732eSbholler	mov    0x29(%rdx),%r9
2603d0b3732eSbholler	mov    %r9,0x29(%rcx)
2604d0b3732eSbhollerL(bkP1Q5):
2605d0b3732eSbholler	mov    0x21(%rdx),%r11
2606d0b3732eSbholler	mov    %r11,0x21(%rcx)
2607d0b3732eSbhollerL(bkP1Q4):
2608d0b3732eSbholler	mov    0x19(%rdx),%r10
2609d0b3732eSbholler	mov    %r10,0x19(%rcx)
2610d0b3732eSbhollerL(bkP1Q3):
2611d0b3732eSbholler	mov    0x11(%rdx),%r9
2612d0b3732eSbholler	mov    %r9,0x11(%rcx)
2613d0b3732eSbhollerL(bkP1Q2):
2614d0b3732eSbholler	mov    0x9(%rdx),%r11
2615d0b3732eSbholler	mov    %r11,0x9(%rcx)
2616d0b3732eSbhollerL(bkP1Q1):
2617d0b3732eSbholler	mov    0x1(%rdx),%r10
2618d0b3732eSbholler	mov    %r10,0x1(%rcx)
2619d0b3732eSbhollerL(bkP1Q0):
2620d0b3732eSbholler	mov    (%rdx),%r9b
2621d0b3732eSbholler	mov    %r9b,(%rcx)
2622d0b3732eSbholler	ret
2623d0b3732eSbholler
/*
 * L(bkP2QI) .. L(bkP2Q0): fall-through tail for residual counts of the
 * form 8*q + 2.
 *
 * Entering at L(bkP2Q<q>) copies q quadwords from offsets 8*q - 6 down
 * to 2, highest offset first; L(bkP2Q0) then copies the 16-bit word at
 * offset 0 and returns.  %rdx / %rcx are expected to address the
 * lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2624d0b3732eSbholler	.balign 16
2625d0b3732eSbhollerL(bkP2QI):
2626d0b3732eSbholler	mov    0x8a(%rdx),%r10
2627d0b3732eSbholler	mov    %r10,0x8a(%rcx)
2628d0b3732eSbhollerL(bkP2QH):
2629d0b3732eSbholler	mov    0x82(%rdx),%r11
2630d0b3732eSbholler	mov    %r11,0x82(%rcx)
2631d0b3732eSbhollerL(bkP2QG):
2632d0b3732eSbholler	mov    0x7a(%rdx),%r10
2633d0b3732eSbholler	mov    %r10,0x7a(%rcx)
2634d0b3732eSbhollerL(bkP2QF):
2635d0b3732eSbholler	mov    0x72(%rdx),%r9
2636d0b3732eSbholler	mov    %r9,0x72(%rcx)
2637d0b3732eSbhollerL(bkP2QE):
2638d0b3732eSbholler	mov    0x6a(%rdx),%r11
2639d0b3732eSbholler	mov    %r11,0x6a(%rcx)
2640d0b3732eSbhollerL(bkP2QD):
2641d0b3732eSbholler	mov    0x62(%rdx),%r10
2642d0b3732eSbholler	mov    %r10,0x62(%rcx)
2643d0b3732eSbhollerL(bkP2QC):
2644d0b3732eSbholler	mov    0x5a(%rdx),%r9
2645d0b3732eSbholler	mov    %r9,0x5a(%rcx)
2646d0b3732eSbhollerL(bkP2QB):
2647d0b3732eSbholler	mov    0x52(%rdx),%r11
2648d0b3732eSbholler	mov    %r11,0x52(%rcx)
2649d0b3732eSbhollerL(bkP2QA):
2650d0b3732eSbholler	mov    0x4a(%rdx),%r10
2651d0b3732eSbholler	mov    %r10,0x4a(%rcx)
2652d0b3732eSbhollerL(bkP2Q9):
2653d0b3732eSbholler	mov    0x42(%rdx),%r9
2654d0b3732eSbholler	mov    %r9,0x42(%rcx)
2655d0b3732eSbhollerL(bkP2Q8):
2656d0b3732eSbholler	mov    0x3a(%rdx),%r11
2657d0b3732eSbholler	mov    %r11,0x3a(%rcx)
2658d0b3732eSbhollerL(bkP2Q7):
2659d0b3732eSbholler	mov    0x32(%rdx),%r10
2660d0b3732eSbholler	mov    %r10,0x32(%rcx)
2661d0b3732eSbhollerL(bkP2Q6):
2662d0b3732eSbholler	mov    0x2a(%rdx),%r9
2663d0b3732eSbholler	mov    %r9,0x2a(%rcx)
2664d0b3732eSbhollerL(bkP2Q5):
2665d0b3732eSbholler	mov    0x22(%rdx),%r11
2666d0b3732eSbholler	mov    %r11,0x22(%rcx)
2667d0b3732eSbhollerL(bkP2Q4):
2668d0b3732eSbholler	mov    0x1a(%rdx),%r10
2669d0b3732eSbholler	mov    %r10,0x1a(%rcx)
2670d0b3732eSbhollerL(bkP2Q3):
2671d0b3732eSbholler	mov    0x12(%rdx),%r9
2672d0b3732eSbholler	mov    %r9,0x12(%rcx)
2673d0b3732eSbhollerL(bkP2Q2):
2674d0b3732eSbholler	mov    0xa(%rdx),%r11
2675d0b3732eSbholler	mov    %r11,0xa(%rcx)
2676d0b3732eSbhollerL(bkP2Q1):
2677d0b3732eSbholler	mov    0x2(%rdx),%r10
2678d0b3732eSbholler	mov    %r10,0x2(%rcx)
2679d0b3732eSbhollerL(bkP2Q0):
2680d0b3732eSbholler	mov    (%rdx),%r9w
2681d0b3732eSbholler	mov    %r9w,(%rcx)
2682d0b3732eSbholler	ret
2683d0b3732eSbholler
/*
 * L(bkP3QI) .. L(bkP3Q1): fall-through tail for residual counts of the
 * form 8*q + 3.
 *
 * Entering at L(bkP3Q<q>) copies q quadwords from offsets 8*q - 5 down
 * to 3, highest offset first, then falls into L(bkP3Q0), which copies
 * the last 3 bytes and returns.  %rdx / %rcx are expected to address
 * the lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2684d0b3732eSbholler	.balign 16
2685d0b3732eSbhollerL(bkP3QI):
2686d0b3732eSbholler	mov    0x8b(%rdx),%r10
2687d0b3732eSbholler	mov    %r10,0x8b(%rcx)
2688d0b3732eSbhollerL(bkP3QH):
2689d0b3732eSbholler	mov    0x83(%rdx),%r11
2690d0b3732eSbholler	mov    %r11,0x83(%rcx)
2691d0b3732eSbhollerL(bkP3QG):
2692d0b3732eSbholler	mov    0x7b(%rdx),%r10
2693d0b3732eSbholler	mov    %r10,0x7b(%rcx)
2694d0b3732eSbhollerL(bkP3QF):
2695d0b3732eSbholler	mov    0x73(%rdx),%r9
2696d0b3732eSbholler	mov    %r9,0x73(%rcx)
2697d0b3732eSbhollerL(bkP3QE):
2698d0b3732eSbholler	mov    0x6b(%rdx),%r11
2699d0b3732eSbholler	mov    %r11,0x6b(%rcx)
2700d0b3732eSbhollerL(bkP3QD):
2701d0b3732eSbholler	mov    0x63(%rdx),%r10
2702d0b3732eSbholler	mov    %r10,0x63(%rcx)
2703d0b3732eSbhollerL(bkP3QC):
2704d0b3732eSbholler	mov    0x5b(%rdx),%r9
2705d0b3732eSbholler	mov    %r9,0x5b(%rcx)
2706d0b3732eSbhollerL(bkP3QB):
2707d0b3732eSbholler	mov    0x53(%rdx),%r11
2708d0b3732eSbholler	mov    %r11,0x53(%rcx)
2709d0b3732eSbhollerL(bkP3QA):
2710d0b3732eSbholler	mov    0x4b(%rdx),%r10
2711d0b3732eSbholler	mov    %r10,0x4b(%rcx)
2712d0b3732eSbhollerL(bkP3Q9):
2713d0b3732eSbholler	mov    0x43(%rdx),%r9
2714d0b3732eSbholler	mov    %r9,0x43(%rcx)
2715d0b3732eSbhollerL(bkP3Q8):
2716d0b3732eSbholler	mov    0x3b(%rdx),%r11
2717d0b3732eSbholler	mov    %r11,0x3b(%rcx)
2718d0b3732eSbhollerL(bkP3Q7):
2719d0b3732eSbholler	mov    0x33(%rdx),%r10
2720d0b3732eSbholler	mov    %r10,0x33(%rcx)
2721d0b3732eSbhollerL(bkP3Q6):
2722d0b3732eSbholler	mov    0x2b(%rdx),%r9
2723d0b3732eSbholler	mov    %r9,0x2b(%rcx)
2724d0b3732eSbhollerL(bkP3Q5):
2725d0b3732eSbholler	mov    0x23(%rdx),%r11
2726d0b3732eSbholler	mov    %r11,0x23(%rcx)
2727d0b3732eSbhollerL(bkP3Q4):
2728d0b3732eSbholler	mov    0x1b(%rdx),%r10
2729d0b3732eSbholler	mov    %r10,0x1b(%rcx)
2730d0b3732eSbhollerL(bkP3Q3):
2731d0b3732eSbholler	mov    0x13(%rdx),%r9
2732d0b3732eSbholler	mov    %r9,0x13(%rcx)
2733d0b3732eSbhollerL(bkP3Q2):
2734d0b3732eSbholler	mov    0xb(%rdx),%r11
2735d0b3732eSbholler	mov    %r11,0xb(%rcx)
2736d0b3732eSbhollerL(bkP3Q1):
2737d0b3732eSbholler	mov    0x3(%rdx),%r10
2738d0b3732eSbholler	mov    %r10,0x3(%rcx)
/*
 * L(bkP3Q0): copy the final 3 bytes at (%rdx) to (%rcx) and return.
 *
 * Reached by falling through from L(bkP3Q1) or directly through the
 * L(bkPxQx) table.  Both loads are issued before either store, so the
 * bytes read cannot have been modified by this tail's own stores,
 * however the source and destination tails overlap.
 * Scratch: %r9, %r10.
 */
L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x1(%rdx),%r9w		# load bytes 1-2
	mov    (%rdx),%r10b		# load byte 0
	mov    %r9w,0x1(%rcx)
	mov    %r10b,(%rcx)
	ret
2745d0b3732eSbholler
/*
 * L(bkP4QI) .. L(bkP4Q0): fall-through tail for residual counts of the
 * form 8*q + 4.
 *
 * Entering at L(bkP4Q<q>) copies q quadwords from offsets 8*q - 4 down
 * to 4, highest offset first; L(bkP4Q0) then copies the 32-bit word at
 * offset 0 and returns.  %rdx / %rcx are expected to address the
 * lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2746d0b3732eSbholler	.balign 16
2747d0b3732eSbhollerL(bkP4QI):
2748d0b3732eSbholler	mov    0x8c(%rdx),%r10
2749d0b3732eSbholler	mov    %r10,0x8c(%rcx)
2750d0b3732eSbhollerL(bkP4QH):
2751d0b3732eSbholler	mov    0x84(%rdx),%r11
2752d0b3732eSbholler	mov    %r11,0x84(%rcx)
2753d0b3732eSbhollerL(bkP4QG):
2754d0b3732eSbholler	mov    0x7c(%rdx),%r10
2755d0b3732eSbholler	mov    %r10,0x7c(%rcx)
2756d0b3732eSbhollerL(bkP4QF):
2757d0b3732eSbholler	mov    0x74(%rdx),%r9
2758d0b3732eSbholler	mov    %r9,0x74(%rcx)
2759d0b3732eSbhollerL(bkP4QE):
2760d0b3732eSbholler	mov    0x6c(%rdx),%r11
2761d0b3732eSbholler	mov    %r11,0x6c(%rcx)
2762d0b3732eSbhollerL(bkP4QD):
2763d0b3732eSbholler	mov    0x64(%rdx),%r10
2764d0b3732eSbholler	mov    %r10,0x64(%rcx)
2765d0b3732eSbhollerL(bkP4QC):
2766d0b3732eSbholler	mov    0x5c(%rdx),%r9
2767d0b3732eSbholler	mov    %r9,0x5c(%rcx)
2768d0b3732eSbhollerL(bkP4QB):
2769d0b3732eSbholler	mov    0x54(%rdx),%r11
2770d0b3732eSbholler	mov    %r11,0x54(%rcx)
2771d0b3732eSbhollerL(bkP4QA):
2772d0b3732eSbholler	mov    0x4c(%rdx),%r10
2773d0b3732eSbholler	mov    %r10,0x4c(%rcx)
2774d0b3732eSbhollerL(bkP4Q9):
2775d0b3732eSbholler	mov    0x44(%rdx),%r9
2776d0b3732eSbholler	mov    %r9,0x44(%rcx)
2777d0b3732eSbhollerL(bkP4Q8):
2778d0b3732eSbholler	mov    0x3c(%rdx),%r11
2779d0b3732eSbholler	mov    %r11,0x3c(%rcx)
2780d0b3732eSbhollerL(bkP4Q7):
2781d0b3732eSbholler	mov    0x34(%rdx),%r10
2782d0b3732eSbholler	mov    %r10,0x34(%rcx)
2783d0b3732eSbhollerL(bkP4Q6):
2784d0b3732eSbholler	mov    0x2c(%rdx),%r9
2785d0b3732eSbholler	mov    %r9,0x2c(%rcx)
2786d0b3732eSbhollerL(bkP4Q5):
2787d0b3732eSbholler	mov    0x24(%rdx),%r11
2788d0b3732eSbholler	mov    %r11,0x24(%rcx)
2789d0b3732eSbhollerL(bkP4Q4):
2790d0b3732eSbholler	mov    0x1c(%rdx),%r10
2791d0b3732eSbholler	mov    %r10,0x1c(%rcx)
2792d0b3732eSbhollerL(bkP4Q3):
2793d0b3732eSbholler	mov    0x14(%rdx),%r9
2794d0b3732eSbholler	mov    %r9,0x14(%rcx)
2795d0b3732eSbhollerL(bkP4Q2):
2796d0b3732eSbholler	mov    0xc(%rdx),%r11
2797d0b3732eSbholler	mov    %r11,0xc(%rcx)
2798d0b3732eSbhollerL(bkP4Q1):
2799d0b3732eSbholler	mov    0x4(%rdx),%r10
2800d0b3732eSbholler	mov    %r10,0x4(%rcx)
2801d0b3732eSbhollerL(bkP4Q0):
2802d0b3732eSbholler	mov    (%rdx),%r9d
2803d0b3732eSbholler	mov    %r9d,(%rcx)
2804d0b3732eSbholler	ret
2805d0b3732eSbholler
/*
 * L(bkP5QI) .. L(bkP5Q1): fall-through tail for residual counts of the
 * form 8*q + 5.
 *
 * Entering at L(bkP5Q<q>) copies q quadwords from offsets 8*q - 3 down
 * to 5, highest offset first, then falls into L(bkP5Q0), which copies
 * the last 5 bytes and returns.  %rdx / %rcx are expected to address
 * the lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2806d0b3732eSbholler	.balign 16
2807d0b3732eSbhollerL(bkP5QI):
2808d0b3732eSbholler	mov    0x8d(%rdx),%r10
2809d0b3732eSbholler	mov    %r10,0x8d(%rcx)
2810d0b3732eSbhollerL(bkP5QH):
2811d0b3732eSbholler	mov    0x85(%rdx),%r9
2812d0b3732eSbholler	mov    %r9,0x85(%rcx)
2813d0b3732eSbhollerL(bkP5QG):
2814d0b3732eSbholler	mov    0x7d(%rdx),%r11
2815d0b3732eSbholler	mov    %r11,0x7d(%rcx)
2816d0b3732eSbhollerL(bkP5QF):
2817d0b3732eSbholler	mov    0x75(%rdx),%r10
2818d0b3732eSbholler	mov    %r10,0x75(%rcx)
2819d0b3732eSbhollerL(bkP5QE):
2820d0b3732eSbholler	mov    0x6d(%rdx),%r9
2821d0b3732eSbholler	mov    %r9,0x6d(%rcx)
2822d0b3732eSbhollerL(bkP5QD):
2823d0b3732eSbholler	mov    0x65(%rdx),%r11
2824d0b3732eSbholler	mov    %r11,0x65(%rcx)
2825d0b3732eSbhollerL(bkP5QC):
2826d0b3732eSbholler	mov    0x5d(%rdx),%r10
2827d0b3732eSbholler	mov    %r10,0x5d(%rcx)
2828d0b3732eSbhollerL(bkP5QB):
2829d0b3732eSbholler	mov    0x55(%rdx),%r9
2830d0b3732eSbholler	mov    %r9,0x55(%rcx)
2831d0b3732eSbhollerL(bkP5QA):
2832d0b3732eSbholler	mov    0x4d(%rdx),%r11
2833d0b3732eSbholler	mov    %r11,0x4d(%rcx)
2834d0b3732eSbhollerL(bkP5Q9):
2835d0b3732eSbholler	mov    0x45(%rdx),%r10
2836d0b3732eSbholler	mov    %r10,0x45(%rcx)
2837d0b3732eSbhollerL(bkP5Q8):
2838d0b3732eSbholler	mov    0x3d(%rdx),%r9
2839d0b3732eSbholler	mov    %r9,0x3d(%rcx)
2840d0b3732eSbhollerL(bkP5Q7):
2841d0b3732eSbholler	mov    0x35(%rdx),%r11
2842d0b3732eSbholler	mov    %r11,0x35(%rcx)
2843d0b3732eSbhollerL(bkP5Q6):
2844d0b3732eSbholler	mov    0x2d(%rdx),%r10
2845d0b3732eSbholler	mov    %r10,0x2d(%rcx)
2846d0b3732eSbhollerL(bkP5Q5):
2847d0b3732eSbholler	mov    0x25(%rdx),%r9
2848d0b3732eSbholler	mov    %r9,0x25(%rcx)
2849d0b3732eSbhollerL(bkP5Q4):
2850d0b3732eSbholler	mov    0x1d(%rdx),%r11
2851d0b3732eSbholler	mov    %r11,0x1d(%rcx)
2852d0b3732eSbhollerL(bkP5Q3):
2853d0b3732eSbholler	mov    0x15(%rdx),%r10
2854d0b3732eSbholler	mov    %r10,0x15(%rcx)
2855d0b3732eSbhollerL(bkP5Q2):
2856d0b3732eSbholler	mov    0xd(%rdx),%r9
2857d0b3732eSbholler	mov    %r9,0xd(%rcx)
2858d0b3732eSbhollerL(bkP5Q1):
2859d0b3732eSbholler	mov    0x5(%rdx),%r11
2860d0b3732eSbholler	mov    %r11,0x5(%rcx)
/*
 * L(bkP5Q0): copy the final 5 bytes at (%rdx) to (%rcx) and return.
 *
 * Reached by falling through from L(bkP5Q1) or directly through the
 * L(bkPxQx) table.  Both loads are issued before either store, so the
 * bytes read cannot have been modified by this tail's own stores,
 * however the source and destination tails overlap.
 * Scratch: %r9, %r10.
 */
L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x1(%rdx),%r9d		# load bytes 1-4
	mov    (%rdx),%r10b		# load byte 0
	mov    %r9d,0x1(%rcx)
	mov    %r10b,(%rcx)
	ret
2867d0b3732eSbholler
/*
 * L(bkP6QI) .. L(bkP6Q1): fall-through tail for residual counts of the
 * form 8*q + 6.
 *
 * Entering at L(bkP6Q<q>) copies q quadwords from offsets 8*q - 2 down
 * to 6, highest offset first, then falls into L(bkP6Q0), which copies
 * the last 6 bytes and returns.  %rdx / %rcx are expected to address
 * the lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2868d0b3732eSbholler	.balign 16
2869d0b3732eSbhollerL(bkP6QI):
2870d0b3732eSbholler	mov    0x8e(%rdx),%r10
2871d0b3732eSbholler	mov    %r10,0x8e(%rcx)
2872d0b3732eSbhollerL(bkP6QH):
2873d0b3732eSbholler	mov    0x86(%rdx),%r11
2874d0b3732eSbholler	mov    %r11,0x86(%rcx)
2875d0b3732eSbhollerL(bkP6QG):
2876d0b3732eSbholler	mov    0x7e(%rdx),%r10
2877d0b3732eSbholler	mov    %r10,0x7e(%rcx)
2878d0b3732eSbhollerL(bkP6QF):
2879d0b3732eSbholler	mov    0x76(%rdx),%r9
2880d0b3732eSbholler	mov    %r9,0x76(%rcx)
2881d0b3732eSbhollerL(bkP6QE):
2882d0b3732eSbholler	mov    0x6e(%rdx),%r11
2883d0b3732eSbholler	mov    %r11,0x6e(%rcx)
2884d0b3732eSbhollerL(bkP6QD):
2885d0b3732eSbholler	mov    0x66(%rdx),%r10
2886d0b3732eSbholler	mov    %r10,0x66(%rcx)
2887d0b3732eSbhollerL(bkP6QC):
2888d0b3732eSbholler	mov    0x5e(%rdx),%r9
2889d0b3732eSbholler	mov    %r9,0x5e(%rcx)
2890d0b3732eSbhollerL(bkP6QB):
2891d0b3732eSbholler	mov    0x56(%rdx),%r11
2892d0b3732eSbholler	mov    %r11,0x56(%rcx)
2893d0b3732eSbhollerL(bkP6QA):
2894d0b3732eSbholler	mov    0x4e(%rdx),%r10
2895d0b3732eSbholler	mov    %r10,0x4e(%rcx)
2896d0b3732eSbhollerL(bkP6Q9):
2897d0b3732eSbholler	mov    0x46(%rdx),%r9
2898d0b3732eSbholler	mov    %r9,0x46(%rcx)
2899d0b3732eSbhollerL(bkP6Q8):
2900d0b3732eSbholler	mov    0x3e(%rdx),%r11
2901d0b3732eSbholler	mov    %r11,0x3e(%rcx)
2902d0b3732eSbhollerL(bkP6Q7):
2903d0b3732eSbholler	mov    0x36(%rdx),%r10
2904d0b3732eSbholler	mov    %r10,0x36(%rcx)
2905d0b3732eSbhollerL(bkP6Q6):
2906d0b3732eSbholler	mov    0x2e(%rdx),%r9
2907d0b3732eSbholler	mov    %r9,0x2e(%rcx)
2908d0b3732eSbhollerL(bkP6Q5):
2909d0b3732eSbholler	mov    0x26(%rdx),%r11
2910d0b3732eSbholler	mov    %r11,0x26(%rcx)
2911d0b3732eSbhollerL(bkP6Q4):
2912d0b3732eSbholler	mov    0x1e(%rdx),%r10
2913d0b3732eSbholler	mov    %r10,0x1e(%rcx)
2914d0b3732eSbhollerL(bkP6Q3):
2915d0b3732eSbholler	mov    0x16(%rdx),%r9
2916d0b3732eSbholler	mov    %r9,0x16(%rcx)
2917d0b3732eSbhollerL(bkP6Q2):
2918d0b3732eSbholler	mov    0xe(%rdx),%r11
2919d0b3732eSbholler	mov    %r11,0xe(%rcx)
2920d0b3732eSbhollerL(bkP6Q1):
2921d0b3732eSbholler	mov    0x6(%rdx),%r10
2922d0b3732eSbholler	mov    %r10,0x6(%rcx)
/*
 * L(bkP6Q0): copy the final 6 bytes at (%rdx) to (%rcx) and return.
 *
 * Reached by falling through from L(bkP6Q1) or directly through the
 * L(bkPxQx) table.  Both loads are issued before either store, so the
 * bytes read cannot have been modified by this tail's own stores,
 * however the source and destination tails overlap.
 * Scratch: %r9, %r10.
 */
L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x2(%rdx),%r9d		# load bytes 2-5
	mov    (%rdx),%r10w		# load bytes 0-1
	mov    %r9d,0x2(%rcx)
	mov    %r10w,(%rcx)
	ret
2929d0b3732eSbholler
/*
 * L(bkP7QI) .. L(bkP7Q1): fall-through tail for residual counts of the
 * form 8*q + 7.
 *
 * Entering at L(bkP7Q<q>) copies q quadwords from offsets 8*q - 1 down
 * to 7, highest offset first, then falls into L(bkP7Q0), which copies
 * the last 7 bytes and returns.  %rdx / %rcx are expected to address
 * the lowest source / destination byte of the residual block.
 * Scratch: %r9, %r10, %r11.
 */
2930d0b3732eSbholler	.balign 16
2931d0b3732eSbhollerL(bkP7QI):
2932d0b3732eSbholler	mov    0x8f(%rdx),%r10
2933d0b3732eSbholler	mov    %r10,0x8f(%rcx)
2934d0b3732eSbhollerL(bkP7QH):
2935d0b3732eSbholler	mov    0x87(%rdx),%r11
2936d0b3732eSbholler	mov    %r11,0x87(%rcx)
2937d0b3732eSbhollerL(bkP7QG):
2938d0b3732eSbholler	mov    0x7f(%rdx),%r10
2939d0b3732eSbholler	mov    %r10,0x7f(%rcx)
2940d0b3732eSbhollerL(bkP7QF):
2941d0b3732eSbholler	mov    0x77(%rdx),%r9
2942d0b3732eSbholler	mov    %r9,0x77(%rcx)
2943d0b3732eSbhollerL(bkP7QE):
2944d0b3732eSbholler	mov    0x6f(%rdx),%r11
2945d0b3732eSbholler	mov    %r11,0x6f(%rcx)
2946d0b3732eSbhollerL(bkP7QD):
2947d0b3732eSbholler	mov    0x67(%rdx),%r10
2948d0b3732eSbholler	mov    %r10,0x67(%rcx)
2949d0b3732eSbhollerL(bkP7QC):
2950d0b3732eSbholler	mov    0x5f(%rdx),%r9
2951d0b3732eSbholler	mov    %r9,0x5f(%rcx)
2952d0b3732eSbhollerL(bkP7QB):
2953d0b3732eSbholler	mov    0x57(%rdx),%r11
2954d0b3732eSbholler	mov    %r11,0x57(%rcx)
2955d0b3732eSbhollerL(bkP7QA):
2956d0b3732eSbholler	mov    0x4f(%rdx),%r10
2957d0b3732eSbholler	mov    %r10,0x4f(%rcx)
2958d0b3732eSbhollerL(bkP7Q9):
2959d0b3732eSbholler	mov    0x47(%rdx),%r9
2960d0b3732eSbholler	mov    %r9,0x47(%rcx)
2961d0b3732eSbhollerL(bkP7Q8):
2962d0b3732eSbholler	mov    0x3f(%rdx),%r11
2963d0b3732eSbholler	mov    %r11,0x3f(%rcx)
2964d0b3732eSbhollerL(bkP7Q7):
2965d0b3732eSbholler	mov    0x37(%rdx),%r10
2966d0b3732eSbholler	mov    %r10,0x37(%rcx)
2967d0b3732eSbhollerL(bkP7Q6):
2968d0b3732eSbholler	mov    0x2f(%rdx),%r9
2969d0b3732eSbholler	mov    %r9,0x2f(%rcx)
2970d0b3732eSbhollerL(bkP7Q5):
2971d0b3732eSbholler	mov    0x27(%rdx),%r11
2972d0b3732eSbholler	mov    %r11,0x27(%rcx)
2973d0b3732eSbhollerL(bkP7Q4):
2974d0b3732eSbholler	mov    0x1f(%rdx),%r10
2975d0b3732eSbholler	mov    %r10,0x1f(%rcx)
2976d0b3732eSbhollerL(bkP7Q3):
2977d0b3732eSbholler	mov    0x17(%rdx),%r9
2978d0b3732eSbholler	mov    %r9,0x17(%rcx)
2979d0b3732eSbhollerL(bkP7Q2):
2980d0b3732eSbholler	mov    0xf(%rdx),%r11
2981d0b3732eSbholler	mov    %r11,0xf(%rcx)
2982d0b3732eSbhollerL(bkP7Q1):
2983d0b3732eSbholler	mov    0x7(%rdx),%r10
2984d0b3732eSbholler	mov    %r10,0x7(%rcx)
/*
 * L(bkP7Q0): copy the final 7 bytes at (%rdx) to (%rcx) and return.
 *
 * Reached by falling through from L(bkP7Q1) or directly through the
 * L(bkPxQx) table.  All three loads are issued before the first store,
 * so the bytes read cannot have been modified by this tail's own
 * stores, however the source and destination tails overlap.
 * Scratch: %r9, %r10, %r11.
 */
L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x3(%rdx),%r9d		# load bytes 3-6
	mov    0x1(%rdx),%r10w		# load bytes 1-2
	mov    (%rdx),%r11b		# load byte 0
	mov    %r9d,0x3(%rcx)
	mov    %r10w,0x1(%rcx)
	mov    %r11b,(%rcx)
	ret
2993d0b3732eSbholler
/*
 * L(bkPxQx): dispatch table for the backward-copy tails.
 *
 * One 32-bit entry per residual byte count i, for i = 0 through 0x97.
 * Entry i holds the distance from the table itself to the label
 * L(bkP<p>Q<q>) with p = i % 8 and q = i / 8 (q is written 0-9, then
 * A-I for 10-18), i.e. the entry point that copies exactly i bytes.
 * Users load the entry with movslq and add the table address to it
 * before the indirect jump.
 */
2994d0b3732eSbholler		.balign 16
2995d0b3732eSbhollerL(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
2996d0b3732eSbholler		.int L(bkP1Q0)-L(bkPxQx)
2997d0b3732eSbholler		.int L(bkP2Q0)-L(bkPxQx)
2998d0b3732eSbholler		.int L(bkP3Q0)-L(bkPxQx)
2999d0b3732eSbholler		.int L(bkP4Q0)-L(bkPxQx)
3000d0b3732eSbholler		.int L(bkP5Q0)-L(bkPxQx)
3001d0b3732eSbholler		.int L(bkP6Q0)-L(bkPxQx)
3002d0b3732eSbholler		.int L(bkP7Q0)-L(bkPxQx)
3003d0b3732eSbholler
3004d0b3732eSbholler		.int L(bkP0Q1)-L(bkPxQx)
3005d0b3732eSbholler		.int L(bkP1Q1)-L(bkPxQx)
3006d0b3732eSbholler		.int L(bkP2Q1)-L(bkPxQx)
3007d0b3732eSbholler		.int L(bkP3Q1)-L(bkPxQx)
3008d0b3732eSbholler		.int L(bkP4Q1)-L(bkPxQx)
3009d0b3732eSbholler		.int L(bkP5Q1)-L(bkPxQx)
3010d0b3732eSbholler		.int L(bkP6Q1)-L(bkPxQx)
3011d0b3732eSbholler		.int L(bkP7Q1)-L(bkPxQx)
3012d0b3732eSbholler
3013d0b3732eSbholler		.int L(bkP0Q2)-L(bkPxQx)
3014d0b3732eSbholler		.int L(bkP1Q2)-L(bkPxQx)
3015d0b3732eSbholler		.int L(bkP2Q2)-L(bkPxQx)
3016d0b3732eSbholler		.int L(bkP3Q2)-L(bkPxQx)
3017d0b3732eSbholler		.int L(bkP4Q2)-L(bkPxQx)
3018d0b3732eSbholler		.int L(bkP5Q2)-L(bkPxQx)
3019d0b3732eSbholler		.int L(bkP6Q2)-L(bkPxQx)
3020d0b3732eSbholler		.int L(bkP7Q2)-L(bkPxQx)
3021d0b3732eSbholler
3022d0b3732eSbholler		.int L(bkP0Q3)-L(bkPxQx)
3023d0b3732eSbholler		.int L(bkP1Q3)-L(bkPxQx)
3024d0b3732eSbholler		.int L(bkP2Q3)-L(bkPxQx)
3025d0b3732eSbholler		.int L(bkP3Q3)-L(bkPxQx)
3026d0b3732eSbholler		.int L(bkP4Q3)-L(bkPxQx)
3027d0b3732eSbholler		.int L(bkP5Q3)-L(bkPxQx)
3028d0b3732eSbholler		.int L(bkP6Q3)-L(bkPxQx)
3029d0b3732eSbholler		.int L(bkP7Q3)-L(bkPxQx)
3030d0b3732eSbholler
3031d0b3732eSbholler		.int L(bkP0Q4)-L(bkPxQx)
3032d0b3732eSbholler		.int L(bkP1Q4)-L(bkPxQx)
3033d0b3732eSbholler		.int L(bkP2Q4)-L(bkPxQx)
3034d0b3732eSbholler		.int L(bkP3Q4)-L(bkPxQx)
3035d0b3732eSbholler		.int L(bkP4Q4)-L(bkPxQx)
3036d0b3732eSbholler		.int L(bkP5Q4)-L(bkPxQx)
3037d0b3732eSbholler		.int L(bkP6Q4)-L(bkPxQx)
3038d0b3732eSbholler		.int L(bkP7Q4)-L(bkPxQx)
3039d0b3732eSbholler
3040d0b3732eSbholler		.int L(bkP0Q5)-L(bkPxQx)
3041d0b3732eSbholler		.int L(bkP1Q5)-L(bkPxQx)
3042d0b3732eSbholler		.int L(bkP2Q5)-L(bkPxQx)
3043d0b3732eSbholler		.int L(bkP3Q5)-L(bkPxQx)
3044d0b3732eSbholler		.int L(bkP4Q5)-L(bkPxQx)
3045d0b3732eSbholler		.int L(bkP5Q5)-L(bkPxQx)
3046d0b3732eSbholler		.int L(bkP6Q5)-L(bkPxQx)
3047d0b3732eSbholler		.int L(bkP7Q5)-L(bkPxQx)
3048d0b3732eSbholler
3049d0b3732eSbholler		.int L(bkP0Q6)-L(bkPxQx)
3050d0b3732eSbholler		.int L(bkP1Q6)-L(bkPxQx)
3051d0b3732eSbholler		.int L(bkP2Q6)-L(bkPxQx)
3052d0b3732eSbholler		.int L(bkP3Q6)-L(bkPxQx)
3053d0b3732eSbholler		.int L(bkP4Q6)-L(bkPxQx)
3054d0b3732eSbholler		.int L(bkP5Q6)-L(bkPxQx)
3055d0b3732eSbholler		.int L(bkP6Q6)-L(bkPxQx)
3056d0b3732eSbholler		.int L(bkP7Q6)-L(bkPxQx)
3057d0b3732eSbholler
3058d0b3732eSbholler		.int L(bkP0Q7)-L(bkPxQx)
3059d0b3732eSbholler		.int L(bkP1Q7)-L(bkPxQx)
3060d0b3732eSbholler		.int L(bkP2Q7)-L(bkPxQx)
3061d0b3732eSbholler		.int L(bkP3Q7)-L(bkPxQx)
3062d0b3732eSbholler		.int L(bkP4Q7)-L(bkPxQx)
3063d0b3732eSbholler		.int L(bkP5Q7)-L(bkPxQx)
3064d0b3732eSbholler		.int L(bkP6Q7)-L(bkPxQx)
3065d0b3732eSbholler		.int L(bkP7Q7)-L(bkPxQx)
3066d0b3732eSbholler
3067d0b3732eSbholler		.int L(bkP0Q8)-L(bkPxQx)
3068d0b3732eSbholler		.int L(bkP1Q8)-L(bkPxQx)
3069d0b3732eSbholler		.int L(bkP2Q8)-L(bkPxQx)
3070d0b3732eSbholler		.int L(bkP3Q8)-L(bkPxQx)
3071d0b3732eSbholler		.int L(bkP4Q8)-L(bkPxQx)
3072d0b3732eSbholler		.int L(bkP5Q8)-L(bkPxQx)
3073d0b3732eSbholler		.int L(bkP6Q8)-L(bkPxQx)
3074d0b3732eSbholler		.int L(bkP7Q8)-L(bkPxQx)
3075d0b3732eSbholler
3076d0b3732eSbholler		.int L(bkP0Q9)-L(bkPxQx)
3077d0b3732eSbholler		.int L(bkP1Q9)-L(bkPxQx)
3078d0b3732eSbholler		.int L(bkP2Q9)-L(bkPxQx)
3079d0b3732eSbholler		.int L(bkP3Q9)-L(bkPxQx)
3080d0b3732eSbholler		.int L(bkP4Q9)-L(bkPxQx)
3081d0b3732eSbholler		.int L(bkP5Q9)-L(bkPxQx)
3082d0b3732eSbholler		.int L(bkP6Q9)-L(bkPxQx)
3083d0b3732eSbholler		.int L(bkP7Q9)-L(bkPxQx)
3084d0b3732eSbholler
3085d0b3732eSbholler		.int L(bkP0QA)-L(bkPxQx)
3086d0b3732eSbholler		.int L(bkP1QA)-L(bkPxQx)
3087d0b3732eSbholler		.int L(bkP2QA)-L(bkPxQx)
3088d0b3732eSbholler		.int L(bkP3QA)-L(bkPxQx)
3089d0b3732eSbholler		.int L(bkP4QA)-L(bkPxQx)
3090d0b3732eSbholler		.int L(bkP5QA)-L(bkPxQx)
3091d0b3732eSbholler		.int L(bkP6QA)-L(bkPxQx)
3092d0b3732eSbholler		.int L(bkP7QA)-L(bkPxQx)
3093d0b3732eSbholler
3094d0b3732eSbholler		.int L(bkP0QB)-L(bkPxQx)
3095d0b3732eSbholler		.int L(bkP1QB)-L(bkPxQx)
3096d0b3732eSbholler		.int L(bkP2QB)-L(bkPxQx)
3097d0b3732eSbholler		.int L(bkP3QB)-L(bkPxQx)
3098d0b3732eSbholler		.int L(bkP4QB)-L(bkPxQx)
3099d0b3732eSbholler		.int L(bkP5QB)-L(bkPxQx)
3100d0b3732eSbholler		.int L(bkP6QB)-L(bkPxQx)
3101d0b3732eSbholler		.int L(bkP7QB)-L(bkPxQx)
3102d0b3732eSbholler
3103d0b3732eSbholler		.int L(bkP0QC)-L(bkPxQx)
3104d0b3732eSbholler		.int L(bkP1QC)-L(bkPxQx)
3105d0b3732eSbholler		.int L(bkP2QC)-L(bkPxQx)
3106d0b3732eSbholler		.int L(bkP3QC)-L(bkPxQx)
3107d0b3732eSbholler		.int L(bkP4QC)-L(bkPxQx)
3108d0b3732eSbholler		.int L(bkP5QC)-L(bkPxQx)
3109d0b3732eSbholler		.int L(bkP6QC)-L(bkPxQx)
3110d0b3732eSbholler		.int L(bkP7QC)-L(bkPxQx)
3111d0b3732eSbholler
3112d0b3732eSbholler		.int L(bkP0QD)-L(bkPxQx)
3113d0b3732eSbholler		.int L(bkP1QD)-L(bkPxQx)
3114d0b3732eSbholler		.int L(bkP2QD)-L(bkPxQx)
3115d0b3732eSbholler		.int L(bkP3QD)-L(bkPxQx)
3116d0b3732eSbholler		.int L(bkP4QD)-L(bkPxQx)
3117d0b3732eSbholler		.int L(bkP5QD)-L(bkPxQx)
3118d0b3732eSbholler		.int L(bkP6QD)-L(bkPxQx)
3119d0b3732eSbholler		.int L(bkP7QD)-L(bkPxQx)
3120d0b3732eSbholler
3121d0b3732eSbholler		.int L(bkP0QE)-L(bkPxQx)
3122d0b3732eSbholler		.int L(bkP1QE)-L(bkPxQx)
3123d0b3732eSbholler		.int L(bkP2QE)-L(bkPxQx)
3124d0b3732eSbholler		.int L(bkP3QE)-L(bkPxQx)
3125d0b3732eSbholler		.int L(bkP4QE)-L(bkPxQx)
3126d0b3732eSbholler		.int L(bkP5QE)-L(bkPxQx)
3127d0b3732eSbholler		.int L(bkP6QE)-L(bkPxQx)
3128d0b3732eSbholler		.int L(bkP7QE)-L(bkPxQx)
3129d0b3732eSbholler
3130d0b3732eSbholler		.int L(bkP0QF)-L(bkPxQx)
3131d0b3732eSbholler		.int L(bkP1QF)-L(bkPxQx)
3132d0b3732eSbholler		.int L(bkP2QF)-L(bkPxQx)
3133d0b3732eSbholler		.int L(bkP3QF)-L(bkPxQx)
3134d0b3732eSbholler		.int L(bkP4QF)-L(bkPxQx)
3135d0b3732eSbholler		.int L(bkP5QF)-L(bkPxQx)
3136d0b3732eSbholler		.int L(bkP6QF)-L(bkPxQx)
3137d0b3732eSbholler		.int L(bkP7QF)-L(bkPxQx)
3138d0b3732eSbholler
3139d0b3732eSbholler		.int L(bkP0QG)-L(bkPxQx)
3140d0b3732eSbholler		.int L(bkP1QG)-L(bkPxQx)
3141d0b3732eSbholler		.int L(bkP2QG)-L(bkPxQx)
3142d0b3732eSbholler		.int L(bkP3QG)-L(bkPxQx)
3143d0b3732eSbholler		.int L(bkP4QG)-L(bkPxQx)
3144d0b3732eSbholler		.int L(bkP5QG)-L(bkPxQx)
3145d0b3732eSbholler		.int L(bkP6QG)-L(bkPxQx)
3146d0b3732eSbholler		.int L(bkP7QG)-L(bkPxQx)
3147d0b3732eSbholler
3148d0b3732eSbholler		.int L(bkP0QH)-L(bkPxQx)
3149d0b3732eSbholler		.int L(bkP1QH)-L(bkPxQx)
3150d0b3732eSbholler		.int L(bkP2QH)-L(bkPxQx)
3151d0b3732eSbholler		.int L(bkP3QH)-L(bkPxQx)
3152d0b3732eSbholler		.int L(bkP4QH)-L(bkPxQx)
3153d0b3732eSbholler		.int L(bkP5QH)-L(bkPxQx)
3154d0b3732eSbholler		.int L(bkP6QH)-L(bkPxQx)
3155d0b3732eSbholler		.int L(bkP7QH)-L(bkPxQx)
3156d0b3732eSbholler
3157d0b3732eSbholler		.int L(bkP0QI)-L(bkPxQx)
3158d0b3732eSbholler		.int L(bkP1QI)-L(bkPxQx)
3159d0b3732eSbholler		.int L(bkP2QI)-L(bkPxQx)
3160d0b3732eSbholler		.int L(bkP3QI)-L(bkPxQx)
3161d0b3732eSbholler		.int L(bkP4QI)-L(bkPxQx)
3162d0b3732eSbholler		.int L(bkP5QI)-L(bkPxQx)
3163d0b3732eSbholler		.int L(bkP6QI)-L(bkPxQx)
3164d0b3732eSbholler		.int L(bkP7QI)-L(bkPxQx)
3165d0b3732eSbholler
31667c478bd9Sstevel@tonic-gate	SET_SIZE(memmove)
3167