xref: /titanic_52/usr/src/lib/libc/amd64/gen/memcpy.s (revision e23347b1b88ce2c0847fad6e9467a1f953597aa7)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2008, Intel Corporation
29 * All rights reserved.
30 */
31
32/*
33 * memcpy.s - copies a block of memory from one location to another
34 *	Implements memcpy() and memmove() libc primitives.
35 */
36
37	.file	"memcpy.s"
38
39#include <sys/asm_linkage.h>
40
41	ANSI_PRAGMA_WEAK(memmove,function)
42	ANSI_PRAGMA_WEAK(memcpy,function)
43
44#include "cache.h"
45#include "proc64_id.h"
46
47#define L(s) .memcpy/**/s
48
49/*
50 * memcpy algorithm overview:
51 *
52 * Thresholds used below were determined experimentally.
53 *
54 * Pseudo code:
55 *
56 * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
57 * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
58 * future AMD processors.
59 *
60 *
61 * If (size <= 128 bytes) {
62 *	do unrolled code (primarily 8-byte loads/stores) regardless of
63 *	alignment.
64 * } else {
65 *	Align destination to 16-byte boundary
66 *
67 *      if (NO_SSE) {
68 *		If (size > half of the largest level cache) {
69 *			Use 8-byte non-temporal stores (64-bytes/loop)
70 *		} else {
71 *			if (size > 4K && size <= half l1 cache size) {
72 *				Use rep movsq
73 *			} else {
74 *				Use 8-byte loads/stores (64 bytes per loop)
75 *			}
76 *		}
77 *
78 *	} else { **USE SSE**
79 *		If (size > half of the largest level cache) {
80 *			Use 16-byte non-temporal stores (128-bytes per loop)
81 *		} else {
82 *			If (both source and destination are aligned) {
83 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
84 *			} else {
85 *			    use pairs of xmm registers with SSE2 or SSSE3
86 *			    instructions to concatenate and shift appropriately
87 *			    to account for source unalignment. This enables
88 *			    16-byte aligned loads to be done.
89 *			}
90 *		}
91 *	}
92 *
93 *	Finish any remaining bytes via unrolled code above.
94 * }
95 *
96 * memmove overview:
97 *	memmove is the same as memcpy except one case where copy needs to be
98 *	done backwards. The copy backwards code is done in a similar manner.
99 */
100
/*
 * void *memmove(void *dst, const void *src, size_t len)
 *	In: %rdi = dst, %rsi = src, %rdx = len
 *
 * Selects the copy direction.  The forward path, shared with memcpy, is
 * used when dst <= src or when dst is at or past the end of the source
 * (src + len).  Otherwise dst lies inside the source range and control
 * transfers to L(CopyBackwards).  Only %r9 and the flags are written here;
 * the argument registers reach either path unchanged.
 */
101	ENTRY(memmove)
102	cmp	%rsi,%rdi		# if dst <= src
103	jbe	L(CopyForward)		# then do copy forward
104	mov	%rsi,%r9		# move src to r9
105	add	%rdx,%r9		# add len to get addr of end of src
106	cmp	%r9,%rdi		# if dst < end of src
107	jb	L(CopyBackwards)	# then do copy backwards
108	jmp	L(CopyForward)
109
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *	In: %rdi = dst, %rsi = src, %rdx = len
 *
 * Also the forward-copy path of memmove, entered at L(CopyForward).
 * Register roles established here and used by the forward code below:
 *	%r8  = number of bytes still to copy
 *	%rcx = destination cursor
 *	%rdx = source cursor
 *	%rax = original dst, left untouched as the return value
 *
 * Lengths of 0..128 bytes are handled by advancing both cursors to the end
 * of their buffers and jumping through L(fwdPxQx), a table of 32-bit
 * offsets (relative to the table itself) indexed by the length.  Larger
 * lengths continue at L(ck_use_sse2).
 */
110	ENTRY (memcpy)
111L(CopyForward):
112	mov    %rdx,%r8			# r8 = len
113	mov    %rdi,%rcx		# rcx = dst cursor
114	mov    %rsi,%rdx		# rdx = src cursor
115	mov    %rdi,%rax		# rax = dst (return value)
116	lea    L(fwdPxQx)(%rip),%r11
117	cmp    $0x80,%r8		# 128
	/*
	 * The length is a size_t, so the comparison has to be unsigned.  With
	 * a signed test a length that has its top bit set looks negative,
	 * falls through to the small-copy dispatch and indexes L(fwdPxQx)
	 * far outside the table.
	 */
118	ja     L(ck_use_sse2)
119	add    %r8,%rcx			# rcx = end of dst
120	add    %r8,%rdx			# rdx = end of src
121
122	movslq (%r11,%r8,4),%r10	# r10 = table[len], sign-extended offset
123	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
124	jmpq   *%r11
125
/*
 * Reached from L(ck_use_sse2) when the destination cursor %rcx is not
 * 16-byte aligned.  Dispatches through L(AliPxQx), indexed by the low four
 * bits of %rcx, to the stub that copies just enough bytes to align it.
 * The table holds 32-bit offsets relative to its own address.
 */
126	.balign 16
127L(ShrtAlignNew):
128	lea    L(AliPxQx)(%rip),%r11
129	mov    %rcx,%r9
130	and    $0xf,%r9			# r9 = dst & 0xf (table index)
131
132	movslq (%r11,%r9,4),%r10	# r10 = table[index], sign-extended offset
133	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
134	jmpq   *%r11
135
/*
 * Jump table for forward copies of 0..128 (0x80) bytes.
 *
 * Indexed by the byte count; each entry is the 32-bit distance from
 * L(fwdPxQx) to the target label.  Entry n points at L(P<x>Q<y>) where
 * x = n mod 8 is the number of trailing bytes and y = n / 8 is the number
 * of 8-byte words, written as one hex digit (G stands for 16).
 * The table therefore has 129 entries, eight per row below plus the final
 * 0x80 entry.
 */
136	.balign 16
137L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
138           .int        L(P1Q0)-L(fwdPxQx)
139           .int        L(P2Q0)-L(fwdPxQx)
140           .int        L(P3Q0)-L(fwdPxQx)
141           .int        L(P4Q0)-L(fwdPxQx)
142           .int        L(P5Q0)-L(fwdPxQx)
143           .int        L(P6Q0)-L(fwdPxQx)
144           .int        L(P7Q0)-L(fwdPxQx)
145
146           .int        L(P0Q1)-L(fwdPxQx)
147           .int        L(P1Q1)-L(fwdPxQx)
148           .int        L(P2Q1)-L(fwdPxQx)
149           .int        L(P3Q1)-L(fwdPxQx)
150           .int        L(P4Q1)-L(fwdPxQx)
151           .int        L(P5Q1)-L(fwdPxQx)
152           .int        L(P6Q1)-L(fwdPxQx)
153           .int        L(P7Q1)-L(fwdPxQx)
154
155           .int        L(P0Q2)-L(fwdPxQx)
156           .int        L(P1Q2)-L(fwdPxQx)
157           .int        L(P2Q2)-L(fwdPxQx)
158           .int        L(P3Q2)-L(fwdPxQx)
159           .int        L(P4Q2)-L(fwdPxQx)
160           .int        L(P5Q2)-L(fwdPxQx)
161           .int        L(P6Q2)-L(fwdPxQx)
162           .int        L(P7Q2)-L(fwdPxQx)
163
164           .int        L(P0Q3)-L(fwdPxQx)
165           .int        L(P1Q3)-L(fwdPxQx)
166           .int        L(P2Q3)-L(fwdPxQx)
167           .int        L(P3Q3)-L(fwdPxQx)
168           .int        L(P4Q3)-L(fwdPxQx)
169           .int        L(P5Q3)-L(fwdPxQx)
170           .int        L(P6Q3)-L(fwdPxQx)
171           .int        L(P7Q3)-L(fwdPxQx)
172
173           .int        L(P0Q4)-L(fwdPxQx)
174           .int        L(P1Q4)-L(fwdPxQx)
175           .int        L(P2Q4)-L(fwdPxQx)
176           .int        L(P3Q4)-L(fwdPxQx)
177           .int        L(P4Q4)-L(fwdPxQx)
178           .int        L(P5Q4)-L(fwdPxQx)
179           .int        L(P6Q4)-L(fwdPxQx)
180           .int        L(P7Q4)-L(fwdPxQx)
181
182           .int        L(P0Q5)-L(fwdPxQx)
183           .int        L(P1Q5)-L(fwdPxQx)
184           .int        L(P2Q5)-L(fwdPxQx)
185           .int        L(P3Q5)-L(fwdPxQx)
186           .int        L(P4Q5)-L(fwdPxQx)
187           .int        L(P5Q5)-L(fwdPxQx)
188           .int        L(P6Q5)-L(fwdPxQx)
189           .int        L(P7Q5)-L(fwdPxQx)
190
191           .int        L(P0Q6)-L(fwdPxQx)
192           .int        L(P1Q6)-L(fwdPxQx)
193           .int        L(P2Q6)-L(fwdPxQx)
194           .int        L(P3Q6)-L(fwdPxQx)
195           .int        L(P4Q6)-L(fwdPxQx)
196           .int        L(P5Q6)-L(fwdPxQx)
197           .int        L(P6Q6)-L(fwdPxQx)
198           .int        L(P7Q6)-L(fwdPxQx)
199
200           .int        L(P0Q7)-L(fwdPxQx)
201           .int        L(P1Q7)-L(fwdPxQx)
202           .int        L(P2Q7)-L(fwdPxQx)
203           .int        L(P3Q7)-L(fwdPxQx)
204           .int        L(P4Q7)-L(fwdPxQx)
205           .int        L(P5Q7)-L(fwdPxQx)
206           .int        L(P6Q7)-L(fwdPxQx)
207           .int        L(P7Q7)-L(fwdPxQx)
208
209           .int        L(P0Q8)-L(fwdPxQx)
210           .int        L(P1Q8)-L(fwdPxQx)
211           .int        L(P2Q8)-L(fwdPxQx)
212           .int        L(P3Q8)-L(fwdPxQx)
213           .int        L(P4Q8)-L(fwdPxQx)
214           .int        L(P5Q8)-L(fwdPxQx)
215           .int        L(P6Q8)-L(fwdPxQx)
216           .int        L(P7Q8)-L(fwdPxQx)
217
218           .int        L(P0Q9)-L(fwdPxQx)
219           .int        L(P1Q9)-L(fwdPxQx)
220           .int        L(P2Q9)-L(fwdPxQx)
221           .int        L(P3Q9)-L(fwdPxQx)
222           .int        L(P4Q9)-L(fwdPxQx)
223           .int        L(P5Q9)-L(fwdPxQx)
224           .int        L(P6Q9)-L(fwdPxQx)
225           .int        L(P7Q9)-L(fwdPxQx)
226
227           .int        L(P0QA)-L(fwdPxQx)
228           .int        L(P1QA)-L(fwdPxQx)
229           .int        L(P2QA)-L(fwdPxQx)
230           .int        L(P3QA)-L(fwdPxQx)
231           .int        L(P4QA)-L(fwdPxQx)
232           .int        L(P5QA)-L(fwdPxQx)
233           .int        L(P6QA)-L(fwdPxQx)
234           .int        L(P7QA)-L(fwdPxQx)
235
236           .int        L(P0QB)-L(fwdPxQx)
237           .int        L(P1QB)-L(fwdPxQx)
238           .int        L(P2QB)-L(fwdPxQx)
239           .int        L(P3QB)-L(fwdPxQx)
240           .int        L(P4QB)-L(fwdPxQx)
241           .int        L(P5QB)-L(fwdPxQx)
242           .int        L(P6QB)-L(fwdPxQx)
243           .int        L(P7QB)-L(fwdPxQx)
244
245           .int        L(P0QC)-L(fwdPxQx)
246           .int        L(P1QC)-L(fwdPxQx)
247           .int        L(P2QC)-L(fwdPxQx)
248           .int        L(P3QC)-L(fwdPxQx)
249           .int        L(P4QC)-L(fwdPxQx)
250           .int        L(P5QC)-L(fwdPxQx)
251           .int        L(P6QC)-L(fwdPxQx)
252           .int        L(P7QC)-L(fwdPxQx)
253
254           .int        L(P0QD)-L(fwdPxQx)
255           .int        L(P1QD)-L(fwdPxQx)
256           .int        L(P2QD)-L(fwdPxQx)
257           .int        L(P3QD)-L(fwdPxQx)
258           .int        L(P4QD)-L(fwdPxQx)
259           .int        L(P5QD)-L(fwdPxQx)
260           .int        L(P6QD)-L(fwdPxQx)
261           .int        L(P7QD)-L(fwdPxQx)
262
263           .int        L(P0QE)-L(fwdPxQx)
264           .int        L(P1QE)-L(fwdPxQx)
265           .int        L(P2QE)-L(fwdPxQx)
266           .int        L(P3QE)-L(fwdPxQx)
267           .int        L(P4QE)-L(fwdPxQx)
268           .int        L(P5QE)-L(fwdPxQx)
269           .int        L(P6QE)-L(fwdPxQx)
270           .int        L(P7QE)-L(fwdPxQx)
271
272           .int        L(P0QF)-L(fwdPxQx)
273           .int        L(P1QF)-L(fwdPxQx)
274           .int        L(P2QF)-L(fwdPxQx)
275           .int        L(P3QF)-L(fwdPxQx)
276           .int        L(P4QF)-L(fwdPxQx)
277           .int        L(P5QF)-L(fwdPxQx)
278           .int        L(P6QF)-L(fwdPxQx)
279           .int        L(P7QF)-L(fwdPxQx)
280
281           .int        L(P0QG)-L(fwdPxQx)	# 0x80
282
/*
 * Jump table used by L(ShrtAlignNew) to bring the destination to a 16-byte
 * boundary.  Indexed by (dst & 0xf); each entry is the 32-bit distance from
 * L(AliPxQx) to its target.  Entry 0 means the destination is already
 * aligned; entry k (1..15) selects the stub that copies 16 - k bytes.
 */
283	   .balign 16
284L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
285           .int        L(A1Q0)-L(AliPxQx)
286           .int        L(A2Q0)-L(AliPxQx)
287           .int        L(A3Q0)-L(AliPxQx)
288           .int        L(A4Q0)-L(AliPxQx)
289           .int        L(A5Q0)-L(AliPxQx)
290           .int        L(A6Q0)-L(AliPxQx)
291           .int        L(A7Q0)-L(AliPxQx)
292           .int        L(A0Q1)-L(AliPxQx)
293           .int        L(A1Q1)-L(AliPxQx)
294           .int        L(A2Q1)-L(AliPxQx)
295           .int        L(A3Q1)-L(AliPxQx)
296           .int        L(A4Q1)-L(AliPxQx)
297           .int        L(A5Q1)-L(AliPxQx)
298           .int        L(A6Q1)-L(AliPxQx)
299           .int        L(A7Q1)-L(AliPxQx)
300
/*
 * Alignment stub for (dst & 0xf) == 1: copies 15 bytes as 1 + 2 + 4 + 8,
 * advances %rdx and %rcx by 15 and reduces %r8 by 15, which leaves %rcx
 * 16-byte aligned, then rejoins L(now_qw_aligned).
 */
301	.balign 16
302L(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
303	movzbq (%rdx),%r11
304	sub    $0xf,%r8
305	mov    %r11b,(%rcx)
306
307	movzwq 0x1(%rdx),%r10
308	mov    %r10w,0x1(%rcx)
309
310	mov    0x3(%rdx),%r9d
311	mov    %r9d,0x3(%rcx)
312
313	mov    0x7(%rdx),%r11
314	add    $0xf,%rdx
315	mov    %r11,0x7(%rcx)
316
317	add    $0xf,%rcx
318	jmp    L(now_qw_aligned)
319
/*
 * Alignment stub for (dst & 0xf) == 2: copies 14 bytes as 2 + 4 + 8,
 * advances %rdx and %rcx by 14 and reduces %r8 by 14, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
320	.balign 16
321L(A2Q0):			# ; need to move 8+ 6=2+4 bytes
322	movzwq (%rdx),%r10
323	sub    $0xe,%r8
324	mov    %r10w,(%rcx)
325
326	mov    0x2(%rdx),%r9d
327	mov    %r9d,0x2(%rcx)
328
329	mov    0x6(%rdx),%r11
330	add    $0xe,%rdx
331	mov    %r11,0x6(%rcx)
332	add    $0xe,%rcx
333	jmp    L(now_qw_aligned)
334
/*
 * Alignment stub for (dst & 0xf) == 3: copies 13 bytes as 1 + 4 + 8,
 * advances %rdx and %rcx by 13 and reduces %r8 by 13, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
335	.balign 16
336L(A3Q0):			# ; need to move 8+ 5=1+4 bytes
337	movzbq (%rdx),%r11
338	sub    $0xd,%r8
339	mov    %r11b,(%rcx)
340
341	mov    0x1(%rdx),%r9d
342	mov    %r9d,0x1(%rcx)
343
344	mov    0x5(%rdx),%r10
345	add    $0xd,%rdx
346	mov    %r10,0x5(%rcx)
347
348	add    $0xd,%rcx
349	jmp    L(now_qw_aligned)
350
/*
 * Alignment stub for (dst & 0xf) == 4: copies 12 bytes as 4 + 8,
 * advances %rdx and %rcx by 12 and reduces %r8 by 12, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
351	.balign 16
352L(A4Q0):			# ; need to move 8+4 bytes
353	mov    (%rdx),%r9d
354	sub    $0xc,%r8
355	mov    %r9d,(%rcx)
356
357	mov    0x4(%rdx),%r10
358	add    $0xc,%rdx
359	mov    %r10,0x4(%rcx)
360
361	add    $0xc,%rcx
362	jmp    L(now_qw_aligned)
363
/*
 * Alignment stub for (dst & 0xf) == 5: copies 11 bytes as 1 + 2 + 8,
 * advances %rdx and %rcx by 11 and reduces %r8 by 11, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
364	.balign 16
365L(A5Q0):			# ; need to move 8+ 3=1+2 bytes
366	movzbq (%rdx),%r11
367	sub    $0xb,%r8
368	mov    %r11b,(%rcx)
369
370	movzwq 0x1(%rdx),%r10
371	mov    %r10w,0x1(%rcx)
372
373	mov    0x3(%rdx),%r9
374	add    $0xb,%rdx
375	mov    %r9,0x3(%rcx)
376
377	add    $0xb,%rcx
378	jmp    L(now_qw_aligned)
379
/*
 * Alignment stub for (dst & 0xf) == 6: copies 10 bytes as 2 + 8,
 * advances %rdx and %rcx by 10 and reduces %r8 by 10, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
380	.balign 16
381L(A6Q0):			# ; need to move 8+2 bytes
382	movzwq (%rdx),%r10
383	sub    $0xa,%r8
384	mov    %r10w,(%rcx)
385
386	mov    0x2(%rdx),%r9
387	add    $0xa,%rdx
388	mov    %r9,0x2(%rcx)
389
390	add    $0xa,%rcx
391	jmp    L(now_qw_aligned)
392
/*
 * Alignment stub for (dst & 0xf) == 7: copies 9 bytes as 1 + 8,
 * advances %rdx and %rcx by 9 and reduces %r8 by 9, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
393	.balign 16
394L(A7Q0):			# ; need to move 8+1 byte
395	movzbq (%rdx),%r11
396	sub    $0x9,%r8
397	mov    %r11b,(%rcx)
398
399	mov    0x1(%rdx),%r10
400	add    $0x9,%rdx
401	mov    %r10,0x1(%rcx)
402
403	add    $0x9,%rcx
404	jmp    L(now_qw_aligned)
405
/*
 * Alignment stub for (dst & 0xf) == 8: copies a single 8-byte word,
 * advances %rdx and %rcx by 8 and reduces %r8 by 8, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
406	.balign 16
407L(A0Q1):			# ; need to move 8 bytes
408
409	mov    (%rdx),%r10
410	add    $0x8,%rdx
411	sub    $0x8,%r8
412	mov    %r10,(%rcx)
413
414	add    $0x8,%rcx
415	jmp    L(now_qw_aligned)
416
/*
 * Alignment stub for (dst & 0xf) == 9: copies 7 bytes as 1 + 2 + 4,
 * advances %rdx and %rcx by 7 and reduces %r8 by 7, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
417	.balign 16
418L(A1Q1):			# ; need to move 7=1+2+4 bytes
419	movzbq (%rdx),%r11
420	sub    $0x7,%r8
421	mov    %r11b,(%rcx)
422
423	movzwq 0x1(%rdx),%r10
424	mov    %r10w,0x1(%rcx)
425
426	mov    0x3(%rdx),%r9d
427	add    $0x7,%rdx
428	mov    %r9d,0x3(%rcx)
429	add    $0x7,%rcx
430	jmp    L(now_qw_aligned)
431
/*
 * Alignment stub for (dst & 0xf) == 10: copies 6 bytes as 2 + 4,
 * advances %rdx and %rcx by 6 and reduces %r8 by 6, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
432	.balign 16
433L(A2Q1):			# ; need to move 6=2+4 bytes
434	movzwq (%rdx),%r10
435	sub    $0x6,%r8
436	mov    %r10w,(%rcx)
437	mov    0x2(%rdx),%r9d
438	add    $0x6,%rdx
439	mov    %r9d,0x2(%rcx)
440	add    $0x6,%rcx
441	jmp    L(now_qw_aligned)
442
/*
 * Alignment stub for (dst & 0xf) == 11: copies 5 bytes as 1 + 4,
 * advances %rdx and %rcx by 5 and reduces %r8 by 5, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
443	.balign 16
444L(A3Q1):			# ; need to move 5=1+4 bytes
445	movzbq (%rdx),%r11
446	sub    $0x5,%r8
447	mov    %r11b,(%rcx)
448	mov    0x1(%rdx),%r9d
449	add    $0x5,%rdx
450	mov    %r9d,0x1(%rcx)
451	add    $0x5,%rcx
452	jmp    L(now_qw_aligned)
453
/*
 * Alignment stub for (dst & 0xf) == 12: copies one 4-byte word,
 * advances %rdx and %rcx by 4 and reduces %r8 by 4, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
454	.balign 16
455L(A4Q1):			# ; need to move 4 bytes
456	mov    (%rdx),%r9d
457	sub    $0x4,%r8
458	add    $0x4,%rdx
459	mov    %r9d,(%rcx)
460	add    $0x4,%rcx
461	jmp    L(now_qw_aligned)
462
/*
 * Alignment stub for (dst & 0xf) == 13: copies 3 bytes as 1 + 2,
 * advances %rdx and %rcx by 3 and reduces %r8 by 3, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
463	.balign 16
464L(A5Q1):			# ; need to move 3=1+2 bytes
465	movzbq (%rdx),%r11
466	sub    $0x3,%r8
467	mov    %r11b,(%rcx)
468
469	movzwq 0x1(%rdx),%r10
470	add    $0x3,%rdx
471	mov    %r10w,0x1(%rcx)
472
473	add    $0x3,%rcx
474	jmp    L(now_qw_aligned)
475
/*
 * Alignment stub for (dst & 0xf) == 14: copies one 2-byte word,
 * advances %rdx and %rcx by 2 and reduces %r8 by 2, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
476	.balign 16
477L(A6Q1):			# ; need to move 2 bytes
478	movzwq (%rdx),%r10
479	sub    $0x2,%r8
480	add    $0x2,%rdx
481	mov    %r10w,(%rcx)
482	add    $0x2,%rcx
483	jmp    L(now_qw_aligned)
484
/*
 * Alignment stub for (dst & 0xf) == 15: copies a single byte,
 * advances %rdx and %rcx by 1 and reduces %r8 by 1, then rejoins
 * L(now_qw_aligned) with %rcx 16-byte aligned.
 */
485	.balign 16
486L(A7Q1):			# ; need to move 1 byte
487	movzbq (%rdx),%r11
488	dec    %r8
489	inc    %rdx
490	mov    %r11b,(%rcx)
491	inc    %rcx
492	jmp    L(now_qw_aligned)
493
494
/*
 * Small-copy chain for lengths that are a multiple of 8 (0..128 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination,
 * so every access uses a negative displacement.  Entering at L(P0Q<k>)
 * copies the last k 8-byte words (k is a hex digit, G = 16), lowest address
 * first, by falling through the labels below, then returns.  L(P0Q0) is the
 * zero-length case and only returns.  %r9 and %r10 are used as scratch.
 */
495	.balign 16
496L(P0QG):
497	mov    -0x80(%rdx),%r9
498	mov    %r9,-0x80(%rcx)
499L(P0QF):
500	mov    -0x78(%rdx),%r10
501	mov    %r10,-0x78(%rcx)
502L(P0QE):
503	mov    -0x70(%rdx),%r9
504	mov    %r9,-0x70(%rcx)
505L(P0QD):
506	mov    -0x68(%rdx),%r10
507	mov    %r10,-0x68(%rcx)
508L(P0QC):
509	mov    -0x60(%rdx),%r9
510	mov    %r9,-0x60(%rcx)
511L(P0QB):
512	mov    -0x58(%rdx),%r10
513	mov    %r10,-0x58(%rcx)
514L(P0QA):
515	mov    -0x50(%rdx),%r9
516	mov    %r9,-0x50(%rcx)
517L(P0Q9):
518	mov    -0x48(%rdx),%r10
519	mov    %r10,-0x48(%rcx)
520L(P0Q8):
521	mov    -0x40(%rdx),%r9
522	mov    %r9,-0x40(%rcx)
523L(P0Q7):
524	mov    -0x38(%rdx),%r10
525	mov    %r10,-0x38(%rcx)
526L(P0Q6):
527	mov    -0x30(%rdx),%r9
528	mov    %r9,-0x30(%rcx)
529L(P0Q5):
530	mov    -0x28(%rdx),%r10
531	mov    %r10,-0x28(%rcx)
532L(P0Q4):
533	mov    -0x20(%rdx),%r9
534	mov    %r9,-0x20(%rcx)
535L(P0Q3):
536	mov    -0x18(%rdx),%r10
537	mov    %r10,-0x18(%rcx)
538L(P0Q2):
539	mov    -0x10(%rdx),%r9
540	mov    %r9,-0x10(%rcx)
541L(P0Q1):
542	mov    -0x8(%rdx),%r10
543	mov    %r10,-0x8(%rcx)
544L(P0Q0):
545	ret
546
/*
 * Small-copy chain for lengths of the form 8*k + 1 (1..121 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P1Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P1Q0) then copies
 * the final byte and returns.  %r9, %r10 and %r11 are used as scratch.
 */
547	.balign 16
548L(P1QF):
549	mov    -0x79(%rdx),%r9
550	mov    %r9,-0x79(%rcx)
551L(P1QE):
552	mov    -0x71(%rdx),%r11
553	mov    %r11,-0x71(%rcx)
554L(P1QD):
555	mov    -0x69(%rdx),%r10
556	mov    %r10,-0x69(%rcx)
557L(P1QC):
558	mov    -0x61(%rdx),%r9
559	mov    %r9,-0x61(%rcx)
560L(P1QB):
561	mov    -0x59(%rdx),%r11
562	mov    %r11,-0x59(%rcx)
563L(P1QA):
564	mov    -0x51(%rdx),%r10
565	mov    %r10,-0x51(%rcx)
566L(P1Q9):
567	mov    -0x49(%rdx),%r9
568	mov    %r9,-0x49(%rcx)
569L(P1Q8):
570	mov    -0x41(%rdx),%r11
571	mov    %r11,-0x41(%rcx)
572L(P1Q7):
573	mov    -0x39(%rdx),%r10
574	mov    %r10,-0x39(%rcx)
575L(P1Q6):
576	mov    -0x31(%rdx),%r9
577	mov    %r9,-0x31(%rcx)
578L(P1Q5):
579	mov    -0x29(%rdx),%r11
580	mov    %r11,-0x29(%rcx)
581L(P1Q4):
582	mov    -0x21(%rdx),%r10
583	mov    %r10,-0x21(%rcx)
584L(P1Q3):
585	mov    -0x19(%rdx),%r9
586	mov    %r9,-0x19(%rcx)
587L(P1Q2):
588	mov    -0x11(%rdx),%r11
589	mov    %r11,-0x11(%rcx)
590L(P1Q1):
591	mov    -0x9(%rdx),%r10
592	mov    %r10,-0x9(%rcx)
593L(P1Q0):
594	movzbq -0x1(%rdx),%r9
595	mov    %r9b,-0x1(%rcx)
596	ret
597
/*
 * Small-copy chain for lengths of the form 8*k + 2 (2..122 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P2Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P2Q0) then copies
 * the final 2-byte word and returns.  %r9, %r10 and %r11 are scratch.
 */
598	.balign 16
599L(P2QF):
600	mov    -0x7a(%rdx),%r9
601	mov    %r9,-0x7a(%rcx)
602L(P2QE):
603	mov    -0x72(%rdx),%r11
604	mov    %r11,-0x72(%rcx)
605L(P2QD):
606	mov    -0x6a(%rdx),%r10
607	mov    %r10,-0x6a(%rcx)
608L(P2QC):
609	mov    -0x62(%rdx),%r9
610	mov    %r9,-0x62(%rcx)
611L(P2QB):
612	mov    -0x5a(%rdx),%r11
613	mov    %r11,-0x5a(%rcx)
614L(P2QA):
615	mov    -0x52(%rdx),%r10
616	mov    %r10,-0x52(%rcx)
617L(P2Q9):
618	mov    -0x4a(%rdx),%r9
619	mov    %r9,-0x4a(%rcx)
620L(P2Q8):
621	mov    -0x42(%rdx),%r11
622	mov    %r11,-0x42(%rcx)
623L(P2Q7):
624	mov    -0x3a(%rdx),%r10
625	mov    %r10,-0x3a(%rcx)
626L(P2Q6):
627	mov    -0x32(%rdx),%r9
628	mov    %r9,-0x32(%rcx)
629L(P2Q5):
630	mov    -0x2a(%rdx),%r11
631	mov    %r11,-0x2a(%rcx)
632L(P2Q4):
633	mov    -0x22(%rdx),%r10
634	mov    %r10,-0x22(%rcx)
635L(P2Q3):
636	mov    -0x1a(%rdx),%r9
637	mov    %r9,-0x1a(%rcx)
638L(P2Q2):
639	mov    -0x12(%rdx),%r11
640	mov    %r11,-0x12(%rcx)
641L(P2Q1):
642	mov    -0xa(%rdx),%r10
643	mov    %r10,-0xa(%rcx)
644L(P2Q0):
645	movzwq -0x2(%rdx),%r9
646	mov    %r9w,-0x2(%rcx)
647	ret
648
/*
 * Small-copy chain for lengths of the form 8*k + 3 (3..123 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P3Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P3Q0) then copies
 * the final 3 bytes as a 2-byte word plus a byte and returns.
 * %r9, %r10 and %r11 are used as scratch.
 */
649	.balign 16
650L(P3QF):
651	mov    -0x7b(%rdx),%r9
652	mov    %r9,-0x7b(%rcx)
653L(P3QE):
654	mov    -0x73(%rdx),%r11
655	mov    %r11,-0x73(%rcx)
656L(P3QD):
657	mov    -0x6b(%rdx),%r10
658	mov    %r10,-0x6b(%rcx)
659L(P3QC):
660	mov    -0x63(%rdx),%r9
661	mov    %r9,-0x63(%rcx)
662L(P3QB):
663	mov    -0x5b(%rdx),%r11
664	mov    %r11,-0x5b(%rcx)
665L(P3QA):
666	mov    -0x53(%rdx),%r10
667	mov    %r10,-0x53(%rcx)
668L(P3Q9):
669	mov    -0x4b(%rdx),%r9
670	mov    %r9,-0x4b(%rcx)
671L(P3Q8):
672	mov    -0x43(%rdx),%r11
673	mov    %r11,-0x43(%rcx)
674L(P3Q7):
675	mov    -0x3b(%rdx),%r10
676	mov    %r10,-0x3b(%rcx)
677L(P3Q6):
678	mov    -0x33(%rdx),%r9
679	mov    %r9,-0x33(%rcx)
680L(P3Q5):
681	mov    -0x2b(%rdx),%r11
682	mov    %r11,-0x2b(%rcx)
683L(P3Q4):
684	mov    -0x23(%rdx),%r10
685	mov    %r10,-0x23(%rcx)
686L(P3Q3):
687	mov    -0x1b(%rdx),%r9
688	mov    %r9,-0x1b(%rcx)
689L(P3Q2):
690	mov    -0x13(%rdx),%r11
691	mov    %r11,-0x13(%rcx)
692L(P3Q1):
693	mov    -0xb(%rdx),%r10
694	mov    %r10,-0xb(%rcx)
695	/*
696	 * These trailing loads/stores have to do all their loads 1st,
697	 * then do the stores.
698	 */
699L(P3Q0):
700	movzwq -0x3(%rdx),%r9
701	movzbq -0x1(%rdx),%r10
702	mov    %r9w,-0x3(%rcx)
703	mov    %r10b,-0x1(%rcx)
704	ret
705
/*
 * Small-copy chain for lengths of the form 8*k + 4 (4..124 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P4Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P4Q0) then copies
 * the final 4-byte word and returns.  %r9, %r10 and %r11 are scratch.
 */
706	.balign 16
707L(P4QF):
708	mov    -0x7c(%rdx),%r9
709	mov    %r9,-0x7c(%rcx)
710L(P4QE):
711	mov    -0x74(%rdx),%r11
712	mov    %r11,-0x74(%rcx)
713L(P4QD):
714	mov    -0x6c(%rdx),%r10
715	mov    %r10,-0x6c(%rcx)
716L(P4QC):
717	mov    -0x64(%rdx),%r9
718	mov    %r9,-0x64(%rcx)
719L(P4QB):
720	mov    -0x5c(%rdx),%r11
721	mov    %r11,-0x5c(%rcx)
722L(P4QA):
723	mov    -0x54(%rdx),%r10
724	mov    %r10,-0x54(%rcx)
725L(P4Q9):
726	mov    -0x4c(%rdx),%r9
727	mov    %r9,-0x4c(%rcx)
728L(P4Q8):
729	mov    -0x44(%rdx),%r11
730	mov    %r11,-0x44(%rcx)
731L(P4Q7):
732	mov    -0x3c(%rdx),%r10
733	mov    %r10,-0x3c(%rcx)
734L(P4Q6):
735	mov    -0x34(%rdx),%r9
736	mov    %r9,-0x34(%rcx)
737L(P4Q5):
738	mov    -0x2c(%rdx),%r11
739	mov    %r11,-0x2c(%rcx)
740L(P4Q4):
741	mov    -0x24(%rdx),%r10
742	mov    %r10,-0x24(%rcx)
743L(P4Q3):
744	mov    -0x1c(%rdx),%r9
745	mov    %r9,-0x1c(%rcx)
746L(P4Q2):
747	mov    -0x14(%rdx),%r11
748	mov    %r11,-0x14(%rcx)
749L(P4Q1):
750	mov    -0xc(%rdx),%r10
751	mov    %r10,-0xc(%rcx)
752L(P4Q0):
753	mov    -0x4(%rdx),%r9d
754	mov    %r9d,-0x4(%rcx)
755	ret
756
/*
 * Small-copy chain for lengths of the form 8*k + 5 (5..125 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P5Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P5Q0) then copies
 * the final 5 bytes as a 4-byte word plus a byte and returns.
 * %r9, %r10 and %r11 are used as scratch.
 */
757	.balign 16
758L(P5QF):
759	mov    -0x7d(%rdx),%r9
760	mov    %r9,-0x7d(%rcx)
761L(P5QE):
762	mov    -0x75(%rdx),%r11
763	mov    %r11,-0x75(%rcx)
764L(P5QD):
765	mov    -0x6d(%rdx),%r10
766	mov    %r10,-0x6d(%rcx)
767L(P5QC):
768	mov    -0x65(%rdx),%r9
769	mov    %r9,-0x65(%rcx)
770L(P5QB):
771	mov    -0x5d(%rdx),%r11
772	mov    %r11,-0x5d(%rcx)
773L(P5QA):
774	mov    -0x55(%rdx),%r10
775	mov    %r10,-0x55(%rcx)
776L(P5Q9):
777	mov    -0x4d(%rdx),%r9
778	mov    %r9,-0x4d(%rcx)
779L(P5Q8):
780	mov    -0x45(%rdx),%r11
781	mov    %r11,-0x45(%rcx)
782L(P5Q7):
783	mov    -0x3d(%rdx),%r10
784	mov    %r10,-0x3d(%rcx)
785L(P5Q6):
786	mov    -0x35(%rdx),%r9
787	mov    %r9,-0x35(%rcx)
788L(P5Q5):
789	mov    -0x2d(%rdx),%r11
790	mov    %r11,-0x2d(%rcx)
791L(P5Q4):
792	mov    -0x25(%rdx),%r10
793	mov    %r10,-0x25(%rcx)
794L(P5Q3):
795	mov    -0x1d(%rdx),%r9
796	mov    %r9,-0x1d(%rcx)
797L(P5Q2):
798	mov    -0x15(%rdx),%r11
799	mov    %r11,-0x15(%rcx)
800L(P5Q1):
801	mov    -0xd(%rdx),%r10
802	mov    %r10,-0xd(%rcx)
803	/*
804	 * These trailing loads/stores have to do all their loads 1st,
805	 * then do the stores.
806	 */
807L(P5Q0):
808	mov    -0x5(%rdx),%r9d
809	movzbq -0x1(%rdx),%r10
810	mov    %r9d,-0x5(%rcx)
811	mov    %r10b,-0x1(%rcx)
812	ret
813
/*
 * Small-copy chain for lengths of the form 8*k + 6 (6..126 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P6Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P6Q0) then copies
 * the final 6 bytes as a 4-byte word plus a 2-byte word and returns.
 * %r9, %r10 and %r11 are used as scratch.
 */
814	.balign 16
815L(P6QF):
816	mov    -0x7e(%rdx),%r9
817	mov    %r9,-0x7e(%rcx)
818L(P6QE):
819	mov    -0x76(%rdx),%r11
820	mov    %r11,-0x76(%rcx)
821L(P6QD):
822	mov    -0x6e(%rdx),%r10
823	mov    %r10,-0x6e(%rcx)
824L(P6QC):
825	mov    -0x66(%rdx),%r9
826	mov    %r9,-0x66(%rcx)
827L(P6QB):
828	mov    -0x5e(%rdx),%r11
829	mov    %r11,-0x5e(%rcx)
830L(P6QA):
831	mov    -0x56(%rdx),%r10
832	mov    %r10,-0x56(%rcx)
833L(P6Q9):
834	mov    -0x4e(%rdx),%r9
835	mov    %r9,-0x4e(%rcx)
836L(P6Q8):
837	mov    -0x46(%rdx),%r11
838	mov    %r11,-0x46(%rcx)
839L(P6Q7):
840	mov    -0x3e(%rdx),%r10
841	mov    %r10,-0x3e(%rcx)
842L(P6Q6):
843	mov    -0x36(%rdx),%r9
844	mov    %r9,-0x36(%rcx)
845L(P6Q5):
846	mov    -0x2e(%rdx),%r11
847	mov    %r11,-0x2e(%rcx)
848L(P6Q4):
849	mov    -0x26(%rdx),%r10
850	mov    %r10,-0x26(%rcx)
851L(P6Q3):
852	mov    -0x1e(%rdx),%r9
853	mov    %r9,-0x1e(%rcx)
854L(P6Q2):
855	mov    -0x16(%rdx),%r11
856	mov    %r11,-0x16(%rcx)
857L(P6Q1):
858	mov    -0xe(%rdx),%r10
859	mov    %r10,-0xe(%rcx)
860	/*
861	 * These trailing loads/stores have to do all their loads 1st,
862	 * then do the stores.
863	 */
864L(P6Q0):
865	mov    -0x6(%rdx),%r9d
866	movzwq -0x2(%rdx),%r10
867	mov    %r9d,-0x6(%rcx)
868	mov    %r10w,-0x2(%rcx)
869	ret
870
/*
 * Small-copy chain for lengths of the form 8*k + 7 (7..127 bytes).
 *
 * %rdx and %rcx point one byte past the end of the source and destination.
 * Entering at L(P7Q<k>) copies k 8-byte words (k is a hex digit), lowest
 * address first, by falling through the labels below; L(P7Q0) then copies
 * the final 7 bytes as a 4-byte word, a 2-byte word and a byte, and
 * returns.  %r9, %r10 and %r11 are used as scratch.
 */
871	.balign 16
872L(P7QF):
873	mov    -0x7f(%rdx),%r9
874	mov    %r9,-0x7f(%rcx)
875L(P7QE):
876	mov    -0x77(%rdx),%r11
877	mov    %r11,-0x77(%rcx)
878L(P7QD):
879	mov    -0x6f(%rdx),%r10
880	mov    %r10,-0x6f(%rcx)
881L(P7QC):
882	mov    -0x67(%rdx),%r9
883	mov    %r9,-0x67(%rcx)
884L(P7QB):
885	mov    -0x5f(%rdx),%r11
886	mov    %r11,-0x5f(%rcx)
887L(P7QA):
888	mov    -0x57(%rdx),%r10
889	mov    %r10,-0x57(%rcx)
890L(P7Q9):
891	mov    -0x4f(%rdx),%r9
892	mov    %r9,-0x4f(%rcx)
893L(P7Q8):
894	mov    -0x47(%rdx),%r11
895	mov    %r11,-0x47(%rcx)
896L(P7Q7):
897	mov    -0x3f(%rdx),%r10
898	mov    %r10,-0x3f(%rcx)
899L(P7Q6):
900	mov    -0x37(%rdx),%r9
901	mov    %r9,-0x37(%rcx)
902L(P7Q5):
903	mov    -0x2f(%rdx),%r11
904	mov    %r11,-0x2f(%rcx)
905L(P7Q4):
906	mov    -0x27(%rdx),%r10
907	mov    %r10,-0x27(%rcx)
908L(P7Q3):
909	mov    -0x1f(%rdx),%r9
910	mov    %r9,-0x1f(%rcx)
911L(P7Q2):
912	mov    -0x17(%rdx),%r11
913	mov    %r11,-0x17(%rcx)
914L(P7Q1):
915	mov    -0xf(%rdx),%r10
916	mov    %r10,-0xf(%rcx)
917	/*
918	 * These trailing loads/stores have to do all their loads 1st,
919	 * then do the stores.
920	 */
921L(P7Q0):
922	mov    -0x7(%rdx),%r9d
923	movzwq -0x3(%rdx),%r10
924	movzbq -0x1(%rdx),%r11
925	mov    %r9d,-0x7(%rcx)
926	mov    %r10w,-0x3(%rcx)
927	mov    %r11b,-0x1(%rcx)
928	ret
929
/*
 * Large-copy dispatch (reached from L(CopyForward) for lengths above 128).
 *
 * On entry %rcx = destination cursor, %rdx = source cursor, %r8 = bytes
 * remaining.  The steps are:
 *   1. If %rcx is not 16-byte aligned, go to L(ShrtAlignNew); its stubs
 *      come back in at L(now_qw_aligned).
 *   2. If .memops_method equals NO_SSE, use L(Loop8byte_pre).
 *   3. If more bytes remain than half of .largest_level_cache_size, use
 *      L(sse2_nt_move).
 *   4. If the source is 16-byte aligned as well, use L(pre_both_aligned).
 *   5. Otherwise copy the first 16 bytes with an unaligned load, round the
 *      source pointer up to the next 16-byte boundary, preload %xmm1 from
 *      it and jump through L(SSE_src), or L(SSSE3_src) when the USE_SSSE3
 *      bit is set in .memops_method, indexed by (src & 0xf).
 */
930	.balign 16
931L(ck_use_sse2):
932	/*
933	 * Align dest to 16 byte boundary.
934	 */
935	test   $0xf,%rcx
936	jnz    L(ShrtAlignNew)
937
938L(now_qw_aligned):
939	cmpl   $NO_SSE,.memops_method(%rip)
940	je     L(Loop8byte_pre)
941
942	/*
943	 * The fall-through path is to do SSE2 16-byte load/stores
944	 */
945
946	/*
947	 * If current move size is larger than half of the highest level cache
948	 * size, then do non-temporal moves.
949	 */
950	mov    .largest_level_cache_size(%rip),%r9d
951	shr    %r9		# take half of it
952	cmp    %r9,%r8
	/*
	 * NOTE(review): jg is a signed comparison although %r8 holds a byte
	 * count; an unsigned ja looks like the intended test -- confirm
	 * against the rest of the file before changing it.
	 */
953	jg     L(sse2_nt_move)
954
955	/*
956	 * If both the source and dest are aligned, then use the both aligned
957	 * logic. Well aligned data should reap the rewards.
958	 */
959	test   $0xf,%rdx
960	jz     L(pre_both_aligned)
961
962	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
963	testl  $USE_SSSE3,.memops_method(%rip)
964	jz     1f
965	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
966
9671:
968	/*
969	 * if the src is not 16 byte aligned...
970	 */
971	mov    %rdx,%r11
972	and    $0xf,%r11		# r11 = src & 0xf (table index, 1..15 here)
973	movdqu (%rdx),%xmm0		# first 16 bytes: unaligned load ...
974	movdqa %xmm0,(%rcx)		# ... aligned store
975	add    $0x10,%rdx
976	sub    %r11,%rdx		# rdx = next 16-byte boundary above src
977	add    $0x10,%rcx
978	sub    $0x10,%r8
979	movdqa (%rdx),%xmm1		# xmm1 = aligned block holding the next source bytes
980
981	movslq (%r10,%r11,4),%r9	# r9 = table[src & 0xf], sign-extended offset
982	lea    (%r9,%r10,1),%r10	# r10 = table base + offset
983	jmpq   *%r10
984
/*
 * Jump table for the unaligned-source copy loops, selected when the
 * USE_SSSE3 bit is set in .memops_method.  Indexed by (src & 0xf); each
 * entry is the 32-bit distance from L(SSSE3_src) to its target.  Offset 8
 * shares L(movdqa8) with the SSE2 table that follows.
 */
985	    .balign 16
986L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
987	    .int        L(mov3dqa1) -L(SSSE3_src)
988	    .int        L(mov3dqa2) -L(SSSE3_src)
989	    .int        L(mov3dqa3) -L(SSSE3_src)
990	    .int        L(mov3dqa4) -L(SSSE3_src)
991	    .int        L(mov3dqa5) -L(SSSE3_src)
992	    .int        L(mov3dqa6) -L(SSSE3_src)
993	    .int        L(mov3dqa7) -L(SSSE3_src)
994	    .int        L(movdqa8)  -L(SSSE3_src)
995	    .int        L(mov3dqa9) -L(SSSE3_src)
996	    .int        L(mov3dqa10)-L(SSSE3_src)
997	    .int        L(mov3dqa11)-L(SSSE3_src)
998	    .int        L(mov3dqa12)-L(SSSE3_src)
999	    .int        L(mov3dqa13)-L(SSSE3_src)
1000	    .int        L(mov3dqa14)-L(SSSE3_src)
1001	    .int        L(mov3dqa15)-L(SSSE3_src)
/*
 * Default (SSE2) jump table for the unaligned-source copy loops.  Indexed
 * by (src & 0xf); each entry is the 32-bit distance from L(SSE_src) to the
 * L(movdqa<n>) loop that handles a source n bytes past a 16-byte boundary.
 */
1002L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
1003	    .int        L(movdqa1) -L(SSE_src)
1004	    .int        L(movdqa2) -L(SSE_src)
1005	    .int        L(movdqa3) -L(SSE_src)
1006	    .int        L(movdqa4) -L(SSE_src)
1007	    .int        L(movdqa5) -L(SSE_src)
1008	    .int        L(movdqa6) -L(SSE_src)
1009	    .int        L(movdqa7) -L(SSE_src)
1010	    .int        L(movdqa8) -L(SSE_src)
1011	    .int        L(movdqa9) -L(SSE_src)
1012	    .int        L(movdqa10)-L(SSE_src)
1013	    .int        L(movdqa11)-L(SSE_src)
1014	    .int        L(movdqa12)-L(SSE_src)
1015	    .int        L(movdqa13)-L(SSE_src)
1016	    .int        L(movdqa14)-L(SSE_src)
1017	    .int        L(movdqa15)-L(SSE_src)
1018
/*
 * SSE2 copy loop for a source that is 1 byte past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 1 byte, newer block shifted left by 15, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 *
 * The pointer and count updates use lea, which leaves the flags alone, so
 * the result of the cmp below is still valid at the jge.
 */
1019	.balign 16
1020L(movdqa1):
1021	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
1022	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
1023	lea    0x20(%rdx),%rdx
1024	lea    -0x20(%r8),%r8
1025
1026	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
1027	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
1028	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
1029	por    %xmm1,%xmm3 # OR them together
1030	cmp    $0x20,%r8
1031
1032	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
1033	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
1034	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
1035	por    %xmm2,%xmm0 # OR them together
1036	movdqa %xmm3,(%rcx)     # store it
1037	movdqa %xmm0,0x10(%rcx) # store it
1038	lea    0x20(%rcx),%rcx
1039
1040	jge    L(movdqa1)
1041	jmp    L(movdqa_epi)
1042
/*
 * SSE2 copy loop for a source that is 2 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 2 bytes, newer block shifted left by 14, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1043	.balign 16
1044L(movdqa2):
1045	sub    $0x20,%r8
1046	movdqa 0x10(%rdx),%xmm3
1047	movdqa 0x20(%rdx),%xmm0
1048	add    $0x20,%rdx
1049
1050	psrldq $0x2,%xmm1
1051	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1052	pslldq $0xe,%xmm3
1053	por    %xmm1,%xmm3
1054
1055	psrldq $0x2,%xmm2
1056	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1057	pslldq $0xe,%xmm0
1058	por    %xmm2,%xmm0
1059	movdqa %xmm3,(%rcx)
1060	movdqa %xmm0,0x10(%rcx)
1061
1062	add    $0x20,%rcx
1063	cmp    $0x20,%r8
1064	jge    L(movdqa2)
1065	jmp    L(movdqa_epi)
1066
/*
 * SSE2 copy loop for a source that is 3 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 3 bytes, newer block shifted left by 13, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1067	.balign 16
1068L(movdqa3):
1069	sub    $0x20,%r8
1070	movdqa 0x10(%rdx),%xmm3
1071	movdqa 0x20(%rdx),%xmm0
1072	add    $0x20,%rdx
1073
1074	psrldq $0x3,%xmm1
1075	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1076	pslldq $0xd,%xmm3
1077	por    %xmm1,%xmm3
1078
1079	psrldq $0x3,%xmm2
1080	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1081	pslldq $0xd,%xmm0
1082	por    %xmm2,%xmm0
1083	movdqa %xmm3,(%rcx)
1084	movdqa %xmm0,0x10(%rcx)
1085
1086	add    $0x20,%rcx
1087	cmp    $0x20,%r8
1088	jge    L(movdqa3)
1089	jmp    L(movdqa_epi)
1090
/*
 * SSE2 copy loop for a source that is 4 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 4 bytes, newer block shifted left by 12, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1091	.balign 16
1092L(movdqa4):
1093	sub    $0x20,%r8
1094	movdqa 0x10(%rdx),%xmm3
1095	movdqa 0x20(%rdx),%xmm0
1096	add    $0x20,%rdx
1097
1098	psrldq $0x4,%xmm1
1099	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1100	pslldq $0xc,%xmm3
1101	por    %xmm1,%xmm3
1102
1103	psrldq $0x4,%xmm2
1104	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1105	pslldq $0xc,%xmm0
1106	por    %xmm2,%xmm0
1107
1108	movdqa %xmm3,(%rcx)
1109	movdqa %xmm0,0x10(%rcx)
1110
1111	add    $0x20,%rcx
1112	cmp    $0x20,%r8
1113	jge    L(movdqa4)
1114	jmp    L(movdqa_epi)
1115
/*
 * SSE2 copy loop for a source that is 5 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 5 bytes, newer block shifted left by 11, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1116	.balign 16
1117L(movdqa5):
1118	sub    $0x20,%r8
1119	movdqa 0x10(%rdx),%xmm3
1120	movdqa 0x20(%rdx),%xmm0
1121	add    $0x20,%rdx
1122
1123	psrldq $0x5,%xmm1
1124	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1125	pslldq $0xb,%xmm3
1126	por    %xmm1,%xmm3
1127
1128	psrldq $0x5,%xmm2
1129	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1130	pslldq $0xb,%xmm0
1131	por    %xmm2,%xmm0
1132
1133	movdqa %xmm3,(%rcx)
1134	movdqa %xmm0,0x10(%rcx)
1135
1136	add    $0x20,%rcx
1137	cmp    $0x20,%r8
1138	jge    L(movdqa5)
1139	jmp    L(movdqa_epi)
1140
/*
 * SSE2 copy loop for a source that is 6 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 6 bytes, newer block shifted left by 10, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1141	.balign 16
1142L(movdqa6):
1143	sub    $0x20,%r8
1144	movdqa 0x10(%rdx),%xmm3
1145	movdqa 0x20(%rdx),%xmm0
1146	add    $0x20,%rdx
1147
1148	psrldq $0x6,%xmm1
1149	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1150	pslldq $0xa,%xmm3
1151	por    %xmm1,%xmm3
1152
1153	psrldq $0x6,%xmm2
1154	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1155	pslldq $0xa,%xmm0
1156	por    %xmm2,%xmm0
1157	movdqa %xmm3,(%rcx)
1158	movdqa %xmm0,0x10(%rcx)
1159
1160	add    $0x20,%rcx
1161	cmp    $0x20,%r8
1162	jge    L(movdqa6)
1163	jmp    L(movdqa_epi)
1164
/*
 * SSE2 copy loop for a source that is 7 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 7 bytes, newer block shifted left by 9, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1165	.balign 16
1166L(movdqa7):
1167	sub    $0x20,%r8
1168	movdqa 0x10(%rdx),%xmm3
1169	movdqa 0x20(%rdx),%xmm0
1170	add    $0x20,%rdx
1171
1172	psrldq $0x7,%xmm1
1173	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1174	pslldq $0x9,%xmm3
1175	por    %xmm1,%xmm3
1176
1177	psrldq $0x7,%xmm2
1178	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1179	pslldq $0x9,%xmm0
1180	por    %xmm2,%xmm0
1181	movdqa %xmm3,(%rcx)
1182	movdqa %xmm0,0x10(%rcx)
1183
1184	add    $0x20,%rcx
1185	cmp    $0x20,%r8
1186	jge    L(movdqa7)
1187	jmp    L(movdqa_epi)
1188
/*
 * Copy loop for a source that is 8 bytes past a 16-byte boundary; used by
 * both the SSE2 and the SSSE3 dispatch tables.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next three aligned blocks and, for each adjacent pair, uses
 * shufpd $1 to join the high quadword of the older block with the low
 * quadword of the newer one, storing 48 bytes in total.  %r8 drops by 48
 * per pass; the loop repeats while at least 48 bytes remain, then
 * continues at L(movdqa_epi).
 *
 * Everything between the cmp and the jge (shufpd, movdqa, lea) leaves the
 * flags untouched.
 */
1189	.balign 16
1190L(movdqa8):
1191	movdqa 0x10(%rdx),%xmm3
1192	sub    $0x30,%r8
1193	movdqa 0x20(%rdx),%xmm0
1194	movdqa 0x30(%rdx),%xmm5
1195	lea    0x30(%rdx),%rdx
1196
1197	shufpd $0x1,%xmm3,%xmm1
1198	movdqa %xmm1,(%rcx)
1199
1200	cmp    $0x30,%r8
1201
1202	shufpd $0x1,%xmm0,%xmm3
1203	movdqa %xmm3,0x10(%rcx)
1204
1205	movdqa %xmm5,%xmm1		# carry the newest block into the next pass
1206	shufpd $0x1,%xmm5,%xmm0
1207	movdqa %xmm0,0x20(%rcx)
1208
1209	lea    0x30(%rcx),%rcx
1210
1211	jge    L(movdqa8)
1212	jmp    L(movdqa_epi)
1213
/*
 * SSE2 copy loop for a source that is 9 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 9 bytes, newer block shifted left by 7, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1214	.balign 16
1215L(movdqa9):
1216	sub    $0x20,%r8
1217	movdqa 0x10(%rdx),%xmm3
1218	movdqa 0x20(%rdx),%xmm0
1219	add    $0x20,%rdx
1220
1221	psrldq $0x9,%xmm1
1222	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1223	pslldq $0x7,%xmm3
1224	por    %xmm1,%xmm3
1225
1226	psrldq $0x9,%xmm2
1227	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1228	pslldq $0x7,%xmm0
1229	por    %xmm2,%xmm0
1230	movdqa %xmm3,(%rcx)
1231	movdqa %xmm0,0x10(%rcx)
1232
1233	add    $0x20,%rcx
1234	cmp    $0x20,%r8
1235	jge    L(movdqa9)
1236	jmp    L(movdqa_epi)
1237
/*
 * SSE2 copy loop for a source that is 10 bytes past a 16-byte boundary.
 *
 * On entry %rcx is 16-byte aligned, %rdx has been rounded to a 16-byte
 * boundary and %xmm1 holds the aligned block at (%rdx).  Each pass loads
 * the next two aligned blocks, merges each with its predecessor (older
 * block shifted right by 10 bytes, newer block shifted left by 6, then
 * OR-ed) and stores 32 bytes.  %r8 drops by 32 per pass; the loop repeats
 * while at least 32 bytes remain, then continues at L(movdqa_epi).
 */
1238	.balign 16
1239L(movdqa10):
1240	sub    $0x20,%r8
1241	movdqa 0x10(%rdx),%xmm3
1242	movdqa 0x20(%rdx),%xmm0
1243	add    $0x20,%rdx
1244
1245	psrldq $0xa,%xmm1
1246	movdqa %xmm3,%xmm2		# keep an unshifted copy for the second merge
1247	pslldq $0x6,%xmm3
1248	por    %xmm1,%xmm3
1249
1250	psrldq $0xa,%xmm2
1251	movdqa %xmm0,%xmm1		# carry the newest block into the next pass
1252	pslldq $0x6,%xmm0
1253	por    %xmm2,%xmm0
1254	movdqa %xmm3,(%rcx)
1255	movdqa %xmm0,0x10(%rcx)
1256
1257	add    $0x20,%rcx
1258	cmp    $0x20,%r8
1259	jge    L(movdqa10)
1260	jmp    L(movdqa_epi)
1261
1262	.balign 16
1263L(movdqa11):
	/*
	 * Shift-and-merge copy loop: 0x20 bytes per iteration, byte shift 11.
	 * %xmm1 carries the previously loaded 16-byte source vector from one
	 * iteration to the next (its first value is set up before this loop,
	 * outside this chunk).  Each stored vector is
	 *	(previous >> 11 bytes) | (next << 5 bytes)
	 * and is written with movdqa to (%rcx).  The loop repeats while
	 * %r8 >= 0x20 (signed compare), then goes to L(movdqa_epi).
	 */
1264	sub    $0x20,%r8
1265	movdqa 0x10(%rdx),%xmm3
1266	movdqa 0x20(%rdx),%xmm0
1267	add    $0x20,%rdx
1268
1269	psrldq $0xb,%xmm1
1270	movdqa %xmm3,%xmm2	# keep an unshifted copy for the second merge
1271	pslldq $0x5,%xmm3
1272	por    %xmm1,%xmm3
1273
1274	psrldq $0xb,%xmm2
1275	movdqa %xmm0,%xmm1	# newest load becomes "previous" next time
1276	pslldq $0x5,%xmm0
1277	por    %xmm2,%xmm0
1278	movdqa %xmm3,(%rcx)
1279	movdqa %xmm0,0x10(%rcx)
1280
1281	add    $0x20,%rcx
1282	cmp    $0x20,%r8
1283	jge    L(movdqa11)
1284	jmp    L(movdqa_epi)
1285
1286	.balign 16
1287L(movdqa12):
	/*
	 * Shift-and-merge copy loop: 0x20 bytes per iteration, byte shift 12.
	 * %xmm1 carries the previously loaded 16-byte source vector from one
	 * iteration to the next (its first value is set up before this loop,
	 * outside this chunk).  Each stored vector is
	 *	(previous >> 12 bytes) | (next << 4 bytes)
	 * and is written with movdqa to (%rcx).  The loop repeats while
	 * %r8 >= 0x20 (signed compare), then goes to L(movdqa_epi).
	 */
1288	sub    $0x20,%r8
1289	movdqa 0x10(%rdx),%xmm3
1290	movdqa 0x20(%rdx),%xmm0
1291	add    $0x20,%rdx
1292
1293	psrldq $0xc,%xmm1
1294	movdqa %xmm3,%xmm2	# keep an unshifted copy for the second merge
1295	pslldq $0x4,%xmm3
1296	por    %xmm1,%xmm3
1297
1298	psrldq $0xc,%xmm2
1299	movdqa %xmm0,%xmm1	# newest load becomes "previous" next time
1300	pslldq $0x4,%xmm0
1301	por    %xmm2,%xmm0
1302	movdqa %xmm3,(%rcx)
1303	movdqa %xmm0,0x10(%rcx)
1304
1305	add    $0x20,%rcx
1306	cmp    $0x20,%r8
1307	jge    L(movdqa12)
1308	jmp    L(movdqa_epi)
1309
1310	.balign 16
1311L(movdqa13):
	/*
	 * Shift-and-merge copy loop: 0x20 bytes per iteration, byte shift 13.
	 * %xmm1 carries the previously loaded 16-byte source vector from one
	 * iteration to the next (its first value is set up before this loop,
	 * outside this chunk).  Each stored vector is
	 *	(previous >> 13 bytes) | (next << 3 bytes)
	 * and is written with movdqa to (%rcx).  The loop repeats while
	 * %r8 >= 0x20 (signed compare), then goes to L(movdqa_epi).
	 */
1312	sub    $0x20,%r8
1313	movdqa 0x10(%rdx),%xmm3
1314	movdqa 0x20(%rdx),%xmm0
1315	add    $0x20,%rdx
1316
1317	psrldq $0xd,%xmm1
1318	movdqa %xmm3,%xmm2	# keep an unshifted copy for the second merge
1319	pslldq $0x3,%xmm3
1320	por    %xmm1,%xmm3
1321
1322	psrldq $0xd,%xmm2
1323	movdqa %xmm0,%xmm1	# newest load becomes "previous" next time
1324	pslldq $0x3,%xmm0
1325	por    %xmm2,%xmm0
1326	movdqa %xmm3,(%rcx)
1327	movdqa %xmm0,0x10(%rcx)
1328
1329	add    $0x20,%rcx
1330	cmp    $0x20,%r8
1331	jge    L(movdqa13)
1332	jmp    L(movdqa_epi)
1333
1334	.balign 16
1335L(movdqa14):
	/*
	 * Shift-and-merge copy loop: 0x20 bytes per iteration, byte shift 14.
	 * %xmm1 carries the previously loaded 16-byte source vector from one
	 * iteration to the next (its first value is set up before this loop,
	 * outside this chunk).  Each stored vector is
	 *	(previous >> 14 bytes) | (next << 2 bytes)
	 * and is written with movdqa to (%rcx).  The loop repeats while
	 * %r8 >= 0x20 (signed compare), then goes to L(movdqa_epi).
	 */
1336	sub    $0x20,%r8
1337	movdqa 0x10(%rdx),%xmm3
1338	movdqa 0x20(%rdx),%xmm0
1339	add    $0x20,%rdx
1340
1341	psrldq $0xe,%xmm1
1342	movdqa %xmm3,%xmm2	# keep an unshifted copy for the second merge
1343	pslldq $0x2,%xmm3
1344	por    %xmm1,%xmm3
1345
1346	psrldq $0xe,%xmm2
1347	movdqa %xmm0,%xmm1	# newest load becomes "previous" next time
1348	pslldq $0x2,%xmm0
1349	por    %xmm2,%xmm0
1350	movdqa %xmm3,(%rcx)
1351	movdqa %xmm0,0x10(%rcx)
1352
1353	add    $0x20,%rcx
1354	cmp    $0x20,%r8
1355	jge    L(movdqa14)
1356	jmp    L(movdqa_epi)
1357
1358	.balign 16
1359L(movdqa15):
	/*
	 * Shift-and-merge copy loop: 0x20 bytes per iteration, byte shift 15.
	 * %xmm1 carries the previously loaded 16-byte source vector from one
	 * iteration to the next (its first value is set up before this loop,
	 * outside this chunk).  Each stored vector is
	 *	(previous >> 15 bytes) | (next << 1 byte)
	 * and is written with movdqa to (%rcx).  The loop repeats while
	 * %r8 >= 0x20 (signed compare).  Unlike the sibling loops there is
	 * no explicit jump at the end: the jmp is commented out and execution
	 * falls through into L(movdqa_epi), which follows directly.
	 */
1360	sub    $0x20,%r8
1361	movdqa 0x10(%rdx),%xmm3
1362	movdqa 0x20(%rdx),%xmm0
1363	add    $0x20,%rdx
1364
1365	psrldq $0xf,%xmm1
1366	movdqa %xmm3,%xmm2	# keep an unshifted copy for the second merge
1367	pslldq $0x1,%xmm3
1368	por    %xmm1,%xmm3
1369
1370	psrldq $0xf,%xmm2
1371	movdqa %xmm0,%xmm1	# newest load becomes "previous" next time
1372	pslldq $0x1,%xmm0
1373	por    %xmm2,%xmm0
1374	movdqa %xmm3,(%rcx)
1375	movdqa %xmm0,0x10(%rcx)
1376
1377	add    $0x20,%rcx
1378	cmp    $0x20,%r8
1379	jge    L(movdqa15)
1380	#jmp   L(movdqa_epi)
1381
1382	.balign 16
1383L(movdqa_epi):
	/*
	 * Common exit for the L(movdqaN) and L(mov3dqaN) merge loops.
	 * %r11 is added to %rdx to undo the lag noted below (its value is
	 * established outside this chunk).  Both pointers are then advanced
	 * by the remaining length %r8, and control is dispatched through
	 * L(fwdPxQx): a table of 32-bit offsets relative to the table base,
	 * indexed by %r8.  The table and its targets are outside this chunk.
	 */
1384	lea    L(fwdPxQx)(%rip),%r10
1385	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
1386	add    %r8,%rcx
1387	add    %r8,%rdx
1388
1389	movslq (%r10,%r8,4),%r9	# sign-extended table entry
1390	lea    (%r9,%r10,1),%r10	# table base + entry = target address
1391	jmpq   *%r10
1392
1393	.balign 16
1394L(mov3dqa1):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 1.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair
	 * (presumably because the assembler in use lacked the mnemonic --
	 * TODO confirm).  %xmm1 carries the last source vector loaded by the
	 * previous iteration.  The cmp result is consumed by the jge at the
	 * bottom of the loop.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
1395	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
1396	sub	$0x30,%r8
1397	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
1398	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
1399	lea	0x30(%rdx),%rdx
1400	cmp	$0x30,%r8
1401
1402	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
1403	#palignr	$0x1,%xmm1,%xmm3
1404	.byte	0x66,0x0f,0x3a,0x0f
1405	.byte	0xd9,0x01
1406	movdqa	%xmm3,(%rcx)      # store it
1407
1408	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
1409	#palignr	$0x1,%xmm2,%xmm0
1410	.byte	0x66,0x0f,0x3a,0x0f
1411	.byte	0xc2,0x01
1412	movdqa	%xmm0,0x10(%rcx)  # store it
1413
1414	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
1415	#palignr	$0x1,%xmm4,%xmm5
1416	.byte	0x66,0x0f,0x3a,0x0f
1417	.byte	0xec,0x01
1418	movdqa	%xmm5,0x20(%rcx)  # store it
1419
1420	lea	0x30(%rcx),%rcx
1421	jge	L(mov3dqa1)
1422
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1423	cmp	$0x10,%r8
1424	jl	L(movdqa_epi)
1425	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1426	sub	$0x10,%r8
1427	lea	0x10(%rdx),%rdx
1428	movdqa	%xmm3,%xmm2		# save for use next concat
1429	#palignr	$0x1,%xmm1,%xmm3
1430	.byte	0x66,0x0f,0x3a,0x0f
1431	.byte	0xd9,0x01
1432
1433	cmp	$0x10,%r8
1434	movdqa	%xmm3,(%rcx)      	# store it
1435	lea	0x10(%rcx),%rcx
1436	jl	L(movdqa_epi)
1437
1438	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1439	sub	$0x10,%r8
1440	lea	0x10(%rdx),%rdx
1441	#palignr	$0x1,%xmm2,%xmm0
1442	.byte	0x66,0x0f,0x3a,0x0f
1443	.byte	0xc2,0x01
1444	movdqa	%xmm0,(%rcx)      	# store it
1445	lea	0x10(%rcx),%rcx
1446	jmp	L(movdqa_epi)
1447
1448	.balign 16
1449L(mov3dqa2):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 2.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1450	movdqa	0x10(%rdx),%xmm3
1451	sub	$0x30,%r8
1452	movdqa	0x20(%rdx),%xmm0
1453	movdqa	0x30(%rdx),%xmm5
1454	lea	0x30(%rdx),%rdx
1455	cmp	$0x30,%r8
1456
1457	movdqa	%xmm3,%xmm2
1458	#palignr	$0x2,%xmm1,%xmm3
1459	.byte	0x66,0x0f,0x3a,0x0f
1460	.byte	0xd9,0x02
1461	movdqa	%xmm3,(%rcx)
1462
1463	movdqa	%xmm0,%xmm4
1464	#palignr	$0x2,%xmm2,%xmm0
1465	.byte	0x66,0x0f,0x3a,0x0f
1466	.byte	0xc2,0x02
1467	movdqa	%xmm0,0x10(%rcx)
1468
1469	movdqa	%xmm5,%xmm1
1470	#palignr	$0x2,%xmm4,%xmm5
1471	.byte	0x66,0x0f,0x3a,0x0f
1472	.byte	0xec,0x02
1473	movdqa	%xmm5,0x20(%rcx)
1474
1475	lea	0x30(%rcx),%rcx
1476	jge	L(mov3dqa2)
1477
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1478	cmp	$0x10,%r8
1479	jl	L(movdqa_epi)
1480	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1481	sub	$0x10,%r8
1482	lea	0x10(%rdx),%rdx
1483	movdqa	%xmm3,%xmm2		# save for use next concat
1484	#palignr	$0x2,%xmm1,%xmm3
1485	.byte	0x66,0x0f,0x3a,0x0f
1486	.byte	0xd9,0x02
1487
1488	cmp	$0x10,%r8
1489	movdqa	%xmm3,(%rcx)      	# store it
1490	lea	0x10(%rcx),%rcx
1491	jl	L(movdqa_epi)
1492
1493	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1494	sub	$0x10,%r8
1495	lea	0x10(%rdx),%rdx
1496	#palignr	$0x2,%xmm2,%xmm0
1497	.byte	0x66,0x0f,0x3a,0x0f
1498	.byte	0xc2,0x02
1499	movdqa	%xmm0,(%rcx)      	# store it
1500	lea	0x10(%rcx),%rcx
1501	jmp	L(movdqa_epi)
1502
1503	.balign 16
1504L(mov3dqa3):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 3.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1505	movdqa	0x10(%rdx),%xmm3
1506	sub	$0x30,%r8
1507	movdqa	0x20(%rdx),%xmm0
1508	movdqa	0x30(%rdx),%xmm5
1509	lea	0x30(%rdx),%rdx
1510	cmp	$0x30,%r8
1511
1512	movdqa	%xmm3,%xmm2
1513	#palignr	$0x3,%xmm1,%xmm3
1514	.byte	0x66,0x0f,0x3a,0x0f
1515	.byte	0xd9,0x03
1516	movdqa	%xmm3,(%rcx)
1517
1518	movdqa	%xmm0,%xmm4
1519	#palignr	$0x3,%xmm2,%xmm0
1520	.byte	0x66,0x0f,0x3a,0x0f
1521	.byte	0xc2,0x03
1522	movdqa	%xmm0,0x10(%rcx)
1523
1524	movdqa	%xmm5,%xmm1
1525	#palignr	$0x3,%xmm4,%xmm5
1526	.byte	0x66,0x0f,0x3a,0x0f
1527	.byte	0xec,0x03
1528	movdqa	%xmm5,0x20(%rcx)
1529
1530	lea	0x30(%rcx),%rcx
1531	jge	L(mov3dqa3)
1532
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1533	cmp	$0x10,%r8
1534	jl	L(movdqa_epi)
1535	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1536	sub	$0x10,%r8
1537	lea	0x10(%rdx),%rdx
1538	movdqa	%xmm3,%xmm2		# save for use next concat
1539	#palignr	$0x3,%xmm1,%xmm3
1540	.byte	0x66,0x0f,0x3a,0x0f
1541	.byte	0xd9,0x03
1542
1543	cmp	$0x10,%r8
1544	movdqa	%xmm3,(%rcx)      	# store it
1545	lea	0x10(%rcx),%rcx
1546	jl	L(movdqa_epi)
1547
1548	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1549	sub	$0x10,%r8
1550	lea	0x10(%rdx),%rdx
1551	#palignr	$0x3,%xmm2,%xmm0
1552	.byte	0x66,0x0f,0x3a,0x0f
1553	.byte	0xc2,0x03
1554	movdqa	%xmm0,(%rcx)      	# store it
1555	lea	0x10(%rcx),%rcx
1556	jmp	L(movdqa_epi)
1557
1558	.balign 16
1559L(mov3dqa4):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 4.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1560	movdqa	0x10(%rdx),%xmm3
1561	sub	$0x30,%r8
1562	movdqa	0x20(%rdx),%xmm0
1563	movdqa	0x30(%rdx),%xmm5
1564	lea	0x30(%rdx),%rdx
1565	cmp	$0x30,%r8
1566
1567	movdqa	%xmm3,%xmm2
1568	#palignr	$0x4,%xmm1,%xmm3
1569	.byte	0x66,0x0f,0x3a,0x0f
1570	.byte	0xd9,0x04
1571	movdqa	%xmm3,(%rcx)
1572
1573	movdqa	%xmm0,%xmm4
1574	#palignr	$0x4,%xmm2,%xmm0
1575	.byte	0x66,0x0f,0x3a,0x0f
1576	.byte	0xc2,0x04
1577	movdqa	%xmm0,0x10(%rcx)
1578
1579	movdqa	%xmm5,%xmm1
1580	#palignr	$0x4,%xmm4,%xmm5
1581	.byte	0x66,0x0f,0x3a,0x0f
1582	.byte	0xec,0x04
1583	movdqa	%xmm5,0x20(%rcx)
1584
1585	lea	0x30(%rcx),%rcx
1586	jge	L(mov3dqa4)
1587
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1588	cmp	$0x10,%r8
1589	jl	L(movdqa_epi)
1590	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1591	sub	$0x10,%r8
1592	lea	0x10(%rdx),%rdx
1593	movdqa	%xmm3,%xmm2		# save for use next concat
1594	#palignr	$0x4,%xmm1,%xmm3
1595	.byte	0x66,0x0f,0x3a,0x0f
1596	.byte	0xd9,0x04
1597
1598	cmp	$0x10,%r8
1599	movdqa	%xmm3,(%rcx)      	# store it
1600	lea	0x10(%rcx),%rcx
1601	jl	L(movdqa_epi)
1602
1603	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1604	sub	$0x10,%r8
1605	lea	0x10(%rdx),%rdx
1606	#palignr	$0x4,%xmm2,%xmm0
1607	.byte	0x66,0x0f,0x3a,0x0f
1608	.byte	0xc2,0x04
1609	movdqa	%xmm0,(%rcx)      	# store it
1610	lea	0x10(%rcx),%rcx
1611	jmp	L(movdqa_epi)
1612
1613	.balign 16
1614L(mov3dqa5):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 5.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1615	movdqa	0x10(%rdx),%xmm3
1616	sub	$0x30,%r8
1617	movdqa	0x20(%rdx),%xmm0
1618	movdqa	0x30(%rdx),%xmm5
1619	lea	0x30(%rdx),%rdx
1620	cmp	$0x30,%r8
1621
1622	movdqa	%xmm3,%xmm2
1623	#palignr	$0x5,%xmm1,%xmm3
1624	.byte	0x66,0x0f,0x3a,0x0f
1625	.byte	0xd9,0x05
1626	movdqa	%xmm3,(%rcx)
1627
1628	movdqa	%xmm0,%xmm4
1629	#palignr	$0x5,%xmm2,%xmm0
1630	.byte	0x66,0x0f,0x3a,0x0f
1631	.byte	0xc2,0x05
1632	movdqa	%xmm0,0x10(%rcx)
1633
1634	movdqa	%xmm5,%xmm1
1635	#palignr	$0x5,%xmm4,%xmm5
1636	.byte	0x66,0x0f,0x3a,0x0f
1637	.byte	0xec,0x05
1638	movdqa	%xmm5,0x20(%rcx)
1639
1640	lea	0x30(%rcx),%rcx
1641	jge	L(mov3dqa5)
1642
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1643	cmp	$0x10,%r8
1644	jl	L(movdqa_epi)
1645	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1646	sub	$0x10,%r8
1647	lea	0x10(%rdx),%rdx
1648	movdqa	%xmm3,%xmm2		# save for use next concat
1649	#palignr	$0x5,%xmm1,%xmm3
1650	.byte	0x66,0x0f,0x3a,0x0f
1651	.byte	0xd9,0x05
1652
1653	cmp	$0x10,%r8
1654	movdqa	%xmm3,(%rcx)      	# store it
1655	lea	0x10(%rcx),%rcx
1656	jl	L(movdqa_epi)
1657
1658	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1659	sub	$0x10,%r8
1660	lea	0x10(%rdx),%rdx
1661	#palignr	$0x5,%xmm2,%xmm0
1662	.byte	0x66,0x0f,0x3a,0x0f
1663	.byte	0xc2,0x05
1664	movdqa	%xmm0,(%rcx)      	# store it
1665	lea	0x10(%rcx),%rcx
1666	jmp	L(movdqa_epi)
1667
1668	.balign 16
1669L(mov3dqa6):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 6.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1670	movdqa	0x10(%rdx),%xmm3
1671	sub	$0x30,%r8
1672	movdqa	0x20(%rdx),%xmm0
1673	movdqa	0x30(%rdx),%xmm5
1674	lea	0x30(%rdx),%rdx
1675	cmp	$0x30,%r8
1676
1677	movdqa	%xmm3,%xmm2
1678	#palignr	$0x6,%xmm1,%xmm3
1679	.byte	0x66,0x0f,0x3a,0x0f
1680	.byte	0xd9,0x06
1681	movdqa	%xmm3,(%rcx)
1682
1683	movdqa	%xmm0,%xmm4
1684	#palignr	$0x6,%xmm2,%xmm0
1685	.byte	0x66,0x0f,0x3a,0x0f
1686	.byte	0xc2,0x06
1687	movdqa	%xmm0,0x10(%rcx)
1688
1689	movdqa	%xmm5,%xmm1
1690	#palignr	$0x6,%xmm4,%xmm5
1691	.byte	0x66,0x0f,0x3a,0x0f
1692	.byte	0xec,0x06
1693	movdqa	%xmm5,0x20(%rcx)
1694
1695	lea	0x30(%rcx),%rcx
1696	jge	L(mov3dqa6)
1697
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1698	cmp	$0x10,%r8
1699	jl	L(movdqa_epi)
1700	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1701	sub	$0x10,%r8
1702	lea	0x10(%rdx),%rdx
1703	movdqa	%xmm3,%xmm2		# save for use next concat
1704	#palignr	$0x6,%xmm1,%xmm3
1705	.byte	0x66,0x0f,0x3a,0x0f
1706	.byte	0xd9,0x06
1707
1708	cmp	$0x10,%r8
1709	movdqa	%xmm3,(%rcx)      	# store it
1710	lea	0x10(%rcx),%rcx
1711	jl	L(movdqa_epi)
1712
1713	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1714	sub	$0x10,%r8
1715	lea	0x10(%rdx),%rdx
1716	#palignr	$0x6,%xmm2,%xmm0
1717	.byte	0x66,0x0f,0x3a,0x0f
1718	.byte	0xc2,0x06
1719	movdqa	%xmm0,(%rcx)      	# store it
1720	lea	0x10(%rcx),%rcx
1721	jmp	L(movdqa_epi)
1722
1723	.balign 16
1724L(mov3dqa7):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 7.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1725	movdqa	0x10(%rdx),%xmm3
1726	sub	$0x30,%r8
1727	movdqa	0x20(%rdx),%xmm0
1728	movdqa	0x30(%rdx),%xmm5
1729	lea	0x30(%rdx),%rdx
1730	cmp	$0x30,%r8
1731
1732	movdqa	%xmm3,%xmm2
1733	#palignr	$0x7,%xmm1,%xmm3
1734	.byte	0x66,0x0f,0x3a,0x0f
1735	.byte	0xd9,0x07
1736	movdqa	%xmm3,(%rcx)
1737
1738	movdqa	%xmm0,%xmm4
1739	#palignr	$0x7,%xmm2,%xmm0
1740	.byte	0x66,0x0f,0x3a,0x0f
1741	.byte	0xc2,0x07
1742	movdqa	%xmm0,0x10(%rcx)
1743
1744	movdqa	%xmm5,%xmm1
1745	#palignr	$0x7,%xmm4,%xmm5
1746	.byte	0x66,0x0f,0x3a,0x0f
1747	.byte	0xec,0x07
1748	movdqa	%xmm5,0x20(%rcx)
1749
1750	lea	0x30(%rcx),%rcx
1751	jge	L(mov3dqa7)
1752
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1753	cmp	$0x10,%r8
1754	jl	L(movdqa_epi)
1755	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1756	sub	$0x10,%r8
1757	lea	0x10(%rdx),%rdx
1758	movdqa	%xmm3,%xmm2		# save for use next concat
1759	#palignr	$0x7,%xmm1,%xmm3
1760	.byte	0x66,0x0f,0x3a,0x0f
1761	.byte	0xd9,0x07
1762
1763	cmp	$0x10,%r8
1764	movdqa	%xmm3,(%rcx)      	# store it
1765	lea	0x10(%rcx),%rcx
1766	jl	L(movdqa_epi)
1767
1768	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1769	sub	$0x10,%r8
1770	lea	0x10(%rdx),%rdx
1771	#palignr	$0x7,%xmm2,%xmm0
1772	.byte	0x66,0x0f,0x3a,0x0f
1773	.byte	0xc2,0x07
1774	movdqa	%xmm0,(%rcx)      	# store it
1775	lea	0x10(%rcx),%rcx
1776	jmp	L(movdqa_epi)
1777
1778	.balign 16
1779L(mov3dqa9):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 9.  palignr is emitted as raw .byte sequences;
	 * the intended instruction is shown in the comment above each pair.
	 * %xmm1 carries the last source vector loaded by the previous
	 * iteration; %xmm2 and %xmm4 hold unshifted copies needed by the
	 * following merge.  After the loop, up to two further 16-byte vectors
	 * are merged and stored while %r8 >= 0x10, then control goes to
	 * L(movdqa_epi).
	 */
1780	movdqa	0x10(%rdx),%xmm3
1781	sub	$0x30,%r8
1782	movdqa	0x20(%rdx),%xmm0
1783	movdqa	0x30(%rdx),%xmm5
1784	lea	0x30(%rdx),%rdx
1785	cmp	$0x30,%r8
1786
1787	movdqa	%xmm3,%xmm2
1788	#palignr	$0x9,%xmm1,%xmm3
1789	.byte	0x66,0x0f,0x3a,0x0f
1790	.byte	0xd9,0x09
1791	movdqa	%xmm3,(%rcx)
1792
1793	movdqa	%xmm0,%xmm4
1794	#palignr	$0x9,%xmm2,%xmm0
1795	.byte	0x66,0x0f,0x3a,0x0f
1796	.byte	0xc2,0x09
1797	movdqa	%xmm0,0x10(%rcx)
1798
1799	movdqa	%xmm5,%xmm1
1800	#palignr	$0x9,%xmm4,%xmm5
1801	.byte	0x66,0x0f,0x3a,0x0f
1802	.byte	0xec,0x09
1803	movdqa	%xmm5,0x20(%rcx)
1804
1805	lea	0x30(%rcx),%rcx
1806	jge	L(mov3dqa9)
1807
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1808	cmp	$0x10,%r8
1809	jl	L(movdqa_epi)
1810	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1811	sub	$0x10,%r8
1812	lea	0x10(%rdx),%rdx
1813	movdqa	%xmm3,%xmm2		# save for use next concat
1814	#palignr	$0x9,%xmm1,%xmm3
1815	.byte	0x66,0x0f,0x3a,0x0f
1816	.byte	0xd9,0x09
1817
1818	cmp	$0x10,%r8
1819	movdqa	%xmm3,(%rcx)      	# store it
1820	lea	0x10(%rcx),%rcx
1821	jl	L(movdqa_epi)
1822
1823	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1824	sub	$0x10,%r8
1825	lea	0x10(%rdx),%rdx
1826	#palignr	$0x9,%xmm2,%xmm0
1827	.byte	0x66,0x0f,0x3a,0x0f
1828	.byte	0xc2,0x09
1829	movdqa	%xmm0,(%rcx)      	# store it
1830	lea	0x10(%rcx),%rcx
1831	jmp	L(movdqa_epi)
1832
1833	.balign 16
1834L(mov3dqa10):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 10.  palignr is emitted as raw .byte
	 * sequences; the intended instruction is shown in the comment above
	 * each pair.  %xmm1 carries the last source vector loaded by the
	 * previous iteration; %xmm2 and %xmm4 hold unshifted copies needed by
	 * the following merge.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
1835	movdqa	0x10(%rdx),%xmm3
1836	sub	$0x30,%r8
1837	movdqa	0x20(%rdx),%xmm0
1838	movdqa	0x30(%rdx),%xmm5
1839	lea	0x30(%rdx),%rdx
1840	cmp	$0x30,%r8
1841
1842	movdqa	%xmm3,%xmm2
1843	#palignr	$0xa,%xmm1,%xmm3
1844	.byte	0x66,0x0f,0x3a,0x0f
1845	.byte	0xd9,0x0a
1846	movdqa	%xmm3,(%rcx)
1847
1848	movdqa	%xmm0,%xmm4
1849	#palignr	$0xa,%xmm2,%xmm0
1850	.byte	0x66,0x0f,0x3a,0x0f
1851	.byte	0xc2,0x0a
1852	movdqa	%xmm0,0x10(%rcx)
1853
1854	movdqa	%xmm5,%xmm1
1855	#palignr	$0xa,%xmm4,%xmm5
1856	.byte	0x66,0x0f,0x3a,0x0f
1857	.byte	0xec,0x0a
1858	movdqa	%xmm5,0x20(%rcx)
1859
1860	lea	0x30(%rcx),%rcx
1861	jge	L(mov3dqa10)
1862
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1863	cmp	$0x10,%r8
1864	jl	L(movdqa_epi)
1865	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1866	sub	$0x10,%r8
1867	lea	0x10(%rdx),%rdx
1868	movdqa	%xmm3,%xmm2		# save for use next concat
1869	#palignr	$0xa,%xmm1,%xmm3
1870	.byte	0x66,0x0f,0x3a,0x0f
1871	.byte	0xd9,0x0a
1872
1873	cmp	$0x10,%r8
1874	movdqa	%xmm3,(%rcx)      	# store it
1875	lea	0x10(%rcx),%rcx
1876	jl	L(movdqa_epi)
1877
1878	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1879	sub	$0x10,%r8
1880	lea	0x10(%rdx),%rdx
1881	#palignr	$0xa,%xmm2,%xmm0
1882	.byte	0x66,0x0f,0x3a,0x0f
1883	.byte	0xc2,0x0a
1884	movdqa	%xmm0,(%rcx)      	# store it
1885	lea	0x10(%rcx),%rcx
1886	jmp	L(movdqa_epi)
1887
1888	.balign 16
1889L(mov3dqa11):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 11.  palignr is emitted as raw .byte
	 * sequences; the intended instruction is shown in the comment above
	 * each pair.  %xmm1 carries the last source vector loaded by the
	 * previous iteration; %xmm2 and %xmm4 hold unshifted copies needed by
	 * the following merge.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
1890	movdqa	0x10(%rdx),%xmm3
1891	sub	$0x30,%r8
1892	movdqa	0x20(%rdx),%xmm0
1893	movdqa	0x30(%rdx),%xmm5
1894	lea	0x30(%rdx),%rdx
1895	cmp	$0x30,%r8
1896
1897	movdqa	%xmm3,%xmm2
1898	#palignr	$0xb,%xmm1,%xmm3
1899	.byte	0x66,0x0f,0x3a,0x0f
1900	.byte	0xd9,0x0b
1901	movdqa	%xmm3,(%rcx)
1902
1903	movdqa	%xmm0,%xmm4
1904	#palignr	$0xb,%xmm2,%xmm0
1905	.byte	0x66,0x0f,0x3a,0x0f
1906	.byte	0xc2,0x0b
1907	movdqa	%xmm0,0x10(%rcx)
1908
1909	movdqa	%xmm5,%xmm1
1910	#palignr	$0xb,%xmm4,%xmm5
1911	.byte	0x66,0x0f,0x3a,0x0f
1912	.byte	0xec,0x0b
1913	movdqa	%xmm5,0x20(%rcx)
1914
1915	lea	0x30(%rcx),%rcx
1916	jge	L(mov3dqa11)
1917
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1918	cmp	$0x10,%r8
1919	jl	L(movdqa_epi)
1920	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1921	sub	$0x10,%r8
1922	lea	0x10(%rdx),%rdx
1923	movdqa	%xmm3,%xmm2		# save for use next concat
1924	#palignr	$0xb,%xmm1,%xmm3
1925	.byte	0x66,0x0f,0x3a,0x0f
1926	.byte	0xd9,0x0b
1927
1928	cmp	$0x10,%r8
1929	movdqa	%xmm3,(%rcx)      	# store it
1930	lea	0x10(%rcx),%rcx
1931	jl	L(movdqa_epi)
1932
1933	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1934	sub	$0x10,%r8
1935	lea	0x10(%rdx),%rdx
1936	#palignr	$0xb,%xmm2,%xmm0
1937	.byte	0x66,0x0f,0x3a,0x0f
1938	.byte	0xc2,0x0b
1939	movdqa	%xmm0,(%rcx)      	# store it
1940	lea	0x10(%rcx),%rcx
1941	jmp	L(movdqa_epi)
1942
1943	.balign 16
1944L(mov3dqa12):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 12.  palignr is emitted as raw .byte
	 * sequences; the intended instruction is shown in the comment above
	 * each pair.  %xmm1 carries the last source vector loaded by the
	 * previous iteration; %xmm2 and %xmm4 hold unshifted copies needed by
	 * the following merge.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
1945	movdqa	0x10(%rdx),%xmm3
1946	sub	$0x30,%r8
1947	movdqa	0x20(%rdx),%xmm0
1948	movdqa	0x30(%rdx),%xmm5
1949	lea	0x30(%rdx),%rdx
1950	cmp	$0x30,%r8
1951
1952	movdqa	%xmm3,%xmm2
1953	#palignr	$0xc,%xmm1,%xmm3
1954	.byte	0x66,0x0f,0x3a,0x0f
1955	.byte	0xd9,0x0c
1956	movdqa	%xmm3,(%rcx)
1957
1958	movdqa	%xmm0,%xmm4
1959	#palignr	$0xc,%xmm2,%xmm0
1960	.byte	0x66,0x0f,0x3a,0x0f
1961	.byte	0xc2,0x0c
1962	movdqa	%xmm0,0x10(%rcx)
1963
1964	movdqa	%xmm5,%xmm1
1965	#palignr	$0xc,%xmm4,%xmm5
1966	.byte	0x66,0x0f,0x3a,0x0f
1967	.byte	0xec,0x0c
1968	movdqa	%xmm5,0x20(%rcx)
1969
1970	lea	0x30(%rcx),%rcx
1971	jge	L(mov3dqa12)
1972
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
1973	cmp	$0x10,%r8
1974	jl	L(movdqa_epi)
1975	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1976	sub	$0x10,%r8
1977	lea	0x10(%rdx),%rdx
1978	movdqa	%xmm3,%xmm2		# save for use next concat
1979	#palignr	$0xc,%xmm1,%xmm3
1980	.byte	0x66,0x0f,0x3a,0x0f
1981	.byte	0xd9,0x0c
1982
1983	cmp	$0x10,%r8
1984	movdqa	%xmm3,(%rcx)      	# store it
1985	lea	0x10(%rcx),%rcx
1986	jl	L(movdqa_epi)
1987
1988	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1989	sub	$0x10,%r8
1990	lea	0x10(%rdx),%rdx
1991	#palignr	$0xc,%xmm2,%xmm0
1992	.byte	0x66,0x0f,0x3a,0x0f
1993	.byte	0xc2,0x0c
1994	movdqa	%xmm0,(%rcx)      	# store it
1995	lea	0x10(%rcx),%rcx
1996	jmp	L(movdqa_epi)
1997
1998	.balign 16
1999L(mov3dqa13):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 13.  palignr is emitted as raw .byte
	 * sequences; the intended instruction is shown in the comment above
	 * each pair.  %xmm1 carries the last source vector loaded by the
	 * previous iteration; %xmm2 and %xmm4 hold unshifted copies needed by
	 * the following merge.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
2000	movdqa	0x10(%rdx),%xmm3
2001	sub	$0x30,%r8
2002	movdqa	0x20(%rdx),%xmm0
2003	movdqa	0x30(%rdx),%xmm5
2004	lea	0x30(%rdx),%rdx
2005	cmp	$0x30,%r8
2006
2007	movdqa	%xmm3,%xmm2
2008	#palignr	$0xd,%xmm1,%xmm3
2009	.byte	0x66,0x0f,0x3a,0x0f
2010	.byte	0xd9,0x0d
2011	movdqa	%xmm3,(%rcx)
2012
2013	movdqa	%xmm0,%xmm4
2014	#palignr	$0xd,%xmm2,%xmm0
2015	.byte	0x66,0x0f,0x3a,0x0f
2016	.byte	0xc2,0x0d
2017	movdqa	%xmm0,0x10(%rcx)
2018
2019	movdqa	%xmm5,%xmm1
2020	#palignr	$0xd,%xmm4,%xmm5
2021	.byte	0x66,0x0f,0x3a,0x0f
2022	.byte	0xec,0x0d
2023	movdqa	%xmm5,0x20(%rcx)
2024
2025	lea	0x30(%rcx),%rcx
2026	jge	L(mov3dqa13)
2027
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
2028	cmp	$0x10,%r8
2029	jl	L(movdqa_epi)
2030	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2031	sub	$0x10,%r8
2032	lea	0x10(%rdx),%rdx
2033	movdqa	%xmm3,%xmm2		# save for use next concat
2034	#palignr	$0xd,%xmm1,%xmm3
2035	.byte	0x66,0x0f,0x3a,0x0f
2036	.byte	0xd9,0x0d
2037
2038	cmp	$0x10,%r8
2039	movdqa	%xmm3,(%rcx)      	# store it
2040	lea	0x10(%rcx),%rcx
2041	jl	L(movdqa_epi)
2042
2043	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2044	sub	$0x10,%r8
2045	lea	0x10(%rdx),%rdx
2046	#palignr	$0xd,%xmm2,%xmm0
2047	.byte	0x66,0x0f,0x3a,0x0f
2048	.byte	0xc2,0x0d
2049	movdqa	%xmm0,(%rcx)      	# store it
2050	lea	0x10(%rcx),%rcx
2051	jmp	L(movdqa_epi)
2052
2053	.balign 16
2054L(mov3dqa14):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 14.  palignr is emitted as raw .byte
	 * sequences; the intended instruction is shown in the comment above
	 * each pair.  %xmm1 carries the last source vector loaded by the
	 * previous iteration; %xmm2 and %xmm4 hold unshifted copies needed by
	 * the following merge.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
2055	movdqa	0x10(%rdx),%xmm3
2056	sub	$0x30,%r8
2057	movdqa	0x20(%rdx),%xmm0
2058	movdqa	0x30(%rdx),%xmm5
2059	lea	0x30(%rdx),%rdx
2060	cmp	$0x30,%r8
2061
2062	movdqa	%xmm3,%xmm2
2063	#palignr	$0xe,%xmm1,%xmm3
2064	.byte	0x66,0x0f,0x3a,0x0f
2065	.byte	0xd9,0x0e
2066	movdqa	%xmm3,(%rcx)
2067
2068	movdqa	%xmm0,%xmm4
2069	#palignr	$0xe,%xmm2,%xmm0
2070	.byte	0x66,0x0f,0x3a,0x0f
2071	.byte	0xc2,0x0e
2072	movdqa	%xmm0,0x10(%rcx)
2073
2074	movdqa	%xmm5,%xmm1
2075	#palignr	$0xe,%xmm4,%xmm5
2076	.byte	0x66,0x0f,0x3a,0x0f
2077	.byte	0xec,0x0e
2078	movdqa	%xmm5,0x20(%rcx)
2079
2080	lea	0x30(%rcx),%rcx
2081	jge	L(mov3dqa14)
2082
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
2083	cmp	$0x10,%r8
2084	jl	L(movdqa_epi)
2085	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2086	sub	$0x10,%r8
2087	lea	0x10(%rdx),%rdx
2088	movdqa	%xmm3,%xmm2		# save for use next concat
2089	#palignr	$0xe,%xmm1,%xmm3
2090	.byte	0x66,0x0f,0x3a,0x0f
2091	.byte	0xd9,0x0e
2092
2093	cmp	$0x10,%r8
2094	movdqa	%xmm3,(%rcx)      	# store it
2095	lea	0x10(%rcx),%rcx
2096	jl	L(movdqa_epi)
2097
2098	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2099	sub	$0x10,%r8
2100	lea	0x10(%rdx),%rdx
2101	#palignr	$0xe,%xmm2,%xmm0
2102	.byte	0x66,0x0f,0x3a,0x0f
2103	.byte	0xc2,0x0e
2104	movdqa	%xmm0,(%rcx)      	# store it
2105	lea	0x10(%rcx),%rcx
2106	jmp	L(movdqa_epi)
2107
2108	.balign 16
2109L(mov3dqa15):
	/*
	 * Three-vector merge loop (0x30 bytes per iteration) using palignr
	 * with a byte shift of 15.  palignr is emitted as raw .byte
	 * sequences; the intended instruction is shown in the comment above
	 * each pair.  %xmm1 carries the last source vector loaded by the
	 * previous iteration; %xmm2 and %xmm4 hold unshifted copies needed by
	 * the following merge.  After the loop, up to two further 16-byte
	 * vectors are merged and stored while %r8 >= 0x10, then control goes
	 * to L(movdqa_epi).
	 */
2110	movdqa	0x10(%rdx),%xmm3
2111	sub	$0x30,%r8
2112	movdqa	0x20(%rdx),%xmm0
2113	movdqa	0x30(%rdx),%xmm5
2114	lea	0x30(%rdx),%rdx
2115	cmp	$0x30,%r8
2116
2117	movdqa	%xmm3,%xmm2
2118	#palignr	$0xf,%xmm1,%xmm3
2119	.byte	0x66,0x0f,0x3a,0x0f
2120	.byte	0xd9,0x0f
2121	movdqa	%xmm3,(%rcx)
2122
2123	movdqa	%xmm0,%xmm4
2124	#palignr	$0xf,%xmm2,%xmm0
2125	.byte	0x66,0x0f,0x3a,0x0f
2126	.byte	0xc2,0x0f
2127	movdqa	%xmm0,0x10(%rcx)
2128
2129	movdqa	%xmm5,%xmm1
2130	#palignr	$0xf,%xmm4,%xmm5
2131	.byte	0x66,0x0f,0x3a,0x0f
2132	.byte	0xec,0x0f
2133	movdqa	%xmm5,0x20(%rcx)
2134
2135	lea	0x30(%rcx),%rcx
2136	jge	L(mov3dqa15)
2137
	# tail: fewer than 0x30 bytes left; handle up to two more vectors
2138	cmp	$0x10,%r8
2139	jl	L(movdqa_epi)
2140	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2141	sub	$0x10,%r8
2142	lea	0x10(%rdx),%rdx
2143	movdqa	%xmm3,%xmm2		# save for use next concat
2144	#palignr	$0xf,%xmm1,%xmm3
2145	.byte	0x66,0x0f,0x3a,0x0f
2146	.byte	0xd9,0x0f
2147
2148	cmp	$0x10,%r8
2149	movdqa	%xmm3,(%rcx)      	# store it
2150	lea	0x10(%rcx),%rcx
2151	jl	L(movdqa_epi)
2152
2153	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2154	sub	$0x10,%r8
2155	lea	0x10(%rdx),%rdx
2156	#palignr	$0xf,%xmm2,%xmm0
2157	.byte	0x66,0x0f,0x3a,0x0f
2158	.byte	0xc2,0x0f
2159	movdqa	%xmm0,(%rcx)      	# store it
2160	lea	0x10(%rcx),%rcx
2161	jmp	L(movdqa_epi)
2162
2163	.balign 16
2164L(sse2_nt_move):
	/*
	 * Non-temporal copy loop: 0x40 bytes per iteration.  Both pointers
	 * are advanced first and the data is then addressed with negative
	 * offsets.  Loads use movdqu, stores use movntdq, and the source is
	 * prefetched 0x180 bytes ahead with prefetchnta.  The cmp result is
	 * consumed by the jge at the bottom of the loop.
	 */
2165	lea	0x40(%rcx),%rcx
2166	lea	0x40(%rdx),%rdx
2167	lea	-0x40(%r8),%r8
2168
2169	/*
2170	 * doesn't matter if source is aligned for stuff out of cache.
2171	 * the mis-aligned penalty is masked by the slowness of main memory.
2172	 */
2173	prefetchnta 0x180(%rdx)
2174	movdqu	-0x40(%rdx),%xmm0
2175	movdqu	-0x30(%rdx),%xmm1
2176
2177	cmp	$0x40,%r8
2178	movntdq	%xmm0,-0x40(%rcx)
2179	movntdq	%xmm1,-0x30(%rcx)
2180
2181	movdqu	-0x20(%rdx),%xmm2
2182	movdqu	-0x10(%rdx),%xmm3
2183
2184	movntdq	%xmm2,-0x20(%rcx)
2185	movntdq	%xmm3,-0x10(%rcx)
2186
2187	jge	L(sse2_nt_move)
2188
	/*
	 * Fewer than 0x40 bytes remain.  %r9 = %r8 rounded down to a
	 * multiple of 16; step both pointers past those bytes, leave the
	 * sub-16 remainder in %r8, and turn %r9 into a count of 16-byte
	 * chunks that indexes L(Fix16EndTable).  The sfence is issued after
	 * the non-temporal stores above and before the ordinary stores that
	 * follow.
	 */
2189	lea	L(Fix16EndTable)(%rip),%r10
2190	mov	%r8,%r9
2191	and	$0xFFFFFFFFFFFFFFF0,%r9
2192	add	%r9,%rcx
2193	add	%r9,%rdx
2194	sub	%r9,%r8
2195	shr	$0x4,%r9
2196	sfence
2197
2198	movslq	(%r10,%r9,4),%r11
2199	lea	(%r11,%r10,1),%r10
2200	jmpq	*%r10
2201
2202	.balign 16
2203L(Fix16EndTable):
	/*
	 * Jump table used by L(sse2_nt_move): entry N is the 32-bit offset,
	 * relative to the table base, of the code that copies N trailing
	 * 16-byte chunks.
	 */
2204	.int    L(fix16_0)-L(Fix16EndTable)
2205	.int    L(fix16_1)-L(Fix16EndTable)
2206	.int    L(fix16_2)-L(Fix16EndTable)
2207	.int    L(fix16_3)-L(Fix16EndTable)
2208
2209	.balign 16
2210L(fix16_3):
	/*
	 * Fall-through ladder reached via L(Fix16EndTable).  Entering at
	 * L(fix16_N) copies the N 16-byte chunks that sit just below the
	 * already-advanced pointers (unaligned load, aligned store).
	 * L(fix16_0) then advances both pointers by the remaining length
	 * %r8 and dispatches through L(fwdPxQx), a table of 32-bit offsets
	 * relative to its own base (defined outside this chunk).
	 */
2211	movdqu -0x30(%rdx),%xmm1
2212	movdqa %xmm1,-0x30(%rcx)
2213L(fix16_2):
2214	movdqu -0x20(%rdx),%xmm2
2215	movdqa %xmm2,-0x20(%rcx)
2216L(fix16_1):
2217	movdqu -0x10(%rdx),%xmm3
2218	movdqa %xmm3,-0x10(%rcx)
2219L(fix16_0):
2220	lea    L(fwdPxQx)(%rip),%r10
2221	add    %r8,%rdx
2222	add    %r8,%rcx
2223
2224	movslq (%r10,%r8,4),%r9
2225	lea    (%r9,%r10,1),%r10
2226	jmpq   *%r10
2227
2228	.balign 16
2229L(pre_both_aligned):
	/*
	 * Guard in front of L(both_aligned): with fewer than 0x80 bytes left
	 * (signed compare) skip the 128-byte loop and go straight to the
	 * tail dispatch at L(fix_16b).  Otherwise fall through into the loop.
	 */
2230	cmp    $0x80,%r8
2231	jl     L(fix_16b)
2232
2233	.balign 16
2234L(both_aligned):
	/*
	 * Copy loop using movdqa for both loads and stores: 0x80 bytes per
	 * iteration, issued as load/load/store/store pairs.  %r8 is reduced
	 * by 0x80 each pass; the cmp result is consumed by the jge at the
	 * bottom, so the loop repeats while %r8 >= 0x80.
	 */
2235
2236	/*
2237	 * this 'paired' load/load/store/store seems to do best.
2238	 */
2239	movdqa (%rdx),%xmm0
2240	movdqa 0x10(%rdx),%xmm1
2241
2242	movdqa %xmm0,(%rcx)
2243	movdqa %xmm1,0x10(%rcx)
2244	lea    -0x80(%r8),%r8
2245
2246	movdqa 0x20(%rdx),%xmm2
2247	movdqa 0x30(%rdx),%xmm3
2248
2249	movdqa %xmm2,0x20(%rcx)
2250	movdqa %xmm3,0x30(%rcx)
2251
2252	movdqa 0x40(%rdx),%xmm0
2253	movdqa 0x50(%rdx),%xmm1
2254	cmp    $0x80,%r8
2255
2256	movdqa %xmm0,0x40(%rcx)
2257	movdqa %xmm1,0x50(%rcx)
2258
2259	movdqa 0x60(%rdx),%xmm2
2260	movdqa 0x70(%rdx),%xmm3
2261	lea    0x80(%rdx),%rdx
2262	movdqa %xmm2,0x60(%rcx)
2263	movdqa %xmm3,0x70(%rcx)
2264	lea    0x80(%rcx),%rcx
2265	jge    L(both_aligned)
2266
2267L(fix_16b):
	/*
	 * Tail: advance both pointers by the remaining length %r8 and
	 * dispatch through L(fwdPxQx), indexed by %r8 (table and targets
	 * are outside this chunk).
	 */
2268	add    %r8,%rcx
2269	lea    L(fwdPxQx)(%rip),%r10
2270	add    %r8,%rdx
2271
2272	movslq (%r10,%r8,4),%r9
2273	lea    (%r9,%r10,1),%r10
2274	jmpq   *%r10
2275
2276	.balign 16
2277L(Loop8byte_pre):
	/*
	 * Strategy selection for the 8-byte (non-SSE) copy paths, based on
	 * the remaining length %r8 (signed compares):
	 *   %r8 >= .largest_level_cache_size / 2  -> L(byte8_nt_top)
	 *   %r8 <= 4096                           -> L(byte8_top)
	 *   %r8 <= .amd64cache1half               -> L(use_rep)
	 *   otherwise                             -> fall into L(byte8_top)
	 * Both cache-size variables are 32-bit loads of symbols defined
	 * outside this chunk.
	 */
2278	# Use 8-byte moves
2279	mov    .largest_level_cache_size(%rip),%r9d
2280	shr    %r9		# take half of it
2281	cmp    %r9,%r8
2282	jge    L(byte8_nt_top)
2283	# Find out whether to use rep movsq
2284	cmp    $4096,%r8
2285	jle    L(byte8_top)
2286	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2287	cmp    %r9,%r8
2288	jle    L(use_rep)
2289
2290	.balign     16
2291L(byte8_top):
	/*
	 * Copy loop built from 8-byte integer moves: 0x40 bytes per
	 * iteration, rotating through %r9/%r10/%r11 as temporaries.  The cmp
	 * result is consumed by the jg at the bottom (only mov and lea sit
	 * in between), so the loop repeats while %r8 > 0x40 -- note the
	 * strict comparison, unlike the >= used by the SSE loops.
	 */
2292	mov    (%rdx),%r9
2293	mov    0x8(%rdx),%r10
2294	lea    -0x40(%r8),%r8
2295	mov    %r9,(%rcx)
2296	mov    %r10,0x8(%rcx)
2297	mov    0x10(%rdx),%r11
2298	mov    0x18(%rdx),%r9
2299	mov    %r11,0x10(%rcx)
2300	mov    %r9,0x18(%rcx)
2301
2302	cmp    $0x40,%r8
2303	mov    0x20(%rdx),%r10
2304	mov    0x28(%rdx),%r11
2305	mov    %r10,0x20(%rcx)
2306	mov    %r11,0x28(%rcx)
2307	mov    0x30(%rdx),%r9
2308	mov    0x38(%rdx),%r10
2309	lea    0x40(%rdx),%rdx
2310	mov    %r9,0x30(%rcx)
2311	mov    %r10,0x38(%rcx)
2312	lea    0x40(%rcx),%rcx
2313	jg     L(byte8_top)
2314
2315L(byte8_end):
	/*
	 * Shared tail for the 8-byte paths: advance both pointers by the
	 * remaining length %r8 and dispatch through L(fwdPxQx), indexed by
	 * %r8 (table and targets are outside this chunk).
	 */
2316	lea    L(fwdPxQx)(%rip),%r10
2317	lea    (%rdx,%r8,1),%rdx
2318	lea    (%rcx,%r8,1),%rcx
2319
2320	movslq (%r10,%r8,4),%r9
2321	lea    (%r9,%r10,1),%r10
2322	jmpq   *%r10
2323
2324	.balign	16
2325L(use_rep):
	/*
	 * Forward copy with rep movsq.  The working pointers are moved into
	 * %rsi/%rdi and %rcx receives the quadword count (%r8 / 8).  After
	 * the string move the advanced pointers are copied back to %rdx/%rcx
	 * and %r8 is reduced to the 0-7 byte remainder: if non-zero the tail
	 * is finished by L(byte8_end), otherwise the routine returns here.
	 */
2326	mov    %rdx,%rsi		# %rsi = source
2327	mov    %rcx,%rdi		# %rdi = destination
2328	mov    %r8,%rcx			# %rcx = count
2329	shrq   $3,%rcx			# 8-byte word count
2330	rep
2331	  movsq
2332	mov    %rsi,%rdx		# source
2333	mov    %rdi,%rcx		# destination
2334	andq   $7,%r8			# remainder
2335	jnz    L(byte8_end)
2336	ret
2337
2338	.balign 16
2339L(byte8_nt_top):
	/*
	 * Non-temporal 8-byte copy loop: 0x40 bytes per iteration using
	 * ordinary 8-byte loads and movnti stores, with the source
	 * prefetched 0x180 bytes ahead (prefetchnta).  Repeats while
	 * %r8 >= 0x40; an sfence follows the non-temporal stores before the
	 * tail is handed to L(byte8_end).
	 */
2340	sub    $0x40,%r8
2341	prefetchnta 0x180(%rdx)
2342	mov    (%rdx),%r9
2343	movnti %r9,(%rcx)
2344	mov    0x8(%rdx),%r10
2345	movnti %r10,0x8(%rcx)
2346	mov    0x10(%rdx),%r11
2347	movnti %r11,0x10(%rcx)
2348	mov    0x18(%rdx),%r9
2349	movnti %r9,0x18(%rcx)
2350	mov    0x20(%rdx),%r10
2351	movnti %r10,0x20(%rcx)
2352	mov    0x28(%rdx),%r11
2353	movnti %r11,0x28(%rcx)
2354	mov    0x30(%rdx),%r9
2355	movnti %r9,0x30(%rcx)
2356	mov    0x38(%rdx),%r10
2357	movnti %r10,0x38(%rcx)
2358
2359	lea    0x40(%rdx),%rdx
2360	lea    0x40(%rcx),%rcx
2361	cmp    $0x40,%r8
2362	jge    L(byte8_nt_top)
2363	sfence
2364	jmp    L(byte8_end)
2365
2366	SET_SIZE(memcpy)
2367
2368	.balign 16
2369L(CopyBackwards):
	/*
	 * Entry to the backward (high-to-low address) copy.  The incoming
	 * %rdi/%rsi/%rdx (destination, source, length under the SysV AMD64
	 * convention) are moved into the working registers used throughout
	 * this file: %rcx = destination, %rdx = source, %r8 = length.
	 * %rax receives the original destination as the return value.
	 * Both pointers are then moved to one past the end of their buffers.
	 */
2370	mov    %rdx,%r8
2371	mov    %rdi,%rcx
2372	mov    %rsi,%rdx
2373	mov    %rdi,%rax		# return value
2374
2375	# ck alignment of last byte
2376	lea    (%rcx,%r8,1),%rcx
2377	test   $0x7,%rcx
2378	lea    (%rdx,%r8,1),%rdx	# lea leaves the flags from test intact
2379	jne    L(bk_align)
2380
2381L(bk_qw_aligned):
	/*
	 * Lengths above 0x90 go on to the bulk backward paths.  Otherwise
	 * both pointers are rewound to the start of the buffers and the
	 * whole copy is done by the L(bkPxQx) target selected by %r8 (the
	 * table itself is outside this chunk).
	 */
2382	lea    L(bkPxQx)(%rip),%r10
2383
2384	cmp    $0x90,%r8		# 144
2385	jg     L(bk_ck_sse2_alignment)
2386
2387	sub    %r8,%rcx
2388	sub    %r8,%rdx
2389
2390	movslq (%r10,%r8,4),%r9
2391	lea    (%r9,%r10,1),%r10
2392	jmpq   *%r10
2393
2394	.balign 16
2395L(bk_align):
	/*
	 * Bring the destination end pointer %rcx down to an 8-byte boundary
	 * by copying 1, 2 and/or 4 trailing bytes, chosen by testing bits
	 * 0, 1 and 2 of %rcx in turn.  Each step moves both pointers down
	 * and reduces %r8 before copying.  Skipped entirely for lengths of
	 * 8 or less.  Every path ends at L(bk_qw_aligned).
	 */
2396	# only align if len > 8
2397	cmp    $8,%r8
2398	jle    L(bk_qw_aligned)
2399	test   $0x1,%rcx
2400	je     L(bk_tst2)
2401	dec    %rcx
2402	dec    %rdx
2403	dec    %r8
2404	mov    (%rdx),%r9b		# copy one byte
2405	mov    %r9b,(%rcx)
2406
2407L(bk_tst2):
2408	test   $0x2,%rcx
2409	je     L(bk_tst3)
2410
2411L(bk_got2):
2412	sub    $0x2,%rcx
2413	sub    $0x2,%rdx
2414	sub    $0x2,%r8
2415	movzwq (%rdx),%r9		# copy two bytes
2416	mov    %r9w,(%rcx)
2417
2418L(bk_tst3):
2419	test   $0x4,%rcx
2420	je     L(bk_qw_aligned)
2421
2422L(bk_got3):
2423	sub    $0x4,%rcx
2424	sub    $0x4,%rdx
2425	sub    $0x4,%r8
2426	mov    (%rdx),%r9d		# copy four bytes
2427	mov    %r9d,(%rcx)
2428	jmp    L(bk_qw_aligned)
2429
2430	.balign 16
2431L(bk_ck_sse2_alignment):
	/*
	 * Choose the bulk backward path.  When .memops_method equals NO_SSE
	 * the rep-movsq path is used.  Otherwise, if the destination end
	 * pointer %rcx is already 16-byte aligned go straight to the SSE2
	 * loop; if not, L(bk_sse2_align) copies one trailing quadword first.
	 */
2432	cmpl   $NO_SSE,.memops_method(%rip)
2433	je     L(bk_use_rep)
2434	# check alignment of last byte
2435	test   $0xf,%rcx
2436	jz     L(bk_sse2_cpy)
2437
2438L(bk_sse2_align):
2439	# only here if already aligned on at least a qword bndry
2440	sub    $0x8,%rcx
2441	sub    $0x8,%rdx
2442	sub    $0x8,%r8
2443	mov    (%rdx),%r9
2444	mov    %r9,(%rcx)
	# the jmp below is commented out: execution falls through into
	# L(bk_sse2_cpy), which follows directly
2445	#jmp   L(bk_sse2_cpy)
2446
2447	.balign 16
2448L(bk_sse2_cpy):
	/*
	 * Backward SSE2 copy loop: 0x80 bytes per iteration, highest
	 * addresses first.  Both pointers are moved down by 0x80 and the
	 * block is then addressed with positive offsets.  Loads use movdqu,
	 * stores use movdqa.  The cmp result is consumed by the jge at the
	 * bottom, so the loop repeats while %r8 >= 0x80.
	 */
2449	sub    $0x80,%rcx		# 128
2450	sub    $0x80,%rdx
2451	movdqu 0x70(%rdx),%xmm3
2452	movdqu 0x60(%rdx),%xmm2
2453	movdqa %xmm3,0x70(%rcx)
2454	movdqa %xmm2,0x60(%rcx)
2455	sub    $0x80,%r8
2456	movdqu 0x50(%rdx),%xmm1
2457	movdqu 0x40(%rdx),%xmm0
2458	movdqa %xmm1,0x50(%rcx)
2459	movdqa %xmm0,0x40(%rcx)
2460
2461	cmp    $0x80,%r8
2462	movdqu 0x30(%rdx),%xmm3
2463	movdqu 0x20(%rdx),%xmm2
2464	movdqa %xmm3,0x30(%rcx)
2465	movdqa %xmm2,0x20(%rcx)
2466	movdqu 0x10(%rdx),%xmm1
2467	movdqu (%rdx),%xmm0
2468	movdqa %xmm1,0x10(%rcx)
2469	movdqa %xmm0,(%rcx)
2470	jge    L(bk_sse2_cpy)
2471
2472L(bk_sse2_cpy_end):
	/*
	 * Tail: rewind both pointers by the remaining length %r8 so they
	 * point at the start of the buffers, then dispatch through
	 * L(bkPxQx), indexed by %r8.
	 */
2473	lea    L(bkPxQx)(%rip),%r10
2474	sub    %r8,%rdx
2475	sub    %r8,%rcx
2476	movslq (%r10,%r8,4),%r9
2477	lea    (%r9,%r10,1),%r10
2478	jmpq   *%r10
2479
2480	.balign 16
2481L(bk_use_rep):
	/*
	 * Backward copy with rep movsq and the direction flag set.
	 * %rcx is needed as the repeat count, so the destination end
	 * pointer is parked in %r9 for the duration.  %rsi/%rdi start 8
	 * bytes below the end pointers, i.e. at the last quadword of each
	 * buffer.  The direction flag is cleared again right after the
	 * string move.
	 */
2482	xchg   %rcx,%r9
2483	mov    %rdx,%rsi		# source
2484	mov    %r9,%rdi			# destination
2485	mov    %r8,%rcx			# count
2486	sub    $8,%rsi
2487	sub    $8,%rdi
2488	shr    $3,%rcx
2489	std				# reverse direction
2490	rep
2491	  movsq
2492	cld				# reset direction flag
2493
	/*
	 * Restore the destination end pointer into %rcx, rewind both
	 * pointers to the start of the buffers, and finish the 0-7 leading
	 * bytes through L(bkPxQx); return directly when none remain.
	 */
2494	xchg   %rcx,%r9
2495	lea    L(bkPxQx)(%rip),%r10
2496	sub    %r8,%rdx
2497	sub    %r8,%rcx
2498	andq   $7,%r8			# remainder
2499	jz     2f
2500	movslq (%r10,%r8,4),%r9
2501	lea    (%r9,%r10,1),%r10
2502	jmpq   *%r10
25032:
2504	ret
2505
2506	.balign 16
2507L(bkP0QI):
	/*
	 * Backward-copy tail ladder for lengths that are a whole number of
	 * quadwords.  %rdx/%rcx point at the start of the source and
	 * destination.  Entering at L(bkP0Q<k>) falls through k quadword
	 * copies, from the highest offset down to offset 0, and returns.
	 * The suffix runs 0-9 then A-I, so L(bkP0QI) copies 18 quadwords
	 * (0x90 bytes).
	 */
2508	mov    0x88(%rdx),%r10
2509	mov    %r10,0x88(%rcx)
2510L(bkP0QH):
2511	mov    0x80(%rdx),%r10
2512	mov    %r10,0x80(%rcx)
2513L(bkP0QG):
2514	mov    0x78(%rdx),%r9
2515	mov    %r9,0x78(%rcx)
2516L(bkP0QF):
2517	mov    0x70(%rdx),%r11
2518	mov    %r11,0x70(%rcx)
2519L(bkP0QE):
2520	mov    0x68(%rdx),%r10
2521	mov    %r10,0x68(%rcx)
2522L(bkP0QD):
2523	mov    0x60(%rdx),%r9
2524	mov    %r9,0x60(%rcx)
2525L(bkP0QC):
2526	mov    0x58(%rdx),%r11
2527	mov    %r11,0x58(%rcx)
2528L(bkP0QB):
2529	mov    0x50(%rdx),%r10
2530	mov    %r10,0x50(%rcx)
2531L(bkP0QA):
2532	mov    0x48(%rdx),%r9
2533	mov    %r9,0x48(%rcx)
2534L(bkP0Q9):
2535	mov    0x40(%rdx),%r11
2536	mov    %r11,0x40(%rcx)
2537L(bkP0Q8):
2538	mov    0x38(%rdx),%r10
2539	mov    %r10,0x38(%rcx)
2540L(bkP0Q7):
2541	mov    0x30(%rdx),%r9
2542	mov    %r9,0x30(%rcx)
2543L(bkP0Q6):
2544	mov    0x28(%rdx),%r11
2545	mov    %r11,0x28(%rcx)
2546L(bkP0Q5):
2547	mov    0x20(%rdx),%r10
2548	mov    %r10,0x20(%rcx)
2549L(bkP0Q4):
2550	mov    0x18(%rdx),%r9
2551	mov    %r9,0x18(%rcx)
2552L(bkP0Q3):
2553	mov    0x10(%rdx),%r11
2554	mov    %r11,0x10(%rcx)
2555L(bkP0Q2):
2556	mov    0x8(%rdx),%r10
2557	mov    %r10,0x8(%rcx)
2558L(bkP0Q1):
2559	mov    (%rdx),%r9
2560	mov    %r9,(%rcx)
2561L(bkP0Q0):
2562	ret
2563
2564	.balign 16
2565L(bkP1QI):
	/*
	 * Backward-copy tail ladder for lengths of the form 8*k + 1.
	 * %rdx/%rcx point at the start of the source and destination.
	 * Entering at L(bkP1Q<k>) falls through k quadword copies at
	 * offsets 8*j + 1 (highest first), then copies the single leading
	 * byte at offset 0 and returns.
	 */
2566	mov    0x89(%rdx),%r10
2567	mov    %r10,0x89(%rcx)
2568L(bkP1QH):
2569	mov    0x81(%rdx),%r11
2570	mov    %r11,0x81(%rcx)
2571L(bkP1QG):
2572	mov    0x79(%rdx),%r10
2573	mov    %r10,0x79(%rcx)
2574L(bkP1QF):
2575	mov    0x71(%rdx),%r9
2576	mov    %r9,0x71(%rcx)
2577L(bkP1QE):
2578	mov    0x69(%rdx),%r11
2579	mov    %r11,0x69(%rcx)
2580L(bkP1QD):
2581	mov    0x61(%rdx),%r10
2582	mov    %r10,0x61(%rcx)
2583L(bkP1QC):
2584	mov    0x59(%rdx),%r9
2585	mov    %r9,0x59(%rcx)
2586L(bkP1QB):
2587	mov    0x51(%rdx),%r11
2588	mov    %r11,0x51(%rcx)
2589L(bkP1QA):
2590	mov    0x49(%rdx),%r10
2591	mov    %r10,0x49(%rcx)
2592L(bkP1Q9):
2593	mov    0x41(%rdx),%r9
2594	mov    %r9,0x41(%rcx)
2595L(bkP1Q8):
2596	mov    0x39(%rdx),%r11
2597	mov    %r11,0x39(%rcx)
2598L(bkP1Q7):
2599	mov    0x31(%rdx),%r10
2600	mov    %r10,0x31(%rcx)
2601L(bkP1Q6):
2602	mov    0x29(%rdx),%r9
2603	mov    %r9,0x29(%rcx)
2604L(bkP1Q5):
2605	mov    0x21(%rdx),%r11
2606	mov    %r11,0x21(%rcx)
2607L(bkP1Q4):
2608	mov    0x19(%rdx),%r10
2609	mov    %r10,0x19(%rcx)
2610L(bkP1Q3):
2611	mov    0x11(%rdx),%r9
2612	mov    %r9,0x11(%rcx)
2613L(bkP1Q2):
2614	mov    0x9(%rdx),%r11
2615	mov    %r11,0x9(%rcx)
2616L(bkP1Q1):
2617	mov    0x1(%rdx),%r10
2618	mov    %r10,0x1(%rcx)
2619L(bkP1Q0):
2620	mov    (%rdx),%r9b		# leading byte, copied last
2621	mov    %r9b,(%rcx)
2622	ret
2623
/*
 * bkP2Q<q> ladder: entering at bkP2Q<q> copies 8*q + 2 bytes from (%rdx)
 * to (%rcx) and returns.  Q is a single digit 0-9, A-I (A = 10 ... I = 18).
 * Each rung loads one quadword into %r9, %r10 or %r11, stores it at the
 * same offset from %rcx, and falls through to the next lower rung, so the
 * bytes are moved from the highest offset down to offset 0.  Every value
 * is loaded before it is stored.
 * NOTE(review): %rdx and %rcx are set up outside this excerpt; presumably
 * they address the lowest remaining source and destination byte -- confirm.
 */
2624	.balign 16
2625L(bkP2QI):	# 18 quadwords + 2 bytes (146 bytes)
2626	mov    0x8a(%rdx),%r10
2627	mov    %r10,0x8a(%rcx)
2628L(bkP2QH):
2629	mov    0x82(%rdx),%r11
2630	mov    %r11,0x82(%rcx)
2631L(bkP2QG):
2632	mov    0x7a(%rdx),%r10
2633	mov    %r10,0x7a(%rcx)
2634L(bkP2QF):
2635	mov    0x72(%rdx),%r9
2636	mov    %r9,0x72(%rcx)
2637L(bkP2QE):
2638	mov    0x6a(%rdx),%r11
2639	mov    %r11,0x6a(%rcx)
2640L(bkP2QD):
2641	mov    0x62(%rdx),%r10
2642	mov    %r10,0x62(%rcx)
2643L(bkP2QC):
2644	mov    0x5a(%rdx),%r9
2645	mov    %r9,0x5a(%rcx)
2646L(bkP2QB):
2647	mov    0x52(%rdx),%r11
2648	mov    %r11,0x52(%rcx)
2649L(bkP2QA):
2650	mov    0x4a(%rdx),%r10
2651	mov    %r10,0x4a(%rcx)
2652L(bkP2Q9):
2653	mov    0x42(%rdx),%r9
2654	mov    %r9,0x42(%rcx)
2655L(bkP2Q8):
2656	mov    0x3a(%rdx),%r11
2657	mov    %r11,0x3a(%rcx)
2658L(bkP2Q7):
2659	mov    0x32(%rdx),%r10
2660	mov    %r10,0x32(%rcx)
2661L(bkP2Q6):
2662	mov    0x2a(%rdx),%r9
2663	mov    %r9,0x2a(%rcx)
2664L(bkP2Q5):
2665	mov    0x22(%rdx),%r11
2666	mov    %r11,0x22(%rcx)
2667L(bkP2Q4):
2668	mov    0x1a(%rdx),%r10
2669	mov    %r10,0x1a(%rcx)
2670L(bkP2Q3):
2671	mov    0x12(%rdx),%r9
2672	mov    %r9,0x12(%rcx)
2673L(bkP2Q2):
2674	mov    0xa(%rdx),%r11
2675	mov    %r11,0xa(%rcx)
2676L(bkP2Q1):
2677	mov    0x2(%rdx),%r10
2678	mov    %r10,0x2(%rcx)
2679L(bkP2Q0):	# remaining two bytes at offset 0, moved as one 16-bit word
2680	mov    (%rdx),%r9w
2681	mov    %r9w,(%rcx)
2682	ret
2683
/*
 * bkP3Q<q> ladder: entering at bkP3Q<q> copies 8*q + 3 bytes from (%rdx)
 * to (%rcx).  Q is a single digit 0-9, A-I (A = 10 ... I = 18).
 * Each rung loads one quadword into %r9, %r10 or %r11, stores it at the
 * same offset from %rcx, and falls through to the next lower rung, so the
 * bytes are moved from the highest offset down.  The last rung (bkP3Q1)
 * falls through into bkP3Q0, which moves the final three bytes and returns.
 * NOTE(review): %rdx and %rcx are set up outside this excerpt; presumably
 * they address the lowest remaining source and destination byte -- confirm.
 */
2684	.balign 16
2685L(bkP3QI):	# 18 quadwords + 3 bytes (147 bytes)
2686	mov    0x8b(%rdx),%r10
2687	mov    %r10,0x8b(%rcx)
2688L(bkP3QH):
2689	mov    0x83(%rdx),%r11
2690	mov    %r11,0x83(%rcx)
2691L(bkP3QG):
2692	mov    0x7b(%rdx),%r10
2693	mov    %r10,0x7b(%rcx)
2694L(bkP3QF):
2695	mov    0x73(%rdx),%r9
2696	mov    %r9,0x73(%rcx)
2697L(bkP3QE):
2698	mov    0x6b(%rdx),%r11
2699	mov    %r11,0x6b(%rcx)
2700L(bkP3QD):
2701	mov    0x63(%rdx),%r10
2702	mov    %r10,0x63(%rcx)
2703L(bkP3QC):
2704	mov    0x5b(%rdx),%r9
2705	mov    %r9,0x5b(%rcx)
2706L(bkP3QB):
2707	mov    0x53(%rdx),%r11
2708	mov    %r11,0x53(%rcx)
2709L(bkP3QA):
2710	mov    0x4b(%rdx),%r10
2711	mov    %r10,0x4b(%rcx)
2712L(bkP3Q9):
2713	mov    0x43(%rdx),%r9
2714	mov    %r9,0x43(%rcx)
2715L(bkP3Q8):
2716	mov    0x3b(%rdx),%r11
2717	mov    %r11,0x3b(%rcx)
2718L(bkP3Q7):
2719	mov    0x33(%rdx),%r10
2720	mov    %r10,0x33(%rcx)
2721L(bkP3Q6):
2722	mov    0x2b(%rdx),%r9
2723	mov    %r9,0x2b(%rcx)
2724L(bkP3Q5):
2725	mov    0x23(%rdx),%r11
2726	mov    %r11,0x23(%rcx)
2727L(bkP3Q4):
2728	mov    0x1b(%rdx),%r10
2729	mov    %r10,0x1b(%rcx)
2730L(bkP3Q3):
2731	mov    0x13(%rdx),%r9
2732	mov    %r9,0x13(%rcx)
2733L(bkP3Q2):
2734	mov    0xb(%rdx),%r11
2735	mov    %r11,0xb(%rcx)
2736L(bkP3Q1):
2737	mov    0x3(%rdx),%r10
2738	mov    %r10,0x3(%rcx)
/*
 * bkP3Q0: move the final 3 bytes from (%rdx) to (%rcx) and return.
 * Both loads (a 16-bit word at offset 1 into %r9w, a byte at offset 0
 * into %r10b) are issued before either store, as the label comment
 * promises.  With every source byte already held in a register, the
 * result cannot depend on whether the stores overlap the source bytes.
 */
2739L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
2740	mov    0x1(%rdx),%r9w		# load bytes 1-2
2741	mov    (%rdx),%r10b		# load byte 0
2742	mov    %r9w,0x1(%rcx)		# store bytes 1-2
2743	mov    %r10b,(%rcx)		# store byte 0
2744	ret
2745
/*
 * bkP4Q<q> ladder: entering at bkP4Q<q> copies 8*q + 4 bytes from (%rdx)
 * to (%rcx) and returns.  Q is a single digit 0-9, A-I (A = 10 ... I = 18).
 * Each rung loads one quadword into %r9, %r10 or %r11, stores it at the
 * same offset from %rcx, and falls through to the next lower rung, so the
 * bytes are moved from the highest offset down to offset 0.  Every value
 * is loaded before it is stored.
 * NOTE(review): %rdx and %rcx are set up outside this excerpt; presumably
 * they address the lowest remaining source and destination byte -- confirm.
 */
2746	.balign 16
2747L(bkP4QI):	# 18 quadwords + 4 bytes (148 bytes)
2748	mov    0x8c(%rdx),%r10
2749	mov    %r10,0x8c(%rcx)
2750L(bkP4QH):
2751	mov    0x84(%rdx),%r11
2752	mov    %r11,0x84(%rcx)
2753L(bkP4QG):
2754	mov    0x7c(%rdx),%r10
2755	mov    %r10,0x7c(%rcx)
2756L(bkP4QF):
2757	mov    0x74(%rdx),%r9
2758	mov    %r9,0x74(%rcx)
2759L(bkP4QE):
2760	mov    0x6c(%rdx),%r11
2761	mov    %r11,0x6c(%rcx)
2762L(bkP4QD):
2763	mov    0x64(%rdx),%r10
2764	mov    %r10,0x64(%rcx)
2765L(bkP4QC):
2766	mov    0x5c(%rdx),%r9
2767	mov    %r9,0x5c(%rcx)
2768L(bkP4QB):
2769	mov    0x54(%rdx),%r11
2770	mov    %r11,0x54(%rcx)
2771L(bkP4QA):
2772	mov    0x4c(%rdx),%r10
2773	mov    %r10,0x4c(%rcx)
2774L(bkP4Q9):
2775	mov    0x44(%rdx),%r9
2776	mov    %r9,0x44(%rcx)
2777L(bkP4Q8):
2778	mov    0x3c(%rdx),%r11
2779	mov    %r11,0x3c(%rcx)
2780L(bkP4Q7):
2781	mov    0x34(%rdx),%r10
2782	mov    %r10,0x34(%rcx)
2783L(bkP4Q6):
2784	mov    0x2c(%rdx),%r9
2785	mov    %r9,0x2c(%rcx)
2786L(bkP4Q5):
2787	mov    0x24(%rdx),%r11
2788	mov    %r11,0x24(%rcx)
2789L(bkP4Q4):
2790	mov    0x1c(%rdx),%r10
2791	mov    %r10,0x1c(%rcx)
2792L(bkP4Q3):
2793	mov    0x14(%rdx),%r9
2794	mov    %r9,0x14(%rcx)
2795L(bkP4Q2):
2796	mov    0xc(%rdx),%r11
2797	mov    %r11,0xc(%rcx)
2798L(bkP4Q1):
2799	mov    0x4(%rdx),%r10
2800	mov    %r10,0x4(%rcx)
2801L(bkP4Q0):	# remaining four bytes at offset 0, moved as one 32-bit value
2802	mov    (%rdx),%r9d
2803	mov    %r9d,(%rcx)
2804	ret
2805
/*
 * bkP5Q<q> ladder: entering at bkP5Q<q> copies 8*q + 5 bytes from (%rdx)
 * to (%rcx).  Q is a single digit 0-9, A-I (A = 10 ... I = 18).
 * Each rung loads one quadword into %r9, %r10 or %r11, stores it at the
 * same offset from %rcx, and falls through to the next lower rung, so the
 * bytes are moved from the highest offset down.  The last rung (bkP5Q1)
 * falls through into bkP5Q0, which moves the final five bytes and returns.
 * NOTE(review): %rdx and %rcx are set up outside this excerpt; presumably
 * they address the lowest remaining source and destination byte -- confirm.
 */
2806	.balign 16
2807L(bkP5QI):	# 18 quadwords + 5 bytes (149 bytes)
2808	mov    0x8d(%rdx),%r10
2809	mov    %r10,0x8d(%rcx)
2810L(bkP5QH):
2811	mov    0x85(%rdx),%r9
2812	mov    %r9,0x85(%rcx)
2813L(bkP5QG):
2814	mov    0x7d(%rdx),%r11
2815	mov    %r11,0x7d(%rcx)
2816L(bkP5QF):
2817	mov    0x75(%rdx),%r10
2818	mov    %r10,0x75(%rcx)
2819L(bkP5QE):
2820	mov    0x6d(%rdx),%r9
2821	mov    %r9,0x6d(%rcx)
2822L(bkP5QD):
2823	mov    0x65(%rdx),%r11
2824	mov    %r11,0x65(%rcx)
2825L(bkP5QC):
2826	mov    0x5d(%rdx),%r10
2827	mov    %r10,0x5d(%rcx)
2828L(bkP5QB):
2829	mov    0x55(%rdx),%r9
2830	mov    %r9,0x55(%rcx)
2831L(bkP5QA):
2832	mov    0x4d(%rdx),%r11
2833	mov    %r11,0x4d(%rcx)
2834L(bkP5Q9):
2835	mov    0x45(%rdx),%r10
2836	mov    %r10,0x45(%rcx)
2837L(bkP5Q8):
2838	mov    0x3d(%rdx),%r9
2839	mov    %r9,0x3d(%rcx)
2840L(bkP5Q7):
2841	mov    0x35(%rdx),%r11
2842	mov    %r11,0x35(%rcx)
2843L(bkP5Q6):
2844	mov    0x2d(%rdx),%r10
2845	mov    %r10,0x2d(%rcx)
2846L(bkP5Q5):
2847	mov    0x25(%rdx),%r9
2848	mov    %r9,0x25(%rcx)
2849L(bkP5Q4):
2850	mov    0x1d(%rdx),%r11
2851	mov    %r11,0x1d(%rcx)
2852L(bkP5Q3):
2853	mov    0x15(%rdx),%r10
2854	mov    %r10,0x15(%rcx)
2855L(bkP5Q2):
2856	mov    0xd(%rdx),%r9
2857	mov    %r9,0xd(%rcx)
2858L(bkP5Q1):
2859	mov    0x5(%rdx),%r11
2860	mov    %r11,0x5(%rcx)
/*
 * bkP5Q0: move the final 5 bytes from (%rdx) to (%rcx) and return.
 * Both loads (a 32-bit value at offset 1 into %r9d, a byte at offset 0
 * into %r10b) are issued before either store, as the label comment
 * promises.  With every source byte already held in a register, the
 * result cannot depend on whether the stores overlap the source bytes.
 */
2861L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
2862	mov    0x1(%rdx),%r9d		# load bytes 1-4
2863	mov    (%rdx),%r10b		# load byte 0
2864	mov    %r9d,0x1(%rcx)		# store bytes 1-4
2865	mov    %r10b,(%rcx)		# store byte 0
2866	ret
2867
/*
 * bkP6Q<q> ladder: entering at bkP6Q<q> copies 8*q + 6 bytes from (%rdx)
 * to (%rcx).  Q is a single digit 0-9, A-I (A = 10 ... I = 18).
 * Each rung loads one quadword into %r9, %r10 or %r11, stores it at the
 * same offset from %rcx, and falls through to the next lower rung, so the
 * bytes are moved from the highest offset down.  The last rung (bkP6Q1)
 * falls through into bkP6Q0, which moves the final six bytes and returns.
 * NOTE(review): %rdx and %rcx are set up outside this excerpt; presumably
 * they address the lowest remaining source and destination byte -- confirm.
 */
2868	.balign 16
2869L(bkP6QI):	# 18 quadwords + 6 bytes (150 bytes)
2870	mov    0x8e(%rdx),%r10
2871	mov    %r10,0x8e(%rcx)
2872L(bkP6QH):
2873	mov    0x86(%rdx),%r11
2874	mov    %r11,0x86(%rcx)
2875L(bkP6QG):
2876	mov    0x7e(%rdx),%r10
2877	mov    %r10,0x7e(%rcx)
2878L(bkP6QF):
2879	mov    0x76(%rdx),%r9
2880	mov    %r9,0x76(%rcx)
2881L(bkP6QE):
2882	mov    0x6e(%rdx),%r11
2883	mov    %r11,0x6e(%rcx)
2884L(bkP6QD):
2885	mov    0x66(%rdx),%r10
2886	mov    %r10,0x66(%rcx)
2887L(bkP6QC):
2888	mov    0x5e(%rdx),%r9
2889	mov    %r9,0x5e(%rcx)
2890L(bkP6QB):
2891	mov    0x56(%rdx),%r11
2892	mov    %r11,0x56(%rcx)
2893L(bkP6QA):
2894	mov    0x4e(%rdx),%r10
2895	mov    %r10,0x4e(%rcx)
2896L(bkP6Q9):
2897	mov    0x46(%rdx),%r9
2898	mov    %r9,0x46(%rcx)
2899L(bkP6Q8):
2900	mov    0x3e(%rdx),%r11
2901	mov    %r11,0x3e(%rcx)
2902L(bkP6Q7):
2903	mov    0x36(%rdx),%r10
2904	mov    %r10,0x36(%rcx)
2905L(bkP6Q6):
2906	mov    0x2e(%rdx),%r9
2907	mov    %r9,0x2e(%rcx)
2908L(bkP6Q5):
2909	mov    0x26(%rdx),%r11
2910	mov    %r11,0x26(%rcx)
2911L(bkP6Q4):
2912	mov    0x1e(%rdx),%r10
2913	mov    %r10,0x1e(%rcx)
2914L(bkP6Q3):
2915	mov    0x16(%rdx),%r9
2916	mov    %r9,0x16(%rcx)
2917L(bkP6Q2):
2918	mov    0xe(%rdx),%r11
2919	mov    %r11,0xe(%rcx)
2920L(bkP6Q1):
2921	mov    0x6(%rdx),%r10
2922	mov    %r10,0x6(%rcx)
/*
 * bkP6Q0: move the final 6 bytes from (%rdx) to (%rcx) and return.
 * Both loads (a 32-bit value at offset 2 into %r9d, a 16-bit word at
 * offset 0 into %r10w) are issued before either store, as the label
 * comment promises.  With every source byte already held in a register,
 * the result cannot depend on whether the stores overlap the source bytes.
 */
2923L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
2924	mov    0x2(%rdx),%r9d		# load bytes 2-5
2925	mov    (%rdx),%r10w		# load bytes 0-1
2926	mov    %r9d,0x2(%rcx)		# store bytes 2-5
2927	mov    %r10w,(%rcx)		# store bytes 0-1
2928	ret
2929
/*
 * bkP7Q<q> ladder: entering at bkP7Q<q> copies 8*q + 7 bytes from (%rdx)
 * to (%rcx).  Q is a single digit 0-9, A-I (A = 10 ... I = 18).
 * Each rung loads one quadword into %r9, %r10 or %r11, stores it at the
 * same offset from %rcx, and falls through to the next lower rung, so the
 * bytes are moved from the highest offset down.  The last rung (bkP7Q1)
 * falls through into bkP7Q0, which moves the final seven bytes and returns.
 * NOTE(review): %rdx and %rcx are set up outside this excerpt; presumably
 * they address the lowest remaining source and destination byte -- confirm.
 */
2930	.balign 16
2931L(bkP7QI):	# 18 quadwords + 7 bytes (151 bytes)
2932	mov    0x8f(%rdx),%r10
2933	mov    %r10,0x8f(%rcx)
2934L(bkP7QH):
2935	mov    0x87(%rdx),%r11
2936	mov    %r11,0x87(%rcx)
2937L(bkP7QG):
2938	mov    0x7f(%rdx),%r10
2939	mov    %r10,0x7f(%rcx)
2940L(bkP7QF):
2941	mov    0x77(%rdx),%r9
2942	mov    %r9,0x77(%rcx)
2943L(bkP7QE):
2944	mov    0x6f(%rdx),%r11
2945	mov    %r11,0x6f(%rcx)
2946L(bkP7QD):
2947	mov    0x67(%rdx),%r10
2948	mov    %r10,0x67(%rcx)
2949L(bkP7QC):
2950	mov    0x5f(%rdx),%r9
2951	mov    %r9,0x5f(%rcx)
2952L(bkP7QB):
2953	mov    0x57(%rdx),%r11
2954	mov    %r11,0x57(%rcx)
2955L(bkP7QA):
2956	mov    0x4f(%rdx),%r10
2957	mov    %r10,0x4f(%rcx)
2958L(bkP7Q9):
2959	mov    0x47(%rdx),%r9
2960	mov    %r9,0x47(%rcx)
2961L(bkP7Q8):
2962	mov    0x3f(%rdx),%r11
2963	mov    %r11,0x3f(%rcx)
2964L(bkP7Q7):
2965	mov    0x37(%rdx),%r10
2966	mov    %r10,0x37(%rcx)
2967L(bkP7Q6):
2968	mov    0x2f(%rdx),%r9
2969	mov    %r9,0x2f(%rcx)
2970L(bkP7Q5):
2971	mov    0x27(%rdx),%r11
2972	mov    %r11,0x27(%rcx)
2973L(bkP7Q4):
2974	mov    0x1f(%rdx),%r10
2975	mov    %r10,0x1f(%rcx)
2976L(bkP7Q3):
2977	mov    0x17(%rdx),%r9
2978	mov    %r9,0x17(%rcx)
2979L(bkP7Q2):
2980	mov    0xf(%rdx),%r11
2981	mov    %r11,0xf(%rcx)
2982L(bkP7Q1):
2983	mov    0x7(%rdx),%r10
2984	mov    %r10,0x7(%rcx)
/*
 * bkP7Q0: move the final 7 bytes from (%rdx) to (%rcx) and return.
 * All three loads (a 32-bit value at offset 3 into %r9d, a 16-bit word
 * at offset 1 into %r10w, a byte at offset 0 into %r11b) are issued
 * before any store, as the label comment promises.  With every source
 * byte already held in a register, the result cannot depend on whether
 * the stores overlap the source bytes.
 */
2985L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
2986	mov    0x3(%rdx),%r9d		# load bytes 3-6
2987	mov    0x1(%rdx),%r10w		# load bytes 1-2
2988	mov    (%rdx),%r11b		# load byte 0
2989	mov    %r9d,0x3(%rcx)		# store bytes 3-6
2990	mov    %r10w,0x1(%rcx)		# store bytes 1-2
2991	mov    %r11b,(%rcx)		# store byte 0
2992	ret
2993
/*
 * bkPxQx: table of 32-bit values, each the distance from the start of
 * this table to one bkP<p>Q<q> entry point.  Entries are laid out with P
 * varying fastest (eight per row, P0..P7) and one row per Q digit
 * (0-9, then A-I for 10-18), so entry number 8*q + p refers to
 * bkP<p>Q<q>.  That entry number equals the number of bytes the target
 * copies (q quadwords plus p trailing bytes); the 152 entries therefore
 * cover byte counts 0 through 151.  Storing differences between two
 * labels rather than absolute addresses keeps the table contents
 * independent of where the code is loaded.
 * NOTE(review): the code that indexes this table is outside this excerpt;
 * presumably it adds the selected value to the table address and jumps
 * there -- confirm.
 */
2994		.balign 16
2995L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)	# Q0: 0-7 bytes
2996		.int L(bkP1Q0)-L(bkPxQx)
2997		.int L(bkP2Q0)-L(bkPxQx)
2998		.int L(bkP3Q0)-L(bkPxQx)
2999		.int L(bkP4Q0)-L(bkPxQx)
3000		.int L(bkP5Q0)-L(bkPxQx)
3001		.int L(bkP6Q0)-L(bkPxQx)
3002		.int L(bkP7Q0)-L(bkPxQx)
3003
3004		.int L(bkP0Q1)-L(bkPxQx)	# Q1: 8-15 bytes
3005		.int L(bkP1Q1)-L(bkPxQx)
3006		.int L(bkP2Q1)-L(bkPxQx)
3007		.int L(bkP3Q1)-L(bkPxQx)
3008		.int L(bkP4Q1)-L(bkPxQx)
3009		.int L(bkP5Q1)-L(bkPxQx)
3010		.int L(bkP6Q1)-L(bkPxQx)
3011		.int L(bkP7Q1)-L(bkPxQx)
3012
3013		.int L(bkP0Q2)-L(bkPxQx)	# Q2: 16-23 bytes
3014		.int L(bkP1Q2)-L(bkPxQx)
3015		.int L(bkP2Q2)-L(bkPxQx)
3016		.int L(bkP3Q2)-L(bkPxQx)
3017		.int L(bkP4Q2)-L(bkPxQx)
3018		.int L(bkP5Q2)-L(bkPxQx)
3019		.int L(bkP6Q2)-L(bkPxQx)
3020		.int L(bkP7Q2)-L(bkPxQx)
3021
3022		.int L(bkP0Q3)-L(bkPxQx)	# Q3: 24-31 bytes
3023		.int L(bkP1Q3)-L(bkPxQx)
3024		.int L(bkP2Q3)-L(bkPxQx)
3025		.int L(bkP3Q3)-L(bkPxQx)
3026		.int L(bkP4Q3)-L(bkPxQx)
3027		.int L(bkP5Q3)-L(bkPxQx)
3028		.int L(bkP6Q3)-L(bkPxQx)
3029		.int L(bkP7Q3)-L(bkPxQx)
3030
3031		.int L(bkP0Q4)-L(bkPxQx)	# Q4: 32-39 bytes
3032		.int L(bkP1Q4)-L(bkPxQx)
3033		.int L(bkP2Q4)-L(bkPxQx)
3034		.int L(bkP3Q4)-L(bkPxQx)
3035		.int L(bkP4Q4)-L(bkPxQx)
3036		.int L(bkP5Q4)-L(bkPxQx)
3037		.int L(bkP6Q4)-L(bkPxQx)
3038		.int L(bkP7Q4)-L(bkPxQx)
3039
3040		.int L(bkP0Q5)-L(bkPxQx)	# Q5: 40-47 bytes
3041		.int L(bkP1Q5)-L(bkPxQx)
3042		.int L(bkP2Q5)-L(bkPxQx)
3043		.int L(bkP3Q5)-L(bkPxQx)
3044		.int L(bkP4Q5)-L(bkPxQx)
3045		.int L(bkP5Q5)-L(bkPxQx)
3046		.int L(bkP6Q5)-L(bkPxQx)
3047		.int L(bkP7Q5)-L(bkPxQx)
3048
3049		.int L(bkP0Q6)-L(bkPxQx)	# Q6: 48-55 bytes
3050		.int L(bkP1Q6)-L(bkPxQx)
3051		.int L(bkP2Q6)-L(bkPxQx)
3052		.int L(bkP3Q6)-L(bkPxQx)
3053		.int L(bkP4Q6)-L(bkPxQx)
3054		.int L(bkP5Q6)-L(bkPxQx)
3055		.int L(bkP6Q6)-L(bkPxQx)
3056		.int L(bkP7Q6)-L(bkPxQx)
3057
3058		.int L(bkP0Q7)-L(bkPxQx)	# Q7: 56-63 bytes
3059		.int L(bkP1Q7)-L(bkPxQx)
3060		.int L(bkP2Q7)-L(bkPxQx)
3061		.int L(bkP3Q7)-L(bkPxQx)
3062		.int L(bkP4Q7)-L(bkPxQx)
3063		.int L(bkP5Q7)-L(bkPxQx)
3064		.int L(bkP6Q7)-L(bkPxQx)
3065		.int L(bkP7Q7)-L(bkPxQx)
3066
3067		.int L(bkP0Q8)-L(bkPxQx)	# Q8: 64-71 bytes
3068		.int L(bkP1Q8)-L(bkPxQx)
3069		.int L(bkP2Q8)-L(bkPxQx)
3070		.int L(bkP3Q8)-L(bkPxQx)
3071		.int L(bkP4Q8)-L(bkPxQx)
3072		.int L(bkP5Q8)-L(bkPxQx)
3073		.int L(bkP6Q8)-L(bkPxQx)
3074		.int L(bkP7Q8)-L(bkPxQx)
3075
3076		.int L(bkP0Q9)-L(bkPxQx)	# Q9: 72-79 bytes
3077		.int L(bkP1Q9)-L(bkPxQx)
3078		.int L(bkP2Q9)-L(bkPxQx)
3079		.int L(bkP3Q9)-L(bkPxQx)
3080		.int L(bkP4Q9)-L(bkPxQx)
3081		.int L(bkP5Q9)-L(bkPxQx)
3082		.int L(bkP6Q9)-L(bkPxQx)
3083		.int L(bkP7Q9)-L(bkPxQx)
3084
3085		.int L(bkP0QA)-L(bkPxQx)	# QA: 80-87 bytes
3086		.int L(bkP1QA)-L(bkPxQx)
3087		.int L(bkP2QA)-L(bkPxQx)
3088		.int L(bkP3QA)-L(bkPxQx)
3089		.int L(bkP4QA)-L(bkPxQx)
3090		.int L(bkP5QA)-L(bkPxQx)
3091		.int L(bkP6QA)-L(bkPxQx)
3092		.int L(bkP7QA)-L(bkPxQx)
3093
3094		.int L(bkP0QB)-L(bkPxQx)	# QB: 88-95 bytes
3095		.int L(bkP1QB)-L(bkPxQx)
3096		.int L(bkP2QB)-L(bkPxQx)
3097		.int L(bkP3QB)-L(bkPxQx)
3098		.int L(bkP4QB)-L(bkPxQx)
3099		.int L(bkP5QB)-L(bkPxQx)
3100		.int L(bkP6QB)-L(bkPxQx)
3101		.int L(bkP7QB)-L(bkPxQx)
3102
3103		.int L(bkP0QC)-L(bkPxQx)	# QC: 96-103 bytes
3104		.int L(bkP1QC)-L(bkPxQx)
3105		.int L(bkP2QC)-L(bkPxQx)
3106		.int L(bkP3QC)-L(bkPxQx)
3107		.int L(bkP4QC)-L(bkPxQx)
3108		.int L(bkP5QC)-L(bkPxQx)
3109		.int L(bkP6QC)-L(bkPxQx)
3110		.int L(bkP7QC)-L(bkPxQx)
3111
3112		.int L(bkP0QD)-L(bkPxQx)	# QD: 104-111 bytes
3113		.int L(bkP1QD)-L(bkPxQx)
3114		.int L(bkP2QD)-L(bkPxQx)
3115		.int L(bkP3QD)-L(bkPxQx)
3116		.int L(bkP4QD)-L(bkPxQx)
3117		.int L(bkP5QD)-L(bkPxQx)
3118		.int L(bkP6QD)-L(bkPxQx)
3119		.int L(bkP7QD)-L(bkPxQx)
3120
3121		.int L(bkP0QE)-L(bkPxQx)	# QE: 112-119 bytes
3122		.int L(bkP1QE)-L(bkPxQx)
3123		.int L(bkP2QE)-L(bkPxQx)
3124		.int L(bkP3QE)-L(bkPxQx)
3125		.int L(bkP4QE)-L(bkPxQx)
3126		.int L(bkP5QE)-L(bkPxQx)
3127		.int L(bkP6QE)-L(bkPxQx)
3128		.int L(bkP7QE)-L(bkPxQx)
3129
3130		.int L(bkP0QF)-L(bkPxQx)	# QF: 120-127 bytes
3131		.int L(bkP1QF)-L(bkPxQx)
3132		.int L(bkP2QF)-L(bkPxQx)
3133		.int L(bkP3QF)-L(bkPxQx)
3134		.int L(bkP4QF)-L(bkPxQx)
3135		.int L(bkP5QF)-L(bkPxQx)
3136		.int L(bkP6QF)-L(bkPxQx)
3137		.int L(bkP7QF)-L(bkPxQx)
3138
3139		.int L(bkP0QG)-L(bkPxQx)	# QG: 128-135 bytes
3140		.int L(bkP1QG)-L(bkPxQx)
3141		.int L(bkP2QG)-L(bkPxQx)
3142		.int L(bkP3QG)-L(bkPxQx)
3143		.int L(bkP4QG)-L(bkPxQx)
3144		.int L(bkP5QG)-L(bkPxQx)
3145		.int L(bkP6QG)-L(bkPxQx)
3146		.int L(bkP7QG)-L(bkPxQx)
3147
3148		.int L(bkP0QH)-L(bkPxQx)	# QH: 136-143 bytes
3149		.int L(bkP1QH)-L(bkPxQx)
3150		.int L(bkP2QH)-L(bkPxQx)
3151		.int L(bkP3QH)-L(bkPxQx)
3152		.int L(bkP4QH)-L(bkPxQx)
3153		.int L(bkP5QH)-L(bkPxQx)
3154		.int L(bkP6QH)-L(bkPxQx)
3155		.int L(bkP7QH)-L(bkPxQx)
3156
3157		.int L(bkP0QI)-L(bkPxQx)	# QI: 144-151 bytes
3158		.int L(bkP1QI)-L(bkPxQx)
3159		.int L(bkP2QI)-L(bkPxQx)
3160		.int L(bkP3QI)-L(bkPxQx)
3161		.int L(bkP4QI)-L(bkPxQx)
3162		.int L(bkP5QI)-L(bkPxQx)
3163		.int L(bkP6QI)-L(bkPxQx)
3164		.int L(bkP7QI)-L(bkPxQx)
3165
3166	SET_SIZE(memmove)
3167