1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2008, Intel Corporation
29 * All rights reserved.
30 */
31
32/*
33 * memcpy.s - copies a block of memory from source to destination
34 *	Implements memcpy() and memmove() libc primitives.
35 */
36
37#pragma ident	"%Z%%M%	%I%	%E% SMI"
38
39	.file	"%M%"
40
41#include <sys/asm_linkage.h>
42
43	ANSI_PRAGMA_WEAK(memmove,function)
44	ANSI_PRAGMA_WEAK(memcpy,function)
45
46#include "cache.h"
47#include "proc64_id.h"
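/*
 * cache.h and proc64_id.h supply the NO_SSE/USE_SSSE3 selectors and the
 * .memops_method, .amd64cache1half and .largest_level_cache_size variables
 * used below, presumably initialized at library startup by the
 * processor-identification code.
 */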
48
49#define L(s) .memcpy/**/s
50
51/*
52 * memcpy algorithm overview:
53 *
54 * Thresholds used below were determined experimentally.
55 *
56 * Pseudo code:
57 *
58 * If (size <= 128 bytes) {
59 *	do unrolled code (primarily 8-byte loads/stores) regardless of
60 *	alignment.
61 * } else {
62 *	Align destination to 16-byte boundary
63 *
64 *      if (NO_SSE) {
65 *		If (size > half of the largest level cache) {
66 *			Use 8-byte non-temporal stores (64-bytes/loop)
67 *		} else {
68 *			if (size > 4K && size <= half l1 cache size) {
69 *				Use rep movsq
70 *			} else {
71 *				Use 8-byte loads/stores (64 bytes per loop)
72 *			}
73 *		}
74 *
75 *	} else { **USE SSE**
76 *		If (size > half of the largest level cache) {
77 *			Use 16-byte non-temporal stores (128-bytes per loop)
78 *		} else {
79 *			If (both source and destination are aligned) {
80 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
81 *			} else {
82 *			    use pairs of xmm registers with SSE2 or SSSE3
83 *			    instructions to concatenate and shift appropriately
84 *			    to account for source unalignment. This enables
85 *			    16-byte aligned loads to be done.
86 *			}
87 *		}
88 *	}
89 *
90 *	Finish any remaining bytes via unrolled code above.
91 * }
92 *
93 * memmove overview:
94 *	memmove is the same as memcpy except for the one case where the copy
95 *	must be done backwards. The backwards-copy code is structured similarly.
96 */
97
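/*
 * memmove: a forward copy is safe when the destination starts at or below
 * the source, or at or above the end of the source (no destructive overlap).
 * Only when the destination lies strictly inside [src, src + len) do we
 * copy backwards.
 */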
98	ENTRY(memmove)
99	cmp	%rsi,%rdi		# if dst <= src
100	jbe	L(CopyForward)		# then do copy forward
101	mov	%rsi,%r9		# move src to r9
102	add	%rdx,%r9		# add len to get addr of end of src
103	cmp	%r9,%rdi		# if dst < end of src
104	jb	L(CopyBackwards)	# then do copy backwards
105	jmp	L(CopyForward)
106
107	ENTRY (memcpy)
108L(CopyForward):
109	mov    %rdx,%r8
110	mov    %rdi,%rcx
111	mov    %rsi,%rdx
112	mov    %rdi,%rax
113	lea    L(fwdPxQx)(%rip),%r11
114	cmp    $0x80,%r8		# 128
115	jg     L(ck_use_sse2)
116	add    %r8,%rcx
117	add    %r8,%rdx
118
119	movslq (%r11,%r8,4),%r10
120	lea    (%r10,%r11,1),%r11
121	jmpq   *%r11
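	/*
	 * Dispatch for copies of 128 bytes or less: L(fwdPxQx) below holds 129
	 * signed 32-bit offsets, relative to the table itself, one for each
	 * length 0..128.  %rcx and %rdx were just advanced past the end of the
	 * copy, so the L(PxQy) targets work with negative offsets; entry
	 * L(PxQy) copies y quadwords (8*y bytes) plus x (0..7) trailing bytes.
	 */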
122
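	/*
	 * L(ShrtAlignNew) (reached from L(ck_use_sse2)): the destination is
	 * not 16-byte aligned.  L(AliPxQx) is indexed by (dest & 0xf); each
	 * L(AxQy) entry copies 16 - (dest & 0xf) bytes, adjusting %rdx, %rcx
	 * and %r8, so that the destination becomes 16-byte aligned before
	 * jumping to L(now_qw_aligned).
	 */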
123	.balign 16
124L(ShrtAlignNew):
125	lea    L(AliPxQx)(%rip),%r11
126	mov    %rcx,%r9
127	and    $0xf,%r9
128
129	movslq (%r11,%r9,4),%r10
130	lea    (%r10,%r11,1),%r11
131	jmpq   *%r11
132
133	.balign 16
134L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
135           .int        L(P1Q0)-L(fwdPxQx)
136           .int        L(P2Q0)-L(fwdPxQx)
137           .int        L(P3Q0)-L(fwdPxQx)
138           .int        L(P4Q0)-L(fwdPxQx)
139           .int        L(P5Q0)-L(fwdPxQx)
140           .int        L(P6Q0)-L(fwdPxQx)
141           .int        L(P7Q0)-L(fwdPxQx)
142
143           .int        L(P0Q1)-L(fwdPxQx)
144           .int        L(P1Q1)-L(fwdPxQx)
145           .int        L(P2Q1)-L(fwdPxQx)
146           .int        L(P3Q1)-L(fwdPxQx)
147           .int        L(P4Q1)-L(fwdPxQx)
148           .int        L(P5Q1)-L(fwdPxQx)
149           .int        L(P6Q1)-L(fwdPxQx)
150           .int        L(P7Q1)-L(fwdPxQx)
151
152           .int        L(P0Q2)-L(fwdPxQx)
153           .int        L(P1Q2)-L(fwdPxQx)
154           .int        L(P2Q2)-L(fwdPxQx)
155           .int        L(P3Q2)-L(fwdPxQx)
156           .int        L(P4Q2)-L(fwdPxQx)
157           .int        L(P5Q2)-L(fwdPxQx)
158           .int        L(P6Q2)-L(fwdPxQx)
159           .int        L(P7Q2)-L(fwdPxQx)
160
161           .int        L(P0Q3)-L(fwdPxQx)
162           .int        L(P1Q3)-L(fwdPxQx)
163           .int        L(P2Q3)-L(fwdPxQx)
164           .int        L(P3Q3)-L(fwdPxQx)
165           .int        L(P4Q3)-L(fwdPxQx)
166           .int        L(P5Q3)-L(fwdPxQx)
167           .int        L(P6Q3)-L(fwdPxQx)
168           .int        L(P7Q3)-L(fwdPxQx)
169
170           .int        L(P0Q4)-L(fwdPxQx)
171           .int        L(P1Q4)-L(fwdPxQx)
172           .int        L(P2Q4)-L(fwdPxQx)
173           .int        L(P3Q4)-L(fwdPxQx)
174           .int        L(P4Q4)-L(fwdPxQx)
175           .int        L(P5Q4)-L(fwdPxQx)
176           .int        L(P6Q4)-L(fwdPxQx)
177           .int        L(P7Q4)-L(fwdPxQx)
178
179           .int        L(P0Q5)-L(fwdPxQx)
180           .int        L(P1Q5)-L(fwdPxQx)
181           .int        L(P2Q5)-L(fwdPxQx)
182           .int        L(P3Q5)-L(fwdPxQx)
183           .int        L(P4Q5)-L(fwdPxQx)
184           .int        L(P5Q5)-L(fwdPxQx)
185           .int        L(P6Q5)-L(fwdPxQx)
186           .int        L(P7Q5)-L(fwdPxQx)
187
188           .int        L(P0Q6)-L(fwdPxQx)
189           .int        L(P1Q6)-L(fwdPxQx)
190           .int        L(P2Q6)-L(fwdPxQx)
191           .int        L(P3Q6)-L(fwdPxQx)
192           .int        L(P4Q6)-L(fwdPxQx)
193           .int        L(P5Q6)-L(fwdPxQx)
194           .int        L(P6Q6)-L(fwdPxQx)
195           .int        L(P7Q6)-L(fwdPxQx)
196
197           .int        L(P0Q7)-L(fwdPxQx)
198           .int        L(P1Q7)-L(fwdPxQx)
199           .int        L(P2Q7)-L(fwdPxQx)
200           .int        L(P3Q7)-L(fwdPxQx)
201           .int        L(P4Q7)-L(fwdPxQx)
202           .int        L(P5Q7)-L(fwdPxQx)
203           .int        L(P6Q7)-L(fwdPxQx)
204           .int        L(P7Q7)-L(fwdPxQx)
205
206           .int        L(P0Q8)-L(fwdPxQx)
207           .int        L(P1Q8)-L(fwdPxQx)
208           .int        L(P2Q8)-L(fwdPxQx)
209           .int        L(P3Q8)-L(fwdPxQx)
210           .int        L(P4Q8)-L(fwdPxQx)
211           .int        L(P5Q8)-L(fwdPxQx)
212           .int        L(P6Q8)-L(fwdPxQx)
213           .int        L(P7Q8)-L(fwdPxQx)
214
215           .int        L(P0Q9)-L(fwdPxQx)
216           .int        L(P1Q9)-L(fwdPxQx)
217           .int        L(P2Q9)-L(fwdPxQx)
218           .int        L(P3Q9)-L(fwdPxQx)
219           .int        L(P4Q9)-L(fwdPxQx)
220           .int        L(P5Q9)-L(fwdPxQx)
221           .int        L(P6Q9)-L(fwdPxQx)
222           .int        L(P7Q9)-L(fwdPxQx)
223
224           .int        L(P0QA)-L(fwdPxQx)
225           .int        L(P1QA)-L(fwdPxQx)
226           .int        L(P2QA)-L(fwdPxQx)
227           .int        L(P3QA)-L(fwdPxQx)
228           .int        L(P4QA)-L(fwdPxQx)
229           .int        L(P5QA)-L(fwdPxQx)
230           .int        L(P6QA)-L(fwdPxQx)
231           .int        L(P7QA)-L(fwdPxQx)
232
233           .int        L(P0QB)-L(fwdPxQx)
234           .int        L(P1QB)-L(fwdPxQx)
235           .int        L(P2QB)-L(fwdPxQx)
236           .int        L(P3QB)-L(fwdPxQx)
237           .int        L(P4QB)-L(fwdPxQx)
238           .int        L(P5QB)-L(fwdPxQx)
239           .int        L(P6QB)-L(fwdPxQx)
240           .int        L(P7QB)-L(fwdPxQx)
241
242           .int        L(P0QC)-L(fwdPxQx)
243           .int        L(P1QC)-L(fwdPxQx)
244           .int        L(P2QC)-L(fwdPxQx)
245           .int        L(P3QC)-L(fwdPxQx)
246           .int        L(P4QC)-L(fwdPxQx)
247           .int        L(P5QC)-L(fwdPxQx)
248           .int        L(P6QC)-L(fwdPxQx)
249           .int        L(P7QC)-L(fwdPxQx)
250
251           .int        L(P0QD)-L(fwdPxQx)
252           .int        L(P1QD)-L(fwdPxQx)
253           .int        L(P2QD)-L(fwdPxQx)
254           .int        L(P3QD)-L(fwdPxQx)
255           .int        L(P4QD)-L(fwdPxQx)
256           .int        L(P5QD)-L(fwdPxQx)
257           .int        L(P6QD)-L(fwdPxQx)
258           .int        L(P7QD)-L(fwdPxQx)
259
260           .int        L(P0QE)-L(fwdPxQx)
261           .int        L(P1QE)-L(fwdPxQx)
262           .int        L(P2QE)-L(fwdPxQx)
263           .int        L(P3QE)-L(fwdPxQx)
264           .int        L(P4QE)-L(fwdPxQx)
265           .int        L(P5QE)-L(fwdPxQx)
266           .int        L(P6QE)-L(fwdPxQx)
267           .int        L(P7QE)-L(fwdPxQx)
268
269           .int        L(P0QF)-L(fwdPxQx)
270           .int        L(P1QF)-L(fwdPxQx)
271           .int        L(P2QF)-L(fwdPxQx)
272           .int        L(P3QF)-L(fwdPxQx)
273           .int        L(P4QF)-L(fwdPxQx)
274           .int        L(P5QF)-L(fwdPxQx)
275           .int        L(P6QF)-L(fwdPxQx)
276           .int        L(P7QF)-L(fwdPxQx)
277
278           .int        L(P0QG)-L(fwdPxQx)	# 0x80
279
280	   .balign 16
281L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
282           .int        L(A1Q0)-L(AliPxQx)
283           .int        L(A2Q0)-L(AliPxQx)
284           .int        L(A3Q0)-L(AliPxQx)
285           .int        L(A4Q0)-L(AliPxQx)
286           .int        L(A5Q0)-L(AliPxQx)
287           .int        L(A6Q0)-L(AliPxQx)
288           .int        L(A7Q0)-L(AliPxQx)
289           .int        L(A0Q1)-L(AliPxQx)
290           .int        L(A1Q1)-L(AliPxQx)
291           .int        L(A2Q1)-L(AliPxQx)
292           .int        L(A3Q1)-L(AliPxQx)
293           .int        L(A4Q1)-L(AliPxQx)
294           .int        L(A5Q1)-L(AliPxQx)
295           .int        L(A6Q1)-L(AliPxQx)
296           .int        L(A7Q1)-L(AliPxQx)
297
298	.balign 16
299L(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
300	movzbq (%rdx),%r11
301	sub    $0xf,%r8
302	mov    %r11b,(%rcx)
303
304	movzwq 0x1(%rdx),%r10
305	mov    %r10w,0x1(%rcx)
306
307	mov    0x3(%rdx),%r9d
308	mov    %r9d,0x3(%rcx)
309
310	mov    0x7(%rdx),%r11
311	add    $0xf,%rdx
312	mov    %r11,0x7(%rcx)
313
314	add    $0xf,%rcx
315	jmp    L(now_qw_aligned)
316
317	.balign 16
318L(A2Q0):			# ; need to move 8+ 6=2+4 bytes
319	movzwq (%rdx),%r10
320	sub    $0xe,%r8
321	mov    %r10w,(%rcx)
322
323	mov    0x2(%rdx),%r9d
324	mov    %r9d,0x2(%rcx)
325
326	mov    0x6(%rdx),%r11
327	add    $0xe,%rdx
328	mov    %r11,0x6(%rcx)
329	add    $0xe,%rcx
330	jmp    L(now_qw_aligned)
331
332	.balign 16
333L(A3Q0):			# ; need to move 8+ 5=1+4 bytes
334	movzbq (%rdx),%r11
335	sub    $0xd,%r8
336	mov    %r11b,(%rcx)
337
338	mov    0x1(%rdx),%r9d
339	mov    %r9d,0x1(%rcx)
340
341	mov    0x5(%rdx),%r10
342	add    $0xd,%rdx
343	mov    %r10,0x5(%rcx)
344
345	add    $0xd,%rcx
346	jmp    L(now_qw_aligned)
347
348	.balign 16
349L(A4Q0):			# ; need to move 8+4 bytes
350	mov    (%rdx),%r9d
351	sub    $0xc,%r8
352	mov    %r9d,(%rcx)
353
354	mov    0x4(%rdx),%r10
355	add    $0xc,%rdx
356	mov    %r10,0x4(%rcx)
357
358	add    $0xc,%rcx
359	jmp    L(now_qw_aligned)
360
361	.balign 16
362L(A5Q0):			# ; need to move 8+ 3=1+2 bytes
363	movzbq (%rdx),%r11
364	sub    $0xb,%r8
365	mov    %r11b,(%rcx)
366
367	movzwq 0x1(%rdx),%r10
368	mov    %r10w,0x1(%rcx)
369
370	mov    0x3(%rdx),%r9
371	add    $0xb,%rdx
372	mov    %r9,0x3(%rcx)
373
374	add    $0xb,%rcx
375	jmp    L(now_qw_aligned)
376
377	.balign 16
378L(A6Q0):			# ; need to move 8+2 bytes
379	movzwq (%rdx),%r10
380	sub    $0xa,%r8
381	mov    %r10w,(%rcx)
382
383	mov    0x2(%rdx),%r9
384	add    $0xa,%rdx
385	mov    %r9,0x2(%rcx)
386
387	add    $0xa,%rcx
388	jmp    L(now_qw_aligned)
389
390	.balign 16
391L(A7Q0):			# ; need to move 8+1 byte
392	movzbq (%rdx),%r11
393	sub    $0x9,%r8
394	mov    %r11b,(%rcx)
395
396	mov    0x1(%rdx),%r10
397	add    $0x9,%rdx
398	mov    %r10,0x1(%rcx)
399
400	add    $0x9,%rcx
401	jmp    L(now_qw_aligned)
402
403	.balign 16
404L(A0Q1):			# ; need to move 8 bytes
405
406	mov    (%rdx),%r10
407	add    $0x8,%rdx
408	sub    $0x8,%r8
409	mov    %r10,(%rcx)
410
411	add    $0x8,%rcx
412	jmp    L(now_qw_aligned)
413
414	.balign 16
415L(A1Q1):			# ; need to move 7=1+2+4 bytes
416	movzbq (%rdx),%r11
417	sub    $0x7,%r8
418	mov    %r11b,(%rcx)
419
420	movzwq 0x1(%rdx),%r10
421	mov    %r10w,0x1(%rcx)
422
423	mov    0x3(%rdx),%r9d
424	add    $0x7,%rdx
425	mov    %r9d,0x3(%rcx)
426	add    $0x7,%rcx
427	jmp    L(now_qw_aligned)
428
429	.balign 16
430L(A2Q1):			# ; need to move 6=2+4 bytes
431	movzwq (%rdx),%r10
432	sub    $0x6,%r8
433	mov    %r10w,(%rcx)
434	mov    0x2(%rdx),%r9d
435	add    $0x6,%rdx
436	mov    %r9d,0x2(%rcx)
437	add    $0x6,%rcx
438	jmp    L(now_qw_aligned)
439
440	.balign 16
441L(A3Q1):			# ; need to move 5=1+4 bytes
442	movzbq (%rdx),%r11
443	sub    $0x5,%r8
444	mov    %r11b,(%rcx)
445	mov    0x1(%rdx),%r9d
446	add    $0x5,%rdx
447	mov    %r9d,0x1(%rcx)
448	add    $0x5,%rcx
449	jmp    L(now_qw_aligned)
450
451	.balign 16
452L(A4Q1):			# ; need to move 4 bytes
453	mov    (%rdx),%r9d
454	sub    $0x4,%r8
455	add    $0x4,%rdx
456	mov    %r9d,(%rcx)
457	add    $0x4,%rcx
458	jmp    L(now_qw_aligned)
459
460	.balign 16
461L(A5Q1):			# ; need to move 3=1+2 bytes
462	movzbq (%rdx),%r11
463	sub    $0x3,%r8
464	mov    %r11b,(%rcx)
465
466	movzwq 0x1(%rdx),%r10
467	add    $0x3,%rdx
468	mov    %r10w,0x1(%rcx)
469
470	add    $0x3,%rcx
471	jmp    L(now_qw_aligned)
472
473	.balign 16
474L(A6Q1):			# ; need to move 2 bytes
475	movzwq (%rdx),%r10
476	sub    $0x2,%r8
477	add    $0x2,%rdx
478	mov    %r10w,(%rcx)
479	add    $0x2,%rcx
480	jmp    L(now_qw_aligned)
481
482	.balign 16
483L(A7Q1):			# ; need to move 1 byte
484	movzbq (%rdx),%r11
485	dec    %r8
486	inc    %rdx
487	mov    %r11b,(%rcx)
488	inc    %rcx
489	jmp    L(now_qw_aligned)
490
491
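	/*
	 * The L(PxQy) chains below are entered with %rdx and %rcx pointing one
	 * byte past the end of the copy.  Each label moves one quadword at a
	 * negative offset and falls through to the next, so entering at
	 * L(PxQy) copies 8*y + x bytes; the x (1..7) trailing bytes are
	 * handled by the final partial moves of each chain.
	 */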
492	.balign 16
493L(P0QG):
494	mov    -0x80(%rdx),%r9
495	mov    %r9,-0x80(%rcx)
496L(P0QF):
497	mov    -0x78(%rdx),%r10
498	mov    %r10,-0x78(%rcx)
499L(P0QE):
500	mov    -0x70(%rdx),%r9
501	mov    %r9,-0x70(%rcx)
502L(P0QD):
503	mov    -0x68(%rdx),%r10
504	mov    %r10,-0x68(%rcx)
505L(P0QC):
506	mov    -0x60(%rdx),%r9
507	mov    %r9,-0x60(%rcx)
508L(P0QB):
509	mov    -0x58(%rdx),%r10
510	mov    %r10,-0x58(%rcx)
511L(P0QA):
512	mov    -0x50(%rdx),%r9
513	mov    %r9,-0x50(%rcx)
514L(P0Q9):
515	mov    -0x48(%rdx),%r10
516	mov    %r10,-0x48(%rcx)
517L(P0Q8):
518	mov    -0x40(%rdx),%r9
519	mov    %r9,-0x40(%rcx)
520L(P0Q7):
521	mov    -0x38(%rdx),%r10
522	mov    %r10,-0x38(%rcx)
523L(P0Q6):
524	mov    -0x30(%rdx),%r9
525	mov    %r9,-0x30(%rcx)
526L(P0Q5):
527	mov    -0x28(%rdx),%r10
528	mov    %r10,-0x28(%rcx)
529L(P0Q4):
530	mov    -0x20(%rdx),%r9
531	mov    %r9,-0x20(%rcx)
532L(P0Q3):
533	mov    -0x18(%rdx),%r10
534	mov    %r10,-0x18(%rcx)
535L(P0Q2):
536	mov    -0x10(%rdx),%r9
537	mov    %r9,-0x10(%rcx)
538L(P0Q1):
539	mov    -0x8(%rdx),%r10
540	mov    %r10,-0x8(%rcx)
541L(P0Q0):
542	ret
543
544	.balign 16
545L(P1QF):
546	mov    -0x79(%rdx),%r9
547	mov    %r9,-0x79(%rcx)
548L(P1QE):
549	mov    -0x71(%rdx),%r11
550	mov    %r11,-0x71(%rcx)
551L(P1QD):
552	mov    -0x69(%rdx),%r10
553	mov    %r10,-0x69(%rcx)
554L(P1QC):
555	mov    -0x61(%rdx),%r9
556	mov    %r9,-0x61(%rcx)
557L(P1QB):
558	mov    -0x59(%rdx),%r11
559	mov    %r11,-0x59(%rcx)
560L(P1QA):
561	mov    -0x51(%rdx),%r10
562	mov    %r10,-0x51(%rcx)
563L(P1Q9):
564	mov    -0x49(%rdx),%r9
565	mov    %r9,-0x49(%rcx)
566L(P1Q8):
567	mov    -0x41(%rdx),%r11
568	mov    %r11,-0x41(%rcx)
569L(P1Q7):
570	mov    -0x39(%rdx),%r10
571	mov    %r10,-0x39(%rcx)
572L(P1Q6):
573	mov    -0x31(%rdx),%r9
574	mov    %r9,-0x31(%rcx)
575L(P1Q5):
576	mov    -0x29(%rdx),%r11
577	mov    %r11,-0x29(%rcx)
578L(P1Q4):
579	mov    -0x21(%rdx),%r10
580	mov    %r10,-0x21(%rcx)
581L(P1Q3):
582	mov    -0x19(%rdx),%r9
583	mov    %r9,-0x19(%rcx)
584L(P1Q2):
585	mov    -0x11(%rdx),%r11
586	mov    %r11,-0x11(%rcx)
587L(P1Q1):
588	mov    -0x9(%rdx),%r10
589	mov    %r10,-0x9(%rcx)
590L(P1Q0):
591	movzbq -0x1(%rdx),%r9
592	mov    %r9b,-0x1(%rcx)
593	ret
594
595	.balign 16
596L(P2QF):
597	mov    -0x7a(%rdx),%r9
598	mov    %r9,-0x7a(%rcx)
599L(P2QE):
600	mov    -0x72(%rdx),%r11
601	mov    %r11,-0x72(%rcx)
602L(P2QD):
603	mov    -0x6a(%rdx),%r10
604	mov    %r10,-0x6a(%rcx)
605L(P2QC):
606	mov    -0x62(%rdx),%r9
607	mov    %r9,-0x62(%rcx)
608L(P2QB):
609	mov    -0x5a(%rdx),%r11
610	mov    %r11,-0x5a(%rcx)
611L(P2QA):
612	mov    -0x52(%rdx),%r10
613	mov    %r10,-0x52(%rcx)
614L(P2Q9):
615	mov    -0x4a(%rdx),%r9
616	mov    %r9,-0x4a(%rcx)
617L(P2Q8):
618	mov    -0x42(%rdx),%r11
619	mov    %r11,-0x42(%rcx)
620L(P2Q7):
621	mov    -0x3a(%rdx),%r10
622	mov    %r10,-0x3a(%rcx)
623L(P2Q6):
624	mov    -0x32(%rdx),%r9
625	mov    %r9,-0x32(%rcx)
626L(P2Q5):
627	mov    -0x2a(%rdx),%r11
628	mov    %r11,-0x2a(%rcx)
629L(P2Q4):
630	mov    -0x22(%rdx),%r10
631	mov    %r10,-0x22(%rcx)
632L(P2Q3):
633	mov    -0x1a(%rdx),%r9
634	mov    %r9,-0x1a(%rcx)
635L(P2Q2):
636	mov    -0x12(%rdx),%r11
637	mov    %r11,-0x12(%rcx)
638L(P2Q1):
639	mov    -0xa(%rdx),%r10
640	mov    %r10,-0xa(%rcx)
641L(P2Q0):
642	movzwq -0x2(%rdx),%r9
643	mov    %r9w,-0x2(%rcx)
644	ret
645
646	.balign 16
647L(P3QF):
648	mov    -0x7b(%rdx),%r9
649	mov    %r9,-0x7b(%rcx)
650L(P3QE):
651	mov    -0x73(%rdx),%r11
652	mov    %r11,-0x73(%rcx)
653L(P3QD):
654	mov    -0x6b(%rdx),%r10
655	mov    %r10,-0x6b(%rcx)
656L(P3QC):
657	mov    -0x63(%rdx),%r9
658	mov    %r9,-0x63(%rcx)
659L(P3QB):
660	mov    -0x5b(%rdx),%r11
661	mov    %r11,-0x5b(%rcx)
662L(P3QA):
663	mov    -0x53(%rdx),%r10
664	mov    %r10,-0x53(%rcx)
665L(P3Q9):
666	mov    -0x4b(%rdx),%r9
667	mov    %r9,-0x4b(%rcx)
668L(P3Q8):
669	mov    -0x43(%rdx),%r11
670	mov    %r11,-0x43(%rcx)
671L(P3Q7):
672	mov    -0x3b(%rdx),%r10
673	mov    %r10,-0x3b(%rcx)
674L(P3Q6):
675	mov    -0x33(%rdx),%r9
676	mov    %r9,-0x33(%rcx)
677L(P3Q5):
678	mov    -0x2b(%rdx),%r11
679	mov    %r11,-0x2b(%rcx)
680L(P3Q4):
681	mov    -0x23(%rdx),%r10
682	mov    %r10,-0x23(%rcx)
683L(P3Q3):
684	mov    -0x1b(%rdx),%r9
685	mov    %r9,-0x1b(%rcx)
686L(P3Q2):
687	mov    -0x13(%rdx),%r11
688	mov    %r11,-0x13(%rcx)
689L(P3Q1):
690	mov    -0xb(%rdx),%r10
691	mov    %r10,-0xb(%rcx)
692	/*
693	 * These trailing loads/stores have to do all their loads 1st,
694	 * then do the stores.
695	 */
696L(P3Q0):
697	movzwq -0x3(%rdx),%r9
698	movzbq -0x1(%rdx),%r10
699	mov    %r9w,-0x3(%rcx)
700	mov    %r10b,-0x1(%rcx)
701	ret
702
703	.balign 16
704L(P4QF):
705	mov    -0x7c(%rdx),%r9
706	mov    %r9,-0x7c(%rcx)
707L(P4QE):
708	mov    -0x74(%rdx),%r11
709	mov    %r11,-0x74(%rcx)
710L(P4QD):
711	mov    -0x6c(%rdx),%r10
712	mov    %r10,-0x6c(%rcx)
713L(P4QC):
714	mov    -0x64(%rdx),%r9
715	mov    %r9,-0x64(%rcx)
716L(P4QB):
717	mov    -0x5c(%rdx),%r11
718	mov    %r11,-0x5c(%rcx)
719L(P4QA):
720	mov    -0x54(%rdx),%r10
721	mov    %r10,-0x54(%rcx)
722L(P4Q9):
723	mov    -0x4c(%rdx),%r9
724	mov    %r9,-0x4c(%rcx)
725L(P4Q8):
726	mov    -0x44(%rdx),%r11
727	mov    %r11,-0x44(%rcx)
728L(P4Q7):
729	mov    -0x3c(%rdx),%r10
730	mov    %r10,-0x3c(%rcx)
731L(P4Q6):
732	mov    -0x34(%rdx),%r9
733	mov    %r9,-0x34(%rcx)
734L(P4Q5):
735	mov    -0x2c(%rdx),%r11
736	mov    %r11,-0x2c(%rcx)
737L(P4Q4):
738	mov    -0x24(%rdx),%r10
739	mov    %r10,-0x24(%rcx)
740L(P4Q3):
741	mov    -0x1c(%rdx),%r9
742	mov    %r9,-0x1c(%rcx)
743L(P4Q2):
744	mov    -0x14(%rdx),%r11
745	mov    %r11,-0x14(%rcx)
746L(P4Q1):
747	mov    -0xc(%rdx),%r10
748	mov    %r10,-0xc(%rcx)
749L(P4Q0):
750	mov    -0x4(%rdx),%r9d
751	mov    %r9d,-0x4(%rcx)
752	ret
753
754	.balign 16
755L(P5QF):
756	mov    -0x7d(%rdx),%r9
757	mov    %r9,-0x7d(%rcx)
758L(P5QE):
759	mov    -0x75(%rdx),%r11
760	mov    %r11,-0x75(%rcx)
761L(P5QD):
762	mov    -0x6d(%rdx),%r10
763	mov    %r10,-0x6d(%rcx)
764L(P5QC):
765	mov    -0x65(%rdx),%r9
766	mov    %r9,-0x65(%rcx)
767L(P5QB):
768	mov    -0x5d(%rdx),%r11
769	mov    %r11,-0x5d(%rcx)
770L(P5QA):
771	mov    -0x55(%rdx),%r10
772	mov    %r10,-0x55(%rcx)
773L(P5Q9):
774	mov    -0x4d(%rdx),%r9
775	mov    %r9,-0x4d(%rcx)
776L(P5Q8):
777	mov    -0x45(%rdx),%r11
778	mov    %r11,-0x45(%rcx)
779L(P5Q7):
780	mov    -0x3d(%rdx),%r10
781	mov    %r10,-0x3d(%rcx)
782L(P5Q6):
783	mov    -0x35(%rdx),%r9
784	mov    %r9,-0x35(%rcx)
785L(P5Q5):
786	mov    -0x2d(%rdx),%r11
787	mov    %r11,-0x2d(%rcx)
788L(P5Q4):
789	mov    -0x25(%rdx),%r10
790	mov    %r10,-0x25(%rcx)
791L(P5Q3):
792	mov    -0x1d(%rdx),%r9
793	mov    %r9,-0x1d(%rcx)
794L(P5Q2):
795	mov    -0x15(%rdx),%r11
796	mov    %r11,-0x15(%rcx)
797L(P5Q1):
798	mov    -0xd(%rdx),%r10
799	mov    %r10,-0xd(%rcx)
800	/*
801	 * These trailing loads/stores have to do all their loads 1st,
802	 * then do the stores.
803	 */
804L(P5Q0):
805	mov    -0x5(%rdx),%r9d
806	movzbq -0x1(%rdx),%r10
807	mov    %r9d,-0x5(%rcx)
808	mov    %r10b,-0x1(%rcx)
809	ret
810
811	.balign 16
812L(P6QF):
813	mov    -0x7e(%rdx),%r9
814	mov    %r9,-0x7e(%rcx)
815L(P6QE):
816	mov    -0x76(%rdx),%r11
817	mov    %r11,-0x76(%rcx)
818L(P6QD):
819	mov    -0x6e(%rdx),%r10
820	mov    %r10,-0x6e(%rcx)
821L(P6QC):
822	mov    -0x66(%rdx),%r9
823	mov    %r9,-0x66(%rcx)
824L(P6QB):
825	mov    -0x5e(%rdx),%r11
826	mov    %r11,-0x5e(%rcx)
827L(P6QA):
828	mov    -0x56(%rdx),%r10
829	mov    %r10,-0x56(%rcx)
830L(P6Q9):
831	mov    -0x4e(%rdx),%r9
832	mov    %r9,-0x4e(%rcx)
833L(P6Q8):
834	mov    -0x46(%rdx),%r11
835	mov    %r11,-0x46(%rcx)
836L(P6Q7):
837	mov    -0x3e(%rdx),%r10
838	mov    %r10,-0x3e(%rcx)
839L(P6Q6):
840	mov    -0x36(%rdx),%r9
841	mov    %r9,-0x36(%rcx)
842L(P6Q5):
843	mov    -0x2e(%rdx),%r11
844	mov    %r11,-0x2e(%rcx)
845L(P6Q4):
846	mov    -0x26(%rdx),%r10
847	mov    %r10,-0x26(%rcx)
848L(P6Q3):
849	mov    -0x1e(%rdx),%r9
850	mov    %r9,-0x1e(%rcx)
851L(P6Q2):
852	mov    -0x16(%rdx),%r11
853	mov    %r11,-0x16(%rcx)
854L(P6Q1):
855	mov    -0xe(%rdx),%r10
856	mov    %r10,-0xe(%rcx)
857	/*
858	 * These trailing loads/stores have to do all their loads 1st,
859	 * then do the stores.
860	 */
861L(P6Q0):
862	mov    -0x6(%rdx),%r9d
863	movzwq -0x2(%rdx),%r10
864	mov    %r9d,-0x6(%rcx)
865	mov    %r10w,-0x2(%rcx)
866	ret
867
868	.balign 16
869L(P7QF):
870	mov    -0x7f(%rdx),%r9
871	mov    %r9,-0x7f(%rcx)
872L(P7QE):
873	mov    -0x77(%rdx),%r11
874	mov    %r11,-0x77(%rcx)
875L(P7QD):
876	mov    -0x6f(%rdx),%r10
877	mov    %r10,-0x6f(%rcx)
878L(P7QC):
879	mov    -0x67(%rdx),%r9
880	mov    %r9,-0x67(%rcx)
881L(P7QB):
882	mov    -0x5f(%rdx),%r11
883	mov    %r11,-0x5f(%rcx)
884L(P7QA):
885	mov    -0x57(%rdx),%r10
886	mov    %r10,-0x57(%rcx)
887L(P7Q9):
888	mov    -0x4f(%rdx),%r9
889	mov    %r9,-0x4f(%rcx)
890L(P7Q8):
891	mov    -0x47(%rdx),%r11
892	mov    %r11,-0x47(%rcx)
893L(P7Q7):
894	mov    -0x3f(%rdx),%r10
895	mov    %r10,-0x3f(%rcx)
896L(P7Q6):
897	mov    -0x37(%rdx),%r9
898	mov    %r9,-0x37(%rcx)
899L(P7Q5):
900	mov    -0x2f(%rdx),%r11
901	mov    %r11,-0x2f(%rcx)
902L(P7Q4):
903	mov    -0x27(%rdx),%r10
904	mov    %r10,-0x27(%rcx)
905L(P7Q3):
906	mov    -0x1f(%rdx),%r9
907	mov    %r9,-0x1f(%rcx)
908L(P7Q2):
909	mov    -0x17(%rdx),%r11
910	mov    %r11,-0x17(%rcx)
911L(P7Q1):
912	mov    -0xf(%rdx),%r10
913	mov    %r10,-0xf(%rcx)
914	/*
915	 * These trailing loads/stores have to do all their loads 1st,
916	 * then do the stores.
917	 */
918L(P7Q0):
919	mov    -0x7(%rdx),%r9d
920	movzwq -0x3(%rdx),%r10
921	movzbq -0x1(%rdx),%r11
922	mov    %r9d,-0x7(%rcx)
923	mov    %r10w,-0x3(%rcx)
924	mov    %r11b,-0x1(%rcx)
925	ret
926
927	.balign 16
928L(ck_use_sse2):
929	/*
930	 * Align dest to 16 byte boundary.
931	 */
932	test   $0xf,%rcx
933	jnz    L(ShrtAlignNew)
934
935L(now_qw_aligned):
936	cmpl   $NO_SSE,.memops_method(%rip)
937	je     L(Loop8byte_pre)
938
939	/*
940	 * The fall-through path is to do SSE2 16-byte load/stores
941	 */
942
943	/*
944	 * If current move size is larger than half of the highest level cache
945	 * size, then do non-temporal moves.
946	 */
947	mov    .largest_level_cache_size(%rip),%r9d
948	shr    %r9		# take half of it
949	cmp    %r9,%r8
950	jg     L(sse2_nt_move)
951
952	/*
953	 * If both the source and dest are 16-byte aligned, then use the
954	 * both-aligned logic. Well-aligned data should reap the rewards.
955	 */
956	test   $0xf,%rdx
957	jz     L(pre_both_aligned)
958
959	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
960	testl  $USE_SSSE3,.memops_method(%rip)
961	jz     1f
962	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
963
9641:
965	/*
966	 * if the src is not 16 byte aligned...
967	 */
968	mov    %rdx,%r11
969	and    $0xf,%r11
970	movdqu (%rdx),%xmm0
971	movdqa %xmm0,(%rcx)
972	add    $0x10,%rdx
973	sub    %r11,%rdx
974	add    $0x10,%rcx
975	sub    $0x10,%r8
976	movdqa (%rdx),%xmm1
977
978	movslq (%r10,%r11,4),%r9
979	lea    (%r9,%r10,1),%r10
980	jmpq   *%r10
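	/*
	 * The source is not 16-byte aligned.  The first 16 bytes were copied
	 * with an unaligned load above and %rdx was backed up to the preceding
	 * 16-byte source boundary, so it lags the true copy position by %r11
	 * bytes until L(movdqa_epi) adds %r11 back.  The aligned block holding
	 * the next source bytes is kept in %xmm1.  Dispatch on the source
	 * misalignment (%r11) to the matching L(movdqaN) (SSE2 shift-and-OR)
	 * or L(mov3dqaN) (SSSE3 palignr) loop; those loops perform only
	 * aligned 16-byte loads and reassemble the shifted data before each
	 * aligned store.
	 */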
981
982	    .balign 16
983L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
984	    .int        L(mov3dqa1) -L(SSSE3_src)
985	    .int        L(mov3dqa2) -L(SSSE3_src)
986	    .int        L(mov3dqa3) -L(SSSE3_src)
987	    .int        L(mov3dqa4) -L(SSSE3_src)
988	    .int        L(mov3dqa5) -L(SSSE3_src)
989	    .int        L(mov3dqa6) -L(SSSE3_src)
990	    .int        L(mov3dqa7) -L(SSSE3_src)
991	    .int        L(movdqa8)  -L(SSSE3_src)
992	    .int        L(mov3dqa9) -L(SSSE3_src)
993	    .int        L(mov3dqa10)-L(SSSE3_src)
994	    .int        L(mov3dqa11)-L(SSSE3_src)
995	    .int        L(mov3dqa12)-L(SSSE3_src)
996	    .int        L(mov3dqa13)-L(SSSE3_src)
997	    .int        L(mov3dqa14)-L(SSSE3_src)
998	    .int        L(mov3dqa15)-L(SSSE3_src)
999L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
1000	    .int        L(movdqa1) -L(SSE_src)
1001	    .int        L(movdqa2) -L(SSE_src)
1002	    .int        L(movdqa3) -L(SSE_src)
1003	    .int        L(movdqa4) -L(SSE_src)
1004	    .int        L(movdqa5) -L(SSE_src)
1005	    .int        L(movdqa6) -L(SSE_src)
1006	    .int        L(movdqa7) -L(SSE_src)
1007	    .int        L(movdqa8) -L(SSE_src)
1008	    .int        L(movdqa9) -L(SSE_src)
1009	    .int        L(movdqa10)-L(SSE_src)
1010	    .int        L(movdqa11)-L(SSE_src)
1011	    .int        L(movdqa12)-L(SSE_src)
1012	    .int        L(movdqa13)-L(SSE_src)
1013	    .int        L(movdqa14)-L(SSE_src)
1014	    .int        L(movdqa15)-L(SSE_src)
1015
1016	.balign 16
1017L(movdqa1):
1018	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
1019	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
1020	lea    0x20(%rdx),%rdx
1021	lea    -0x20(%r8),%r8
1022
1023	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
1024	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
1025	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
1026	por    %xmm1,%xmm3 # OR them together
1027	cmp    $0x20,%r8
1028
1029	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
1030	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
1031	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
1032	por    %xmm2,%xmm0 # OR them together
1033	movdqa %xmm3,(%rcx)     # store it
1034	movdqa %xmm0,0x10(%rcx) # store it
1035	lea    0x20(%rcx),%rcx
1036
1037	jge    L(movdqa1)
1038	jmp    L(movdqa_epi)
1039
1040	.balign 16
1041L(movdqa2):
1042	sub    $0x20,%r8
1043	movdqa 0x10(%rdx),%xmm3
1044	movdqa 0x20(%rdx),%xmm0
1045	add    $0x20,%rdx
1046
1047	psrldq $0x2,%xmm1
1048	movdqa %xmm3,%xmm2
1049	pslldq $0xe,%xmm3
1050	por    %xmm1,%xmm3
1051
1052	psrldq $0x2,%xmm2
1053	movdqa %xmm0,%xmm1
1054	pslldq $0xe,%xmm0
1055	por    %xmm2,%xmm0
1056	movdqa %xmm3,(%rcx)
1057	movdqa %xmm0,0x10(%rcx)
1058
1059	add    $0x20,%rcx
1060	cmp    $0x20,%r8
1061	jge    L(movdqa2)
1062	jmp    L(movdqa_epi)
1063
1064	.balign 16
1065L(movdqa3):
1066	sub    $0x20,%r8
1067	movdqa 0x10(%rdx),%xmm3
1068	movdqa 0x20(%rdx),%xmm0
1069	add    $0x20,%rdx
1070
1071	psrldq $0x3,%xmm1
1072	movdqa %xmm3,%xmm2
1073	pslldq $0xd,%xmm3
1074	por    %xmm1,%xmm3
1075
1076	psrldq $0x3,%xmm2
1077	movdqa %xmm0,%xmm1
1078	pslldq $0xd,%xmm0
1079	por    %xmm2,%xmm0
1080	movdqa %xmm3,(%rcx)
1081	movdqa %xmm0,0x10(%rcx)
1082
1083	add    $0x20,%rcx
1084	cmp    $0x20,%r8
1085	jge    L(movdqa3)
1086	jmp    L(movdqa_epi)
1087
1088	.balign 16
1089L(movdqa4):
1090	sub    $0x20,%r8
1091	movdqa 0x10(%rdx),%xmm3
1092	movdqa 0x20(%rdx),%xmm0
1093	add    $0x20,%rdx
1094
1095	psrldq $0x4,%xmm1
1096	movdqa %xmm3,%xmm2
1097	pslldq $0xc,%xmm3
1098	por    %xmm1,%xmm3
1099
1100	psrldq $0x4,%xmm2
1101	movdqa %xmm0,%xmm1
1102	pslldq $0xc,%xmm0
1103	por    %xmm2,%xmm0
1104
1105	movdqa %xmm3,(%rcx)
1106	movdqa %xmm0,0x10(%rcx)
1107
1108	add    $0x20,%rcx
1109	cmp    $0x20,%r8
1110	jge    L(movdqa4)
1111	jmp    L(movdqa_epi)
1112
1113	.balign 16
1114L(movdqa5):
1115	sub    $0x20,%r8
1116	movdqa 0x10(%rdx),%xmm3
1117	movdqa 0x20(%rdx),%xmm0
1118	add    $0x20,%rdx
1119
1120	psrldq $0x5,%xmm1
1121	movdqa %xmm3,%xmm2
1122	pslldq $0xb,%xmm3
1123	por    %xmm1,%xmm3
1124
1125	psrldq $0x5,%xmm2
1126	movdqa %xmm0,%xmm1
1127	pslldq $0xb,%xmm0
1128	por    %xmm2,%xmm0
1129
1130	movdqa %xmm3,(%rcx)
1131	movdqa %xmm0,0x10(%rcx)
1132
1133	add    $0x20,%rcx
1134	cmp    $0x20,%r8
1135	jge    L(movdqa5)
1136	jmp    L(movdqa_epi)
1137
1138	.balign 16
1139L(movdqa6):
1140	sub    $0x20,%r8
1141	movdqa 0x10(%rdx),%xmm3
1142	movdqa 0x20(%rdx),%xmm0
1143	add    $0x20,%rdx
1144
1145	psrldq $0x6,%xmm1
1146	movdqa %xmm3,%xmm2
1147	pslldq $0xa,%xmm3
1148	por    %xmm1,%xmm3
1149
1150	psrldq $0x6,%xmm2
1151	movdqa %xmm0,%xmm1
1152	pslldq $0xa,%xmm0
1153	por    %xmm2,%xmm0
1154	movdqa %xmm3,(%rcx)
1155	movdqa %xmm0,0x10(%rcx)
1156
1157	add    $0x20,%rcx
1158	cmp    $0x20,%r8
1159	jge    L(movdqa6)
1160	jmp    L(movdqa_epi)
1161
1162	.balign 16
1163L(movdqa7):
1164	sub    $0x20,%r8
1165	movdqa 0x10(%rdx),%xmm3
1166	movdqa 0x20(%rdx),%xmm0
1167	add    $0x20,%rdx
1168
1169	psrldq $0x7,%xmm1
1170	movdqa %xmm3,%xmm2
1171	pslldq $0x9,%xmm3
1172	por    %xmm1,%xmm3
1173
1174	psrldq $0x7,%xmm2
1175	movdqa %xmm0,%xmm1
1176	pslldq $0x9,%xmm0
1177	por    %xmm2,%xmm0
1178	movdqa %xmm3,(%rcx)
1179	movdqa %xmm0,0x10(%rcx)
1180
1181	add    $0x20,%rcx
1182	cmp    $0x20,%r8
1183	jge    L(movdqa7)
1184	jmp    L(movdqa_epi)
1185
1186	.balign 16
1187L(movdqa8):
1188	movdqa 0x10(%rdx),%xmm3
1189	sub    $0x30,%r8
1190	movdqa 0x20(%rdx),%xmm0
1191	movdqa 0x30(%rdx),%xmm5
1192	lea    0x30(%rdx),%rdx
1193
1194	shufpd $0x1,%xmm3,%xmm1
1195	movdqa %xmm1,(%rcx)
1196
1197	cmp    $0x30,%r8
1198
1199	shufpd $0x1,%xmm0,%xmm3
1200	movdqa %xmm3,0x10(%rcx)
1201
1202	movdqa %xmm5,%xmm1
1203	shufpd $0x1,%xmm5,%xmm0
1204	movdqa %xmm0,0x20(%rcx)
1205
1206	lea    0x30(%rcx),%rcx
1207
1208	jge    L(movdqa8)
1209	jmp    L(movdqa_epi)
1210
1211	.balign 16
1212L(movdqa9):
1213	sub    $0x20,%r8
1214	movdqa 0x10(%rdx),%xmm3
1215	movdqa 0x20(%rdx),%xmm0
1216	add    $0x20,%rdx
1217
1218	psrldq $0x9,%xmm1
1219	movdqa %xmm3,%xmm2
1220	pslldq $0x7,%xmm3
1221	por    %xmm1,%xmm3
1222
1223	psrldq $0x9,%xmm2
1224	movdqa %xmm0,%xmm1
1225	pslldq $0x7,%xmm0
1226	por    %xmm2,%xmm0
1227	movdqa %xmm3,(%rcx)
1228	movdqa %xmm0,0x10(%rcx)
1229
1230	add    $0x20,%rcx
1231	cmp    $0x20,%r8
1232	jge    L(movdqa9)
1233	jmp    L(movdqa_epi)
1234
1235	.balign 16
1236L(movdqa10):
1237	sub    $0x20,%r8
1238	movdqa 0x10(%rdx),%xmm3
1239	movdqa 0x20(%rdx),%xmm0
1240	add    $0x20,%rdx
1241
1242	psrldq $0xa,%xmm1
1243	movdqa %xmm3,%xmm2
1244	pslldq $0x6,%xmm3
1245	por    %xmm1,%xmm3
1246
1247	psrldq $0xa,%xmm2
1248	movdqa %xmm0,%xmm1
1249	pslldq $0x6,%xmm0
1250	por    %xmm2,%xmm0
1251	movdqa %xmm3,(%rcx)
1252	movdqa %xmm0,0x10(%rcx)
1253
1254	add    $0x20,%rcx
1255	cmp    $0x20,%r8
1256	jge    L(movdqa10)
1257	jmp    L(movdqa_epi)
1258
1259	.balign 16
1260L(movdqa11):
1261	sub    $0x20,%r8
1262	movdqa 0x10(%rdx),%xmm3
1263	movdqa 0x20(%rdx),%xmm0
1264	add    $0x20,%rdx
1265
1266	psrldq $0xb,%xmm1
1267	movdqa %xmm3,%xmm2
1268	pslldq $0x5,%xmm3
1269	por    %xmm1,%xmm3
1270
1271	psrldq $0xb,%xmm2
1272	movdqa %xmm0,%xmm1
1273	pslldq $0x5,%xmm0
1274	por    %xmm2,%xmm0
1275	movdqa %xmm3,(%rcx)
1276	movdqa %xmm0,0x10(%rcx)
1277
1278	add    $0x20,%rcx
1279	cmp    $0x20,%r8
1280	jge    L(movdqa11)
1281	jmp    L(movdqa_epi)
1282
1283	.balign 16
1284L(movdqa12):
1285	sub    $0x20,%r8
1286	movdqa 0x10(%rdx),%xmm3
1287	movdqa 0x20(%rdx),%xmm0
1288	add    $0x20,%rdx
1289
1290	psrldq $0xc,%xmm1
1291	movdqa %xmm3,%xmm2
1292	pslldq $0x4,%xmm3
1293	por    %xmm1,%xmm3
1294
1295	psrldq $0xc,%xmm2
1296	movdqa %xmm0,%xmm1
1297	pslldq $0x4,%xmm0
1298	por    %xmm2,%xmm0
1299	movdqa %xmm3,(%rcx)
1300	movdqa %xmm0,0x10(%rcx)
1301
1302	add    $0x20,%rcx
1303	cmp    $0x20,%r8
1304	jge    L(movdqa12)
1305	jmp    L(movdqa_epi)
1306
1307	.balign 16
1308L(movdqa13):
1309	sub    $0x20,%r8
1310	movdqa 0x10(%rdx),%xmm3
1311	movdqa 0x20(%rdx),%xmm0
1312	add    $0x20,%rdx
1313
1314	psrldq $0xd,%xmm1
1315	movdqa %xmm3,%xmm2
1316	pslldq $0x3,%xmm3
1317	por    %xmm1,%xmm3
1318
1319	psrldq $0xd,%xmm2
1320	movdqa %xmm0,%xmm1
1321	pslldq $0x3,%xmm0
1322	por    %xmm2,%xmm0
1323	movdqa %xmm3,(%rcx)
1324	movdqa %xmm0,0x10(%rcx)
1325
1326	add    $0x20,%rcx
1327	cmp    $0x20,%r8
1328	jge    L(movdqa13)
1329	jmp    L(movdqa_epi)
1330
1331	.balign 16
1332L(movdqa14):
1333	sub    $0x20,%r8
1334	movdqa 0x10(%rdx),%xmm3
1335	movdqa 0x20(%rdx),%xmm0
1336	add    $0x20,%rdx
1337
1338	psrldq $0xe,%xmm1
1339	movdqa %xmm3,%xmm2
1340	pslldq $0x2,%xmm3
1341	por    %xmm1,%xmm3
1342
1343	psrldq $0xe,%xmm2
1344	movdqa %xmm0,%xmm1
1345	pslldq $0x2,%xmm0
1346	por    %xmm2,%xmm0
1347	movdqa %xmm3,(%rcx)
1348	movdqa %xmm0,0x10(%rcx)
1349
1350	add    $0x20,%rcx
1351	cmp    $0x20,%r8
1352	jge    L(movdqa14)
1353	jmp    L(movdqa_epi)
1354
1355	.balign 16
1356L(movdqa15):
1357	sub    $0x20,%r8
1358	movdqa 0x10(%rdx),%xmm3
1359	movdqa 0x20(%rdx),%xmm0
1360	add    $0x20,%rdx
1361
1362	psrldq $0xf,%xmm1
1363	movdqa %xmm3,%xmm2
1364	pslldq $0x1,%xmm3
1365	por    %xmm1,%xmm3
1366
1367	psrldq $0xf,%xmm2
1368	movdqa %xmm0,%xmm1
1369	pslldq $0x1,%xmm0
1370	por    %xmm2,%xmm0
1371	movdqa %xmm3,(%rcx)
1372	movdqa %xmm0,0x10(%rcx)
1373
1374	add    $0x20,%rcx
1375	cmp    $0x20,%r8
1376	jge    L(movdqa15)
1377	#jmp   L(movdqa_epi)
1378
1379	.balign 16
1380L(movdqa_epi):
1381	lea    L(fwdPxQx)(%rip),%r10
1382	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
1383	add    %r8,%rcx
1384	add    %r8,%rdx
1385
1386	movslq (%r10,%r8,4),%r9
1387	lea    (%r9,%r10,1),%r10
1388	jmpq   *%r10
1389
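	/*
	 * The L(mov3dqaN) loops below are the SSSE3 variants: they move 48
	 * bytes per iteration and concatenate/shift the unaligned source data
	 * with palignr instead of the psrldq/pslldq/por sequence used above.
	 * The palignr instructions are emitted as raw .byte sequences
	 * (0x66,0x0f,0x3a,0x0f = palignr), with the intended mnemonic shown in
	 * the comment above each one, presumably so that the file still
	 * assembles with tools that predate SSSE3 support.
	 */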
1390	.balign 16
1391L(mov3dqa1):
1392	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
1393	sub	$0x30,%r8
1394	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
1395	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
1396	lea	0x30(%rdx),%rdx
1397	cmp	$0x30,%r8
1398
1399	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
1400	#palignr	$0x1,%xmm1,%xmm3
1401	.byte	0x66,0x0f,0x3a,0x0f
1402	.byte	0xd9,0x01
1403	movdqa	%xmm3,(%rcx)      # store it
1404
1405	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
1406	#palignr	$0x1,%xmm2,%xmm0
1407	.byte	0x66,0x0f,0x3a,0x0f
1408	.byte	0xc2,0x01
1409	movdqa	%xmm0,0x10(%rcx)  # store it
1410
1411	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
1412	#palignr	$0x1,%xmm4,%xmm5
1413	.byte	0x66,0x0f,0x3a,0x0f
1414	.byte	0xec,0x01
1415	movdqa	%xmm5,0x20(%rcx)  # store it
1416
1417	lea	0x30(%rcx),%rcx
1418	jge	L(mov3dqa1)
1419
1420	cmp	$0x10,%r8
1421	jl	L(movdqa_epi)
1422	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1423	sub	$0x10,%r8
1424	lea	0x10(%rdx),%rdx
1425	movdqa	%xmm3,%xmm2		# save for use next concat
1426	#palignr	$0x1,%xmm1,%xmm3
1427	.byte	0x66,0x0f,0x3a,0x0f
1428	.byte	0xd9,0x01
1429
1430	cmp	$0x10,%r8
1431	movdqa	%xmm3,(%rcx)      	# store it
1432	lea	0x10(%rcx),%rcx
1433	jl	L(movdqa_epi)
1434
1435	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1436	sub	$0x10,%r8
1437	lea	0x10(%rdx),%rdx
1438	#palignr	$0x1,%xmm2,%xmm0
1439	.byte	0x66,0x0f,0x3a,0x0f
1440	.byte	0xc2,0x01
1441	movdqa	%xmm0,(%rcx)      	# store it
1442	lea	0x10(%rcx),%rcx
1443	jmp	L(movdqa_epi)
1444
1445	.balign 16
1446L(mov3dqa2):
1447	movdqa	0x10(%rdx),%xmm3
1448	sub	$0x30,%r8
1449	movdqa	0x20(%rdx),%xmm0
1450	movdqa	0x30(%rdx),%xmm5
1451	lea	0x30(%rdx),%rdx
1452	cmp	$0x30,%r8
1453
1454	movdqa	%xmm3,%xmm2
1455	#palignr	$0x2,%xmm1,%xmm3
1456	.byte	0x66,0x0f,0x3a,0x0f
1457	.byte	0xd9,0x02
1458	movdqa	%xmm3,(%rcx)
1459
1460	movdqa	%xmm0,%xmm4
1461	#palignr	$0x2,%xmm2,%xmm0
1462	.byte	0x66,0x0f,0x3a,0x0f
1463	.byte	0xc2,0x02
1464	movdqa	%xmm0,0x10(%rcx)
1465
1466	movdqa	%xmm5,%xmm1
1467	#palignr	$0x2,%xmm4,%xmm5
1468	.byte	0x66,0x0f,0x3a,0x0f
1469	.byte	0xec,0x02
1470	movdqa	%xmm5,0x20(%rcx)
1471
1472	lea	0x30(%rcx),%rcx
1473	jge	L(mov3dqa2)
1474
1475	cmp	$0x10,%r8
1476	jl	L(movdqa_epi)
1477	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1478	sub	$0x10,%r8
1479	lea	0x10(%rdx),%rdx
1480	movdqa	%xmm3,%xmm2		# save for use next concat
1481	#palignr	$0x2,%xmm1,%xmm3
1482	.byte	0x66,0x0f,0x3a,0x0f
1483	.byte	0xd9,0x02
1484
1485	cmp	$0x10,%r8
1486	movdqa	%xmm3,(%rcx)      	# store it
1487	lea	0x10(%rcx),%rcx
1488	jl	L(movdqa_epi)
1489
1490	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1491	sub	$0x10,%r8
1492	lea	0x10(%rdx),%rdx
1493	#palignr	$0x2,%xmm2,%xmm0
1494	.byte	0x66,0x0f,0x3a,0x0f
1495	.byte	0xc2,0x02
1496	movdqa	%xmm0,(%rcx)      	# store it
1497	lea	0x10(%rcx),%rcx
1498	jmp	L(movdqa_epi)
1499
1500	.balign 16
1501L(mov3dqa3):
1502	movdqa	0x10(%rdx),%xmm3
1503	sub	$0x30,%r8
1504	movdqa	0x20(%rdx),%xmm0
1505	movdqa	0x30(%rdx),%xmm5
1506	lea	0x30(%rdx),%rdx
1507	cmp	$0x30,%r8
1508
1509	movdqa	%xmm3,%xmm2
1510	#palignr	$0x3,%xmm1,%xmm3
1511	.byte	0x66,0x0f,0x3a,0x0f
1512	.byte	0xd9,0x03
1513	movdqa	%xmm3,(%rcx)
1514
1515	movdqa	%xmm0,%xmm4
1516	#palignr	$0x3,%xmm2,%xmm0
1517	.byte	0x66,0x0f,0x3a,0x0f
1518	.byte	0xc2,0x03
1519	movdqa	%xmm0,0x10(%rcx)
1520
1521	movdqa	%xmm5,%xmm1
1522	#palignr	$0x3,%xmm4,%xmm5
1523	.byte	0x66,0x0f,0x3a,0x0f
1524	.byte	0xec,0x03
1525	movdqa	%xmm5,0x20(%rcx)
1526
1527	lea	0x30(%rcx),%rcx
1528	jge	L(mov3dqa3)
1529
1530	cmp	$0x10,%r8
1531	jl	L(movdqa_epi)
1532	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1533	sub	$0x10,%r8
1534	lea	0x10(%rdx),%rdx
1535	movdqa	%xmm3,%xmm2		# save for use next concat
1536	#palignr	$0x3,%xmm1,%xmm3
1537	.byte	0x66,0x0f,0x3a,0x0f
1538	.byte	0xd9,0x03
1539
1540	cmp	$0x10,%r8
1541	movdqa	%xmm3,(%rcx)      	# store it
1542	lea	0x10(%rcx),%rcx
1543	jl	L(movdqa_epi)
1544
1545	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1546	sub	$0x10,%r8
1547	lea	0x10(%rdx),%rdx
1548	#palignr	$0x3,%xmm2,%xmm0
1549	.byte	0x66,0x0f,0x3a,0x0f
1550	.byte	0xc2,0x03
1551	movdqa	%xmm0,(%rcx)      	# store it
1552	lea	0x10(%rcx),%rcx
1553	jmp	L(movdqa_epi)
1554
1555	.balign 16
1556L(mov3dqa4):
1557	movdqa	0x10(%rdx),%xmm3
1558	sub	$0x30,%r8
1559	movdqa	0x20(%rdx),%xmm0
1560	movdqa	0x30(%rdx),%xmm5
1561	lea	0x30(%rdx),%rdx
1562	cmp	$0x30,%r8
1563
1564	movdqa	%xmm3,%xmm2
1565	#palignr	$0x4,%xmm1,%xmm3
1566	.byte	0x66,0x0f,0x3a,0x0f
1567	.byte	0xd9,0x04
1568	movdqa	%xmm3,(%rcx)
1569
1570	movdqa	%xmm0,%xmm4
1571	#palignr	$0x4,%xmm2,%xmm0
1572	.byte	0x66,0x0f,0x3a,0x0f
1573	.byte	0xc2,0x04
1574	movdqa	%xmm0,0x10(%rcx)
1575
1576	movdqa	%xmm5,%xmm1
1577	#palignr	$0x4,%xmm4,%xmm5
1578	.byte	0x66,0x0f,0x3a,0x0f
1579	.byte	0xec,0x04
1580	movdqa	%xmm5,0x20(%rcx)
1581
1582	lea	0x30(%rcx),%rcx
1583	jge	L(mov3dqa4)
1584
1585	cmp	$0x10,%r8
1586	jl	L(movdqa_epi)
1587	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1588	sub	$0x10,%r8
1589	lea	0x10(%rdx),%rdx
1590	movdqa	%xmm3,%xmm2		# save for use next concat
1591	#palignr	$0x4,%xmm1,%xmm3
1592	.byte	0x66,0x0f,0x3a,0x0f
1593	.byte	0xd9,0x04
1594
1595	cmp	$0x10,%r8
1596	movdqa	%xmm3,(%rcx)      	# store it
1597	lea	0x10(%rcx),%rcx
1598	jl	L(movdqa_epi)
1599
1600	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1601	sub	$0x10,%r8
1602	lea	0x10(%rdx),%rdx
1603	#palignr	$0x4,%xmm2,%xmm0
1604	.byte	0x66,0x0f,0x3a,0x0f
1605	.byte	0xc2,0x04
1606	movdqa	%xmm0,(%rcx)      	# store it
1607	lea	0x10(%rcx),%rcx
1608	jmp	L(movdqa_epi)
1609
1610	.balign 16
1611L(mov3dqa5):
1612	movdqa	0x10(%rdx),%xmm3
1613	sub	$0x30,%r8
1614	movdqa	0x20(%rdx),%xmm0
1615	movdqa	0x30(%rdx),%xmm5
1616	lea	0x30(%rdx),%rdx
1617	cmp	$0x30,%r8
1618
1619	movdqa	%xmm3,%xmm2
1620	#palignr	$0x5,%xmm1,%xmm3
1621	.byte	0x66,0x0f,0x3a,0x0f
1622	.byte	0xd9,0x05
1623	movdqa	%xmm3,(%rcx)
1624
1625	movdqa	%xmm0,%xmm4
1626	#palignr	$0x5,%xmm2,%xmm0
1627	.byte	0x66,0x0f,0x3a,0x0f
1628	.byte	0xc2,0x05
1629	movdqa	%xmm0,0x10(%rcx)
1630
1631	movdqa	%xmm5,%xmm1
1632	#palignr	$0x5,%xmm4,%xmm5
1633	.byte	0x66,0x0f,0x3a,0x0f
1634	.byte	0xec,0x05
1635	movdqa	%xmm5,0x20(%rcx)
1636
1637	lea	0x30(%rcx),%rcx
1638	jge	L(mov3dqa5)
1639
1640	cmp	$0x10,%r8
1641	jl	L(movdqa_epi)
1642	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1643	sub	$0x10,%r8
1644	lea	0x10(%rdx),%rdx
1645	movdqa	%xmm3,%xmm2		# save for use next concat
1646	#palignr	$0x5,%xmm1,%xmm3
1647	.byte	0x66,0x0f,0x3a,0x0f
1648	.byte	0xd9,0x05
1649
1650	cmp	$0x10,%r8
1651	movdqa	%xmm3,(%rcx)      	# store it
1652	lea	0x10(%rcx),%rcx
1653	jl	L(movdqa_epi)
1654
1655	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1656	sub	$0x10,%r8
1657	lea	0x10(%rdx),%rdx
1658	#palignr	$0x5,%xmm2,%xmm0
1659	.byte	0x66,0x0f,0x3a,0x0f
1660	.byte	0xc2,0x05
1661	movdqa	%xmm0,(%rcx)      	# store it
1662	lea	0x10(%rcx),%rcx
1663	jmp	L(movdqa_epi)
1664
1665	.balign 16
1666L(mov3dqa6):
1667	movdqa	0x10(%rdx),%xmm3
1668	sub	$0x30,%r8
1669	movdqa	0x20(%rdx),%xmm0
1670	movdqa	0x30(%rdx),%xmm5
1671	lea	0x30(%rdx),%rdx
1672	cmp	$0x30,%r8
1673
1674	movdqa	%xmm3,%xmm2
1675	#palignr	$0x6,%xmm1,%xmm3
1676	.byte	0x66,0x0f,0x3a,0x0f
1677	.byte	0xd9,0x06
1678	movdqa	%xmm3,(%rcx)
1679
1680	movdqa	%xmm0,%xmm4
1681	#palignr	$0x6,%xmm2,%xmm0
1682	.byte	0x66,0x0f,0x3a,0x0f
1683	.byte	0xc2,0x06
1684	movdqa	%xmm0,0x10(%rcx)
1685
1686	movdqa	%xmm5,%xmm1
1687	#palignr	$0x6,%xmm4,%xmm5
1688	.byte	0x66,0x0f,0x3a,0x0f
1689	.byte	0xec,0x06
1690	movdqa	%xmm5,0x20(%rcx)
1691
1692	lea	0x30(%rcx),%rcx
1693	jge	L(mov3dqa6)
1694
1695	cmp	$0x10,%r8
1696	jl	L(movdqa_epi)
1697	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1698	sub	$0x10,%r8
1699	lea	0x10(%rdx),%rdx
1700	movdqa	%xmm3,%xmm2		# save for use next concat
1701	#palignr	$0x6,%xmm1,%xmm3
1702	.byte	0x66,0x0f,0x3a,0x0f
1703	.byte	0xd9,0x06
1704
1705	cmp	$0x10,%r8
1706	movdqa	%xmm3,(%rcx)      	# store it
1707	lea	0x10(%rcx),%rcx
1708	jl	L(movdqa_epi)
1709
1710	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1711	sub	$0x10,%r8
1712	lea	0x10(%rdx),%rdx
1713	#palignr	$0x6,%xmm2,%xmm0
1714	.byte	0x66,0x0f,0x3a,0x0f
1715	.byte	0xc2,0x06
1716	movdqa	%xmm0,(%rcx)      	# store it
1717	lea	0x10(%rcx),%rcx
1718	jmp	L(movdqa_epi)
1719
1720	.balign 16
1721L(mov3dqa7):
1722	movdqa	0x10(%rdx),%xmm3
1723	sub	$0x30,%r8
1724	movdqa	0x20(%rdx),%xmm0
1725	movdqa	0x30(%rdx),%xmm5
1726	lea	0x30(%rdx),%rdx
1727	cmp	$0x30,%r8
1728
1729	movdqa	%xmm3,%xmm2
1730	#palignr	$0x7,%xmm1,%xmm3
1731	.byte	0x66,0x0f,0x3a,0x0f
1732	.byte	0xd9,0x07
1733	movdqa	%xmm3,(%rcx)
1734
1735	movdqa	%xmm0,%xmm4
1736	#palignr	$0x7,%xmm2,%xmm0
1737	.byte	0x66,0x0f,0x3a,0x0f
1738	.byte	0xc2,0x07
1739	movdqa	%xmm0,0x10(%rcx)
1740
1741	movdqa	%xmm5,%xmm1
1742	#palignr	$0x7,%xmm4,%xmm5
1743	.byte	0x66,0x0f,0x3a,0x0f
1744	.byte	0xec,0x07
1745	movdqa	%xmm5,0x20(%rcx)
1746
1747	lea	0x30(%rcx),%rcx
1748	jge	L(mov3dqa7)
1749
1750	cmp	$0x10,%r8
1751	jl	L(movdqa_epi)
1752	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1753	sub	$0x10,%r8
1754	lea	0x10(%rdx),%rdx
1755	movdqa	%xmm3,%xmm2		# save for use next concat
1756	#palignr	$0x7,%xmm1,%xmm3
1757	.byte	0x66,0x0f,0x3a,0x0f
1758	.byte	0xd9,0x07
1759
1760	cmp	$0x10,%r8
1761	movdqa	%xmm3,(%rcx)      	# store it
1762	lea	0x10(%rcx),%rcx
1763	jl	L(movdqa_epi)
1764
1765	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1766	sub	$0x10,%r8
1767	lea	0x10(%rdx),%rdx
1768	#palignr	$0x7,%xmm2,%xmm0
1769	.byte	0x66,0x0f,0x3a,0x0f
1770	.byte	0xc2,0x07
1771	movdqa	%xmm0,(%rcx)      	# store it
1772	lea	0x10(%rcx),%rcx
1773	jmp	L(movdqa_epi)
1774
1775	.balign 16
1776L(mov3dqa9):
1777	movdqa	0x10(%rdx),%xmm3
1778	sub	$0x30,%r8
1779	movdqa	0x20(%rdx),%xmm0
1780	movdqa	0x30(%rdx),%xmm5
1781	lea	0x30(%rdx),%rdx
1782	cmp	$0x30,%r8
1783
1784	movdqa	%xmm3,%xmm2
1785	#palignr	$0x9,%xmm1,%xmm3
1786	.byte	0x66,0x0f,0x3a,0x0f
1787	.byte	0xd9,0x09
1788	movdqa	%xmm3,(%rcx)
1789
1790	movdqa	%xmm0,%xmm4
1791	#palignr	$0x9,%xmm2,%xmm0
1792	.byte	0x66,0x0f,0x3a,0x0f
1793	.byte	0xc2,0x09
1794	movdqa	%xmm0,0x10(%rcx)
1795
1796	movdqa	%xmm5,%xmm1
1797	#palignr	$0x9,%xmm4,%xmm5
1798	.byte	0x66,0x0f,0x3a,0x0f
1799	.byte	0xec,0x09
1800	movdqa	%xmm5,0x20(%rcx)
1801
1802	lea	0x30(%rcx),%rcx
1803	jge	L(mov3dqa9)
1804
1805	cmp	$0x10,%r8
1806	jl	L(movdqa_epi)
1807	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1808	sub	$0x10,%r8
1809	lea	0x10(%rdx),%rdx
1810	movdqa	%xmm3,%xmm2		# save for use next concat
1811	#palignr	$0x9,%xmm1,%xmm3
1812	.byte	0x66,0x0f,0x3a,0x0f
1813	.byte	0xd9,0x09
1814
1815	cmp	$0x10,%r8
1816	movdqa	%xmm3,(%rcx)      	# store it
1817	lea	0x10(%rcx),%rcx
1818	jl	L(movdqa_epi)
1819
1820	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1821	sub	$0x10,%r8
1822	lea	0x10(%rdx),%rdx
1823	#palignr	$0x9,%xmm2,%xmm0
1824	.byte	0x66,0x0f,0x3a,0x0f
1825	.byte	0xc2,0x09
1826	movdqa	%xmm0,(%rcx)      	# store it
1827	lea	0x10(%rcx),%rcx
1828	jmp	L(movdqa_epi)
1829
1830	.balign 16
1831L(mov3dqa10):
1832	movdqa	0x10(%rdx),%xmm3
1833	sub	$0x30,%r8
1834	movdqa	0x20(%rdx),%xmm0
1835	movdqa	0x30(%rdx),%xmm5
1836	lea	0x30(%rdx),%rdx
1837	cmp	$0x30,%r8
1838
1839	movdqa	%xmm3,%xmm2
1840	#palignr	$0xa,%xmm1,%xmm3
1841	.byte	0x66,0x0f,0x3a,0x0f
1842	.byte	0xd9,0x0a
1843	movdqa	%xmm3,(%rcx)
1844
1845	movdqa	%xmm0,%xmm4
1846	#palignr	$0xa,%xmm2,%xmm0
1847	.byte	0x66,0x0f,0x3a,0x0f
1848	.byte	0xc2,0x0a
1849	movdqa	%xmm0,0x10(%rcx)
1850
1851	movdqa	%xmm5,%xmm1
1852	#palignr	$0xa,%xmm4,%xmm5
1853	.byte	0x66,0x0f,0x3a,0x0f
1854	.byte	0xec,0x0a
1855	movdqa	%xmm5,0x20(%rcx)
1856
1857	lea	0x30(%rcx),%rcx
1858	jge	L(mov3dqa10)
1859
1860	cmp	$0x10,%r8
1861	jl	L(movdqa_epi)
1862	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1863	sub	$0x10,%r8
1864	lea	0x10(%rdx),%rdx
1865	movdqa	%xmm3,%xmm2		# save for use next concat
1866	#palignr	$0xa,%xmm1,%xmm3
1867	.byte	0x66,0x0f,0x3a,0x0f
1868	.byte	0xd9,0x0a
1869
1870	cmp	$0x10,%r8
1871	movdqa	%xmm3,(%rcx)      	# store it
1872	lea	0x10(%rcx),%rcx
1873	jl	L(movdqa_epi)
1874
1875	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1876	sub	$0x10,%r8
1877	lea	0x10(%rdx),%rdx
1878	#palignr	$0xa,%xmm2,%xmm0
1879	.byte	0x66,0x0f,0x3a,0x0f
1880	.byte	0xc2,0x0a
1881	movdqa	%xmm0,(%rcx)      	# store it
1882	lea	0x10(%rcx),%rcx
1883	jmp	L(movdqa_epi)
1884
1885	.balign 16
1886L(mov3dqa11):
1887	movdqa	0x10(%rdx),%xmm3
1888	sub	$0x30,%r8
1889	movdqa	0x20(%rdx),%xmm0
1890	movdqa	0x30(%rdx),%xmm5
1891	lea	0x30(%rdx),%rdx
1892	cmp	$0x30,%r8
1893
1894	movdqa	%xmm3,%xmm2
1895	#palignr	$0xb,%xmm1,%xmm3
1896	.byte	0x66,0x0f,0x3a,0x0f
1897	.byte	0xd9,0x0b
1898	movdqa	%xmm3,(%rcx)
1899
1900	movdqa	%xmm0,%xmm4
1901	#palignr	$0xb,%xmm2,%xmm0
1902	.byte	0x66,0x0f,0x3a,0x0f
1903	.byte	0xc2,0x0b
1904	movdqa	%xmm0,0x10(%rcx)
1905
1906	movdqa	%xmm5,%xmm1
1907	#palignr	$0xb,%xmm4,%xmm5
1908	.byte	0x66,0x0f,0x3a,0x0f
1909	.byte	0xec,0x0b
1910	movdqa	%xmm5,0x20(%rcx)
1911
1912	lea	0x30(%rcx),%rcx
1913	jge	L(mov3dqa11)
1914
1915	cmp	$0x10,%r8
1916	jl	L(movdqa_epi)
1917	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1918	sub	$0x10,%r8
1919	lea	0x10(%rdx),%rdx
1920	movdqa	%xmm3,%xmm2		# save for use next concat
1921	#palignr	$0xb,%xmm1,%xmm3
1922	.byte	0x66,0x0f,0x3a,0x0f
1923	.byte	0xd9,0x0b
1924
1925	cmp	$0x10,%r8
1926	movdqa	%xmm3,(%rcx)      	# store it
1927	lea	0x10(%rcx),%rcx
1928	jl	L(movdqa_epi)
1929
1930	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1931	sub	$0x10,%r8
1932	lea	0x10(%rdx),%rdx
1933	#palignr	$0xb,%xmm2,%xmm0
1934	.byte	0x66,0x0f,0x3a,0x0f
1935	.byte	0xc2,0x0b
1936	movdqa	%xmm0,(%rcx)      	# store it
1937	lea	0x10(%rcx),%rcx
1938	jmp	L(movdqa_epi)
1939
1940	.balign 16
1941L(mov3dqa12):
1942	movdqa	0x10(%rdx),%xmm3
1943	sub	$0x30,%r8
1944	movdqa	0x20(%rdx),%xmm0
1945	movdqa	0x30(%rdx),%xmm5
1946	lea	0x30(%rdx),%rdx
1947	cmp	$0x30,%r8
1948
1949	movdqa	%xmm3,%xmm2
1950	#palignr	$0xc,%xmm1,%xmm3
1951	.byte	0x66,0x0f,0x3a,0x0f
1952	.byte	0xd9,0x0c
1953	movdqa	%xmm3,(%rcx)
1954
1955	movdqa	%xmm0,%xmm4
1956	#palignr	$0xc,%xmm2,%xmm0
1957	.byte	0x66,0x0f,0x3a,0x0f
1958	.byte	0xc2,0x0c
1959	movdqa	%xmm0,0x10(%rcx)
1960
1961	movdqa	%xmm5,%xmm1
1962	#palignr	$0xc,%xmm4,%xmm5
1963	.byte	0x66,0x0f,0x3a,0x0f
1964	.byte	0xec,0x0c
1965	movdqa	%xmm5,0x20(%rcx)
1966
1967	lea	0x30(%rcx),%rcx
1968	jge	L(mov3dqa12)
1969
1970	cmp	$0x10,%r8
1971	jl	L(movdqa_epi)
1972	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1973	sub	$0x10,%r8
1974	lea	0x10(%rdx),%rdx
1975	movdqa	%xmm3,%xmm2		# save for use next concat
1976	#palignr	$0xc,%xmm1,%xmm3
1977	.byte	0x66,0x0f,0x3a,0x0f
1978	.byte	0xd9,0x0c
1979
1980	cmp	$0x10,%r8
1981	movdqa	%xmm3,(%rcx)      	# store it
1982	lea	0x10(%rcx),%rcx
1983	jl	L(movdqa_epi)
1984
1985	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1986	sub	$0x10,%r8
1987	lea	0x10(%rdx),%rdx
1988	#palignr	$0xc,%xmm2,%xmm0
1989	.byte	0x66,0x0f,0x3a,0x0f
1990	.byte	0xc2,0x0c
1991	movdqa	%xmm0,(%rcx)      	# store it
1992	lea	0x10(%rcx),%rcx
1993	jmp	L(movdqa_epi)
1994
1995	.balign 16
1996L(mov3dqa13):
1997	movdqa	0x10(%rdx),%xmm3
1998	sub	$0x30,%r8
1999	movdqa	0x20(%rdx),%xmm0
2000	movdqa	0x30(%rdx),%xmm5
2001	lea	0x30(%rdx),%rdx
2002	cmp	$0x30,%r8
2003
2004	movdqa	%xmm3,%xmm2
2005	#palignr	$0xd,%xmm1,%xmm3
2006	.byte	0x66,0x0f,0x3a,0x0f
2007	.byte	0xd9,0x0d
2008	movdqa	%xmm3,(%rcx)
2009
2010	movdqa	%xmm0,%xmm4
2011	#palignr	$0xd,%xmm2,%xmm0
2012	.byte	0x66,0x0f,0x3a,0x0f
2013	.byte	0xc2,0x0d
2014	movdqa	%xmm0,0x10(%rcx)
2015
2016	movdqa	%xmm5,%xmm1
2017	#palignr	$0xd,%xmm4,%xmm5
2018	.byte	0x66,0x0f,0x3a,0x0f
2019	.byte	0xec,0x0d
2020	movdqa	%xmm5,0x20(%rcx)
2021
2022	lea	0x30(%rcx),%rcx
2023	jge	L(mov3dqa13)
2024
2025	cmp	$0x10,%r8
2026	jl	L(movdqa_epi)
2027	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2028	sub	$0x10,%r8
2029	lea	0x10(%rdx),%rdx
2030	movdqa	%xmm3,%xmm2		# save for use next concat
2031	#palignr	$0xd,%xmm1,%xmm3
2032	.byte	0x66,0x0f,0x3a,0x0f
2033	.byte	0xd9,0x0d
2034
2035	cmp	$0x10,%r8
2036	movdqa	%xmm3,(%rcx)      	# store it
2037	lea	0x10(%rcx),%rcx
2038	jl	L(movdqa_epi)
2039
2040	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2041	sub	$0x10,%r8
2042	lea	0x10(%rdx),%rdx
2043	#palignr	$0xd,%xmm2,%xmm0
2044	.byte	0x66,0x0f,0x3a,0x0f
2045	.byte	0xc2,0x0d
2046	movdqa	%xmm0,(%rcx)      	# store it
2047	lea	0x10(%rcx),%rcx
2048	jmp	L(movdqa_epi)
2049
2050	.balign 16
2051L(mov3dqa14):
2052	movdqa	0x10(%rdx),%xmm3
2053	sub	$0x30,%r8
2054	movdqa	0x20(%rdx),%xmm0
2055	movdqa	0x30(%rdx),%xmm5
2056	lea	0x30(%rdx),%rdx
2057	cmp	$0x30,%r8
2058
2059	movdqa	%xmm3,%xmm2
2060	#palignr	$0xe,%xmm1,%xmm3
2061	.byte	0x66,0x0f,0x3a,0x0f
2062	.byte	0xd9,0x0e
2063	movdqa	%xmm3,(%rcx)
2064
2065	movdqa	%xmm0,%xmm4
2066	#palignr	$0xe,%xmm2,%xmm0
2067	.byte	0x66,0x0f,0x3a,0x0f
2068	.byte	0xc2,0x0e
2069	movdqa	%xmm0,0x10(%rcx)
2070
2071	movdqa	%xmm5,%xmm1
2072	#palignr	$0xe,%xmm4,%xmm5
2073	.byte	0x66,0x0f,0x3a,0x0f
2074	.byte	0xec,0x0e
2075	movdqa	%xmm5,0x20(%rcx)
2076
2077	lea	0x30(%rcx),%rcx
2078	jge	L(mov3dqa14)
2079
2080	cmp	$0x10,%r8
2081	jl	L(movdqa_epi)
2082	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2083	sub	$0x10,%r8
2084	lea	0x10(%rdx),%rdx
2085	movdqa	%xmm3,%xmm2		# save for use next concat
2086	#palignr	$0xe,%xmm1,%xmm3
2087	.byte	0x66,0x0f,0x3a,0x0f
2088	.byte	0xd9,0x0e
2089
2090	cmp	$0x10,%r8
2091	movdqa	%xmm3,(%rcx)      	# store it
2092	lea	0x10(%rcx),%rcx
2093	jl	L(movdqa_epi)
2094
2095	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2096	sub	$0x10,%r8
2097	lea	0x10(%rdx),%rdx
2098	#palignr	$0xe,%xmm2,%xmm0
2099	.byte	0x66,0x0f,0x3a,0x0f
2100	.byte	0xc2,0x0e
2101	movdqa	%xmm0,(%rcx)      	# store it
2102	lea	0x10(%rcx),%rcx
2103	jmp	L(movdqa_epi)
2104
2105	.balign 16
2106L(mov3dqa15):
2107	movdqa	0x10(%rdx),%xmm3
2108	sub	$0x30,%r8
2109	movdqa	0x20(%rdx),%xmm0
2110	movdqa	0x30(%rdx),%xmm5
2111	lea	0x30(%rdx),%rdx
2112	cmp	$0x30,%r8
2113
2114	movdqa	%xmm3,%xmm2
2115	#palignr	$0xf,%xmm1,%xmm3
2116	.byte	0x66,0x0f,0x3a,0x0f
2117	.byte	0xd9,0x0f
2118	movdqa	%xmm3,(%rcx)
2119
2120	movdqa	%xmm0,%xmm4
2121	#palignr	$0xf,%xmm2,%xmm0
2122	.byte	0x66,0x0f,0x3a,0x0f
2123	.byte	0xc2,0x0f
2124	movdqa	%xmm0,0x10(%rcx)
2125
2126	movdqa	%xmm5,%xmm1
2127	#palignr	$0xf,%xmm4,%xmm5
2128	.byte	0x66,0x0f,0x3a,0x0f
2129	.byte	0xec,0x0f
2130	movdqa	%xmm5,0x20(%rcx)
2131
2132	lea	0x30(%rcx),%rcx
2133	jge	L(mov3dqa15)
2134
2135	cmp	$0x10,%r8
2136	jl	L(movdqa_epi)
2137	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2138	sub	$0x10,%r8
2139	lea	0x10(%rdx),%rdx
2140	movdqa	%xmm3,%xmm2		# save for use next concat
2141	#palignr	$0xf,%xmm1,%xmm3
2142	.byte	0x66,0x0f,0x3a,0x0f
2143	.byte	0xd9,0x0f
2144
2145	cmp	$0x10,%r8
2146	movdqa	%xmm3,(%rcx)      	# store it
2147	lea	0x10(%rcx),%rcx
2148	jl	L(movdqa_epi)
2149
2150	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2151	sub	$0x10,%r8
2152	lea	0x10(%rdx),%rdx
2153	#palignr	$0xf,%xmm2,%xmm0
2154	.byte	0x66,0x0f,0x3a,0x0f
2155	.byte	0xc2,0x0f
2156	movdqa	%xmm0,(%rcx)      	# store it
2157	lea	0x10(%rcx),%rcx
2158	jmp	L(movdqa_epi)
2159
2160	.balign 16
2161L(sse2_nt_move):
2162	lea	0x40(%rcx),%rcx
2163	lea	0x40(%rdx),%rdx
2164	lea	-0x40(%r8),%r8
2165
2166	/*
2167	 * Source alignment doesn't matter for data that is out of the cache;
2168	 * the misalignment penalty is masked by the slowness of main memory.
2169	 */
2170	prefetchnta 0x180(%rdx)
2171	movdqu	-0x40(%rdx),%xmm0
2172	movdqu	-0x30(%rdx),%xmm1
2173
2174	cmp	$0x40,%r8
2175	movntdq	%xmm0,-0x40(%rcx)
2176	movntdq	%xmm1,-0x30(%rcx)
2177
2178	movdqu	-0x20(%rdx),%xmm2
2179	movdqu	-0x10(%rdx),%xmm3
2180
2181	movntdq	%xmm2,-0x20(%rcx)
2182	movntdq	%xmm3,-0x10(%rcx)
2183
2184	jge	L(sse2_nt_move)
2185
2186	lea	L(Fix16EndTable)(%rip),%r10
2187	mov	%r8,%r9
2188	and	$0xFFFFFFFFFFFFFFF0,%r9
2189	add	%r9,%rcx
2190	add	%r9,%rdx
2191	sub	%r9,%r8
2192	shr	$0x4,%r9
2193	sfence
2194
2195	movslq	(%r10,%r9,4),%r11
2196	lea	(%r11,%r10,1),%r10
2197	jmpq	*%r10
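	/*
	 * Fewer than 64 bytes remain.  The sfence above orders the preceding
	 * non-temporal stores.  %r9 holds the number of whole 16-byte blocks
	 * left (0..3); dispatch through L(Fix16EndTable) to copy them, then
	 * L(fix16_0) finishes the last 0..15 bytes via the L(fwdPxQx) table.
	 */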
2198
2199	.balign 16
2200L(Fix16EndTable):
2201	.int    L(fix16_0)-L(Fix16EndTable)
2202	.int    L(fix16_1)-L(Fix16EndTable)
2203	.int    L(fix16_2)-L(Fix16EndTable)
2204	.int    L(fix16_3)-L(Fix16EndTable)
2205
2206	.balign 16
2207L(fix16_3):
2208	movdqu -0x30(%rdx),%xmm1
2209	movdqa %xmm1,-0x30(%rcx)
2210L(fix16_2):
2211	movdqu -0x20(%rdx),%xmm2
2212	movdqa %xmm2,-0x20(%rcx)
2213L(fix16_1):
2214	movdqu -0x10(%rdx),%xmm3
2215	movdqa %xmm3,-0x10(%rcx)
2216L(fix16_0):
2217	lea    L(fwdPxQx)(%rip),%r10
2218	add    %r8,%rdx
2219	add    %r8,%rcx
2220
2221	movslq (%r10,%r8,4),%r9
2222	lea    (%r9,%r10,1),%r10
2223	jmpq   *%r10
2224
2225	.balign 16
2226L(pre_both_aligned):
2227	cmp    $0x80,%r8
2228	jl     L(fix_16b)
2229
2230	.balign 16
2231L(both_aligned):
2232
2233	/*
2234	 * this 'paired' load/load/store/store seems to do best.
2235	 */
2236	movdqa (%rdx),%xmm0
2237	movdqa 0x10(%rdx),%xmm1
2238
2239	movdqa %xmm0,(%rcx)
2240	movdqa %xmm1,0x10(%rcx)
2241	lea    -0x80(%r8),%r8
2242
2243	movdqa 0x20(%rdx),%xmm2
2244	movdqa 0x30(%rdx),%xmm3
2245
2246	movdqa %xmm2,0x20(%rcx)
2247	movdqa %xmm3,0x30(%rcx)
2248
2249	movdqa 0x40(%rdx),%xmm0
2250	movdqa 0x50(%rdx),%xmm1
2251	cmp    $0x80,%r8
2252
2253	movdqa %xmm0,0x40(%rcx)
2254	movdqa %xmm1,0x50(%rcx)
2255
2256	movdqa 0x60(%rdx),%xmm2
2257	movdqa 0x70(%rdx),%xmm3
2258	lea    0x80(%rdx),%rdx
2259	movdqa %xmm2,0x60(%rcx)
2260	movdqa %xmm3,0x70(%rcx)
2261	lea    0x80(%rcx),%rcx
2262	jge    L(both_aligned)
2263
2264L(fix_16b):
2265	add    %r8,%rcx
2266	lea    L(fwdPxQx)(%rip),%r10
2267	add    %r8,%rdx
2268
2269	movslq (%r10,%r8,4),%r9
2270	lea    (%r9,%r10,1),%r10
2271	jmpq   *%r10
2272
2273	.balign 16
2274L(Loop8byte_pre):
2275	# Use 8-byte moves
2276	mov    .largest_level_cache_size(%rip),%r9d
2277	shr    %r9		# take half of it
2278	cmp    %r9,%r8
2279	jg     L(byte8_nt_top)
2280	# Find out whether to use rep movsq
2281	cmp    $4096,%r8
2282	jle    L(byte8_top)
2283	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2284	cmp    %r9,%r8
2285	jle    L(use_rep)
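	# No-SSE path: sizes of 4K or less use the unrolled 8-byte loop below,
	# sizes between 4K and half the L1 cache use rep movsq, and anything
	# larger (up to half the largest cache, checked above) falls through
	# to the unrolled loop as well.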
2286
2287	.balign     16
2288L(byte8_top):
2289	mov    (%rdx),%r9
2290	mov    0x8(%rdx),%r10
2291	lea    -0x40(%r8),%r8
2292	mov    %r9,(%rcx)
2293	mov    %r10,0x8(%rcx)
2294	mov    0x10(%rdx),%r11
2295	mov    0x18(%rdx),%r9
2296	mov    %r11,0x10(%rcx)
2297	mov    %r9,0x18(%rcx)
2298
2299	cmp    $0x40,%r8
2300	mov    0x20(%rdx),%r10
2301	mov    0x28(%rdx),%r11
2302	mov    %r10,0x20(%rcx)
2303	mov    %r11,0x28(%rcx)
2304	mov    0x30(%rdx),%r9
2305	mov    0x38(%rdx),%r10
2306	lea    0x40(%rdx),%rdx
2307	mov    %r9,0x30(%rcx)
2308	mov    %r10,0x38(%rcx)
2309	lea    0x40(%rcx),%rcx
2310	jg     L(byte8_top)
2311
2312L(byte8_end):
2313	lea    L(fwdPxQx)(%rip),%r10
2314	lea    (%rdx,%r8,1),%rdx
2315	lea    (%rcx,%r8,1),%rcx
2316
2317	movslq (%r10,%r8,4),%r9
2318	lea    (%r9,%r10,1),%r10
2319	jmpq   *%r10
2320
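	/*
	 * rep movsq path (copies larger than 4K that still fit within
	 * half the L1 cache): move the whole quadwords with rep
	 * movsq, then send any 1-7 byte remainder back through
	 * byte8_end.
	 */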
2321	.balign	16
2322L(use_rep):
2323	mov    %rdx,%rsi		# %rsi = source
2324	mov    %rcx,%rdi		# %rdi = destination
2325	mov    %r8,%rcx			# %rcx = count
2326	shrq   $3,%rcx			# 8-byte word count
2327	rep
2328	  movsq
2329	mov    %rsi,%rdx		# source
2330	mov    %rdi,%rcx		# destination
2331	andq   $7,%r8			# remainder
2332	jnz    L(byte8_end)
2333	ret
2334
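	/*
	 * Non-temporal 8-byte copy loop for blocks larger than half
	 * the largest-level cache: movnti stores bypass the cache,
	 * 64 bytes per iteration, with an sfence before the tail is
	 * finished at byte8_end.
	 */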
2335	.balign 16
2336L(byte8_nt_top):
2337	sub    $0x40,%r8
2338	prefetchnta 0x180(%rdx)
2339	mov    (%rdx),%r9
2340	movnti %r9,(%rcx)
2341	mov    0x8(%rdx),%r10
2342	movnti %r10,0x8(%rcx)
2343	mov    0x10(%rdx),%r11
2344	movnti %r11,0x10(%rcx)
2345	mov    0x18(%rdx),%r9
2346	movnti %r9,0x18(%rcx)
2347	mov    0x20(%rdx),%r10
2348	movnti %r10,0x20(%rcx)
2349	mov    0x28(%rdx),%r11
2350	movnti %r11,0x28(%rcx)
2351	mov    0x30(%rdx),%r9
2352	movnti %r9,0x30(%rcx)
2353	mov    0x38(%rdx),%r10
2354	movnti %r10,0x38(%rcx)
2355
2356	lea    0x40(%rdx),%rdx
2357	lea    0x40(%rcx),%rcx
2358	cmp    $0x40,%r8
2359	jge    L(byte8_nt_top)
2360	sfence
2361	jmp    L(byte8_end)
2362
2363	SET_SIZE(memcpy)
2364
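	/*
	 * Backward (overlapping) copy.  On entry %r8 = length,
	 * %rcx = destination, %rdx = source, %rax = return value;
	 * both pointers are then advanced to one past the end of
	 * their buffers and the copy runs from high addresses to low.
	 */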
2365	.balign 16
2366L(CopyBackwards):
2367	mov    %rdx,%r8
2368	mov    %rdi,%rcx
2369	mov    %rsi,%rdx
2370	mov    %rdi,%rax		# return value
2371
2372	# check alignment of the last byte
2373	lea    (%rcx,%r8,1),%rcx
2374	test   $0x7,%rcx
2375	lea    (%rdx,%r8,1),%rdx
2376	jne    L(bk_align)
2377
2378L(bk_qw_aligned):
2379	lea    L(bkPxQx)(%rip),%r10
2380
2381	cmp    $0x90,%r8		# 144
2382	jg     L(bk_ck_sse2_alignment)
2383
2384	sub    %r8,%rcx
2385	sub    %r8,%rdx
2386
2387	movslq (%r10,%r8,4),%r9
2388	lea    (%r9,%r10,1),%r10
2389	jmpq   *%r10
2390
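	/*
	 * Align the end of the destination to an 8-byte boundary by
	 * peeling a trailing 1-, 2- and/or 4-byte copy off the end,
	 * then fall back into bk_qw_aligned.  Only done when more
	 * than 8 bytes remain.
	 */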
2391	.balign 16
2392L(bk_align):
2393	# only align if len > 8
2394	cmp    $8,%r8
2395	jle    L(bk_qw_aligned)
2396	test   $0x1,%rcx
2397	je     L(bk_tst2)
2398	dec    %rcx
2399	dec    %rdx
2400	dec    %r8
2401	mov    (%rdx),%r9b
2402	mov    %r9b,(%rcx)
2403
2404L(bk_tst2):
2405	test   $0x2,%rcx
2406	je     L(bk_tst3)
2407
2408L(bk_got2):
2409	sub    $0x2,%rcx
2410	sub    $0x2,%rdx
2411	sub    $0x2,%r8
2412	movzwq (%rdx),%r9
2413	mov    %r9w,(%rcx)
2414
2415L(bk_tst3):
2416	test   $0x4,%rcx
2417	je     L(bk_qw_aligned)
2418
2419L(bk_got3):
2420	sub    $0x4,%rcx
2421	sub    $0x4,%rdx
2422	sub    $0x4,%r8
2423	mov    (%rdx),%r9d
2424	mov    %r9d,(%rcx)
2425	jmp    L(bk_qw_aligned)
2426
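	/*
	 * Large backward copies (more than 144 bytes): without SSE
	 * fall back to a reverse rep movsq; otherwise make the end of
	 * the destination 16-byte aligned (peeling one quadword if
	 * necessary) and use the 128-byte-per-iteration SSE2 loop
	 * below.
	 */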
2427	.balign 16
2428L(bk_ck_sse2_alignment):
2429	cmpl   $NO_SSE,.memops_method(%rip)
2430	je     L(bk_use_rep)
2431	# check alignment of last byte
2432	test   $0xf,%rcx
2433	jz     L(bk_sse2_cpy)
2434
2435L(bk_sse2_align):
2436	# only reached if already aligned on at least a quadword boundary
2437	sub    $0x8,%rcx
2438	sub    $0x8,%rdx
2439	sub    $0x8,%r8
2440	mov    (%rdx),%r9
2441	mov    %r9,(%rcx)
2442	#jmp   L(bk_sse2_cpy)
2443
2444	.balign 16
2445L(bk_sse2_cpy):
2446	sub    $0x80,%rcx		# 128
2447	sub    $0x80,%rdx
2448	movdqu 0x70(%rdx),%xmm3
2449	movdqu 0x60(%rdx),%xmm2
2450	movdqa %xmm3,0x70(%rcx)
2451	movdqa %xmm2,0x60(%rcx)
2452	sub    $0x80,%r8
2453	movdqu 0x50(%rdx),%xmm1
2454	movdqu 0x40(%rdx),%xmm0
2455	movdqa %xmm1,0x50(%rcx)
2456	movdqa %xmm0,0x40(%rcx)
2457
2458	cmp    $0x80,%r8
2459	movdqu 0x30(%rdx),%xmm3
2460	movdqu 0x20(%rdx),%xmm2
2461	movdqa %xmm3,0x30(%rcx)
2462	movdqa %xmm2,0x20(%rcx)
2463	movdqu 0x10(%rdx),%xmm1
2464	movdqu (%rdx),%xmm0
2465	movdqa %xmm1,0x10(%rcx)
2466	movdqa %xmm0,(%rcx)
2467	jge    L(bk_sse2_cpy)
2468
2469L(bk_sse2_cpy_end):
2470	lea    L(bkPxQx)(%rip),%r10
2471	sub    %r8,%rdx
2472	sub    %r8,%rcx
2473	movslq (%r10,%r8,4),%r9
2474	lea    (%r9,%r10,1),%r10
2475	jmpq   *%r10
2476
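	/*
	 * Reverse rep movsq: back %rsi/%rdi off by 8 so they point at
	 * the last whole quadword, copy the quadwords with the
	 * direction flag set, restore the flag, then dispatch the
	 * 0-7 byte remainder through the backward bkPxQx table.
	 */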
2477	.balign 16
2478L(bk_use_rep):
2479	xchg   %rcx,%r9
2480	mov    %rdx,%rsi		# source
2481	mov    %r9,%rdi			# destination
2482	mov    %r8,%rcx			# count
2483	sub    $8,%rsi
2484	sub    $8,%rdi
2485	shr    $3,%rcx
2486	std				# reverse direction
2487	rep
2488	  movsq
2489	cld				# reset direction flag
2490
2491	xchg   %rcx,%r9
2492	lea    L(bkPxQx)(%rip),%r10
2493	sub    %r8,%rdx
2494	sub    %r8,%rcx
2495	andq   $7,%r8			# remainder
2496	jz     2f
2497	movslq (%r10,%r8,4),%r9
2498	lea    (%r9,%r10,1),%r10
2499	jmpq   *%r10
25002:
2501	ret
2502
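	/*
	 * Backward tail fragments.  L(bkPxQy) copies y quadwords plus
	 * x trailing bytes, working from the highest addresses down;
	 * on entry %rdx and %rcx point at the lowest byte of the
	 * remaining region.
	 */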
2503	.balign 16
2504L(bkP0QI):
2505	mov    0x88(%rdx),%r10
2506	mov    %r10,0x88(%rcx)
2507L(bkP0QH):
2508	mov    0x80(%rdx),%r10
2509	mov    %r10,0x80(%rcx)
2510L(bkP0QG):
2511	mov    0x78(%rdx),%r9
2512	mov    %r9,0x78(%rcx)
2513L(bkP0QF):
2514	mov    0x70(%rdx),%r11
2515	mov    %r11,0x70(%rcx)
2516L(bkP0QE):
2517	mov    0x68(%rdx),%r10
2518	mov    %r10,0x68(%rcx)
2519L(bkP0QD):
2520	mov    0x60(%rdx),%r9
2521	mov    %r9,0x60(%rcx)
2522L(bkP0QC):
2523	mov    0x58(%rdx),%r11
2524	mov    %r11,0x58(%rcx)
2525L(bkP0QB):
2526	mov    0x50(%rdx),%r10
2527	mov    %r10,0x50(%rcx)
2528L(bkP0QA):
2529	mov    0x48(%rdx),%r9
2530	mov    %r9,0x48(%rcx)
2531L(bkP0Q9):
2532	mov    0x40(%rdx),%r11
2533	mov    %r11,0x40(%rcx)
2534L(bkP0Q8):
2535	mov    0x38(%rdx),%r10
2536	mov    %r10,0x38(%rcx)
2537L(bkP0Q7):
2538	mov    0x30(%rdx),%r9
2539	mov    %r9,0x30(%rcx)
2540L(bkP0Q6):
2541	mov    0x28(%rdx),%r11
2542	mov    %r11,0x28(%rcx)
2543L(bkP0Q5):
2544	mov    0x20(%rdx),%r10
2545	mov    %r10,0x20(%rcx)
2546L(bkP0Q4):
2547	mov    0x18(%rdx),%r9
2548	mov    %r9,0x18(%rcx)
2549L(bkP0Q3):
2550	mov    0x10(%rdx),%r11
2551	mov    %r11,0x10(%rcx)
2552L(bkP0Q2):
2553	mov    0x8(%rdx),%r10
2554	mov    %r10,0x8(%rcx)
2555L(bkP0Q1):
2556	mov    (%rdx),%r9
2557	mov    %r9,(%rcx)
2558L(bkP0Q0):
2559	ret
2560
2561	.balign 16
2562L(bkP1QI):
2563	mov    0x89(%rdx),%r10
2564	mov    %r10,0x89(%rcx)
2565L(bkP1QH):
2566	mov    0x81(%rdx),%r11
2567	mov    %r11,0x81(%rcx)
2568L(bkP1QG):
2569	mov    0x79(%rdx),%r10
2570	mov    %r10,0x79(%rcx)
2571L(bkP1QF):
2572	mov    0x71(%rdx),%r9
2573	mov    %r9,0x71(%rcx)
2574L(bkP1QE):
2575	mov    0x69(%rdx),%r11
2576	mov    %r11,0x69(%rcx)
2577L(bkP1QD):
2578	mov    0x61(%rdx),%r10
2579	mov    %r10,0x61(%rcx)
2580L(bkP1QC):
2581	mov    0x59(%rdx),%r9
2582	mov    %r9,0x59(%rcx)
2583L(bkP1QB):
2584	mov    0x51(%rdx),%r11
2585	mov    %r11,0x51(%rcx)
2586L(bkP1QA):
2587	mov    0x49(%rdx),%r10
2588	mov    %r10,0x49(%rcx)
2589L(bkP1Q9):
2590	mov    0x41(%rdx),%r9
2591	mov    %r9,0x41(%rcx)
2592L(bkP1Q8):
2593	mov    0x39(%rdx),%r11
2594	mov    %r11,0x39(%rcx)
2595L(bkP1Q7):
2596	mov    0x31(%rdx),%r10
2597	mov    %r10,0x31(%rcx)
2598L(bkP1Q6):
2599	mov    0x29(%rdx),%r9
2600	mov    %r9,0x29(%rcx)
2601L(bkP1Q5):
2602	mov    0x21(%rdx),%r11
2603	mov    %r11,0x21(%rcx)
2604L(bkP1Q4):
2605	mov    0x19(%rdx),%r10
2606	mov    %r10,0x19(%rcx)
2607L(bkP1Q3):
2608	mov    0x11(%rdx),%r9
2609	mov    %r9,0x11(%rcx)
2610L(bkP1Q2):
2611	mov    0x9(%rdx),%r11
2612	mov    %r11,0x9(%rcx)
2613L(bkP1Q1):
2614	mov    0x1(%rdx),%r10
2615	mov    %r10,0x1(%rcx)
2616L(bkP1Q0):
2617	mov    (%rdx),%r9b
2618	mov    %r9b,(%rcx)
2619	ret
2620
2621	.balign 16
2622L(bkP2QI):
2623	mov    0x8a(%rdx),%r10
2624	mov    %r10,0x8a(%rcx)
2625L(bkP2QH):
2626	mov    0x82(%rdx),%r11
2627	mov    %r11,0x82(%rcx)
2628L(bkP2QG):
2629	mov    0x7a(%rdx),%r10
2630	mov    %r10,0x7a(%rcx)
2631L(bkP2QF):
2632	mov    0x72(%rdx),%r9
2633	mov    %r9,0x72(%rcx)
2634L(bkP2QE):
2635	mov    0x6a(%rdx),%r11
2636	mov    %r11,0x6a(%rcx)
2637L(bkP2QD):
2638	mov    0x62(%rdx),%r10
2639	mov    %r10,0x62(%rcx)
2640L(bkP2QC):
2641	mov    0x5a(%rdx),%r9
2642	mov    %r9,0x5a(%rcx)
2643L(bkP2QB):
2644	mov    0x52(%rdx),%r11
2645	mov    %r11,0x52(%rcx)
2646L(bkP2QA):
2647	mov    0x4a(%rdx),%r10
2648	mov    %r10,0x4a(%rcx)
2649L(bkP2Q9):
2650	mov    0x42(%rdx),%r9
2651	mov    %r9,0x42(%rcx)
2652L(bkP2Q8):
2653	mov    0x3a(%rdx),%r11
2654	mov    %r11,0x3a(%rcx)
2655L(bkP2Q7):
2656	mov    0x32(%rdx),%r10
2657	mov    %r10,0x32(%rcx)
2658L(bkP2Q6):
2659	mov    0x2a(%rdx),%r9
2660	mov    %r9,0x2a(%rcx)
2661L(bkP2Q5):
2662	mov    0x22(%rdx),%r11
2663	mov    %r11,0x22(%rcx)
2664L(bkP2Q4):
2665	mov    0x1a(%rdx),%r10
2666	mov    %r10,0x1a(%rcx)
2667L(bkP2Q3):
2668	mov    0x12(%rdx),%r9
2669	mov    %r9,0x12(%rcx)
2670L(bkP2Q2):
2671	mov    0xa(%rdx),%r11
2672	mov    %r11,0xa(%rcx)
2673L(bkP2Q1):
2674	mov    0x2(%rdx),%r10
2675	mov    %r10,0x2(%rcx)
2676L(bkP2Q0):
2677	mov    (%rdx),%r9w
2678	mov    %r9w,(%rcx)
2679	ret
2680
2681	.balign 16
2682L(bkP3QI):
2683	mov    0x8b(%rdx),%r10
2684	mov    %r10,0x8b(%rcx)
2685L(bkP3QH):
2686	mov    0x83(%rdx),%r11
2687	mov    %r11,0x83(%rcx)
2688L(bkP3QG):
2689	mov    0x7b(%rdx),%r10
2690	mov    %r10,0x7b(%rcx)
2691L(bkP3QF):
2692	mov    0x73(%rdx),%r9
2693	mov    %r9,0x73(%rcx)
2694L(bkP3QE):
2695	mov    0x6b(%rdx),%r11
2696	mov    %r11,0x6b(%rcx)
2697L(bkP3QD):
2698	mov    0x63(%rdx),%r10
2699	mov    %r10,0x63(%rcx)
2700L(bkP3QC):
2701	mov    0x5b(%rdx),%r9
2702	mov    %r9,0x5b(%rcx)
2703L(bkP3QB):
2704	mov    0x53(%rdx),%r11
2705	mov    %r11,0x53(%rcx)
2706L(bkP3QA):
2707	mov    0x4b(%rdx),%r10
2708	mov    %r10,0x4b(%rcx)
2709L(bkP3Q9):
2710	mov    0x43(%rdx),%r9
2711	mov    %r9,0x43(%rcx)
2712L(bkP3Q8):
2713	mov    0x3b(%rdx),%r11
2714	mov    %r11,0x3b(%rcx)
2715L(bkP3Q7):
2716	mov    0x33(%rdx),%r10
2717	mov    %r10,0x33(%rcx)
2718L(bkP3Q6):
2719	mov    0x2b(%rdx),%r9
2720	mov    %r9,0x2b(%rcx)
2721L(bkP3Q5):
2722	mov    0x23(%rdx),%r11
2723	mov    %r11,0x23(%rcx)
2724L(bkP3Q4):
2725	mov    0x1b(%rdx),%r10
2726	mov    %r10,0x1b(%rcx)
2727L(bkP3Q3):
2728	mov    0x13(%rdx),%r9
2729	mov    %r9,0x13(%rcx)
2730L(bkP3Q2):
2731	mov    0xb(%rdx),%r11
2732	mov    %r11,0xb(%rcx)
2733L(bkP3Q1):
2734	mov    0x3(%rdx),%r10
2735	mov    %r10,0x3(%rcx)
2736L(bkP3Q0): # copy the trailing sub-quadword bytes, highest offsets first
2737	mov    0x1(%rdx),%r9w
2738	mov    %r9w,0x1(%rcx)
2739	mov    (%rdx),%r10b
2740	mov    %r10b,(%rcx)
2741	ret
2742
2743	.balign 16
2744L(bkP4QI):
2745	mov    0x8c(%rdx),%r10
2746	mov    %r10,0x8c(%rcx)
2747L(bkP4QH):
2748	mov    0x84(%rdx),%r11
2749	mov    %r11,0x84(%rcx)
2750L(bkP4QG):
2751	mov    0x7c(%rdx),%r10
2752	mov    %r10,0x7c(%rcx)
2753L(bkP4QF):
2754	mov    0x74(%rdx),%r9
2755	mov    %r9,0x74(%rcx)
2756L(bkP4QE):
2757	mov    0x6c(%rdx),%r11
2758	mov    %r11,0x6c(%rcx)
2759L(bkP4QD):
2760	mov    0x64(%rdx),%r10
2761	mov    %r10,0x64(%rcx)
2762L(bkP4QC):
2763	mov    0x5c(%rdx),%r9
2764	mov    %r9,0x5c(%rcx)
2765L(bkP4QB):
2766	mov    0x54(%rdx),%r11
2767	mov    %r11,0x54(%rcx)
2768L(bkP4QA):
2769	mov    0x4c(%rdx),%r10
2770	mov    %r10,0x4c(%rcx)
2771L(bkP4Q9):
2772	mov    0x44(%rdx),%r9
2773	mov    %r9,0x44(%rcx)
2774L(bkP4Q8):
2775	mov    0x3c(%rdx),%r11
2776	mov    %r11,0x3c(%rcx)
2777L(bkP4Q7):
2778	mov    0x34(%rdx),%r10
2779	mov    %r10,0x34(%rcx)
2780L(bkP4Q6):
2781	mov    0x2c(%rdx),%r9
2782	mov    %r9,0x2c(%rcx)
2783L(bkP4Q5):
2784	mov    0x24(%rdx),%r11
2785	mov    %r11,0x24(%rcx)
2786L(bkP4Q4):
2787	mov    0x1c(%rdx),%r10
2788	mov    %r10,0x1c(%rcx)
2789L(bkP4Q3):
2790	mov    0x14(%rdx),%r9
2791	mov    %r9,0x14(%rcx)
2792L(bkP4Q2):
2793	mov    0xc(%rdx),%r11
2794	mov    %r11,0xc(%rcx)
2795L(bkP4Q1):
2796	mov    0x4(%rdx),%r10
2797	mov    %r10,0x4(%rcx)
2798L(bkP4Q0):
2799	mov    (%rdx),%r9d
2800	mov    %r9d,(%rcx)
2801	ret
2802
2803	.balign 16
2804L(bkP5QI):
2805	mov    0x8d(%rdx),%r10
2806	mov    %r10,0x8d(%rcx)
2807L(bkP5QH):
2808	mov    0x85(%rdx),%r9
2809	mov    %r9,0x85(%rcx)
2810L(bkP5QG):
2811	mov    0x7d(%rdx),%r11
2812	mov    %r11,0x7d(%rcx)
2813L(bkP5QF):
2814	mov    0x75(%rdx),%r10
2815	mov    %r10,0x75(%rcx)
2816L(bkP5QE):
2817	mov    0x6d(%rdx),%r9
2818	mov    %r9,0x6d(%rcx)
2819L(bkP5QD):
2820	mov    0x65(%rdx),%r11
2821	mov    %r11,0x65(%rcx)
2822L(bkP5QC):
2823	mov    0x5d(%rdx),%r10
2824	mov    %r10,0x5d(%rcx)
2825L(bkP5QB):
2826	mov    0x55(%rdx),%r9
2827	mov    %r9,0x55(%rcx)
2828L(bkP5QA):
2829	mov    0x4d(%rdx),%r11
2830	mov    %r11,0x4d(%rcx)
2831L(bkP5Q9):
2832	mov    0x45(%rdx),%r10
2833	mov    %r10,0x45(%rcx)
2834L(bkP5Q8):
2835	mov    0x3d(%rdx),%r9
2836	mov    %r9,0x3d(%rcx)
2837L(bkP5Q7):
2838	mov    0x35(%rdx),%r11
2839	mov    %r11,0x35(%rcx)
2840L(bkP5Q6):
2841	mov    0x2d(%rdx),%r10
2842	mov    %r10,0x2d(%rcx)
2843L(bkP5Q5):
2844	mov    0x25(%rdx),%r9
2845	mov    %r9,0x25(%rcx)
2846L(bkP5Q4):
2847	mov    0x1d(%rdx),%r11
2848	mov    %r11,0x1d(%rcx)
2849L(bkP5Q3):
2850	mov    0x15(%rdx),%r10
2851	mov    %r10,0x15(%rcx)
2852L(bkP5Q2):
2853	mov    0xd(%rdx),%r9
2854	mov    %r9,0xd(%rcx)
2855L(bkP5Q1):
2856	mov    0x5(%rdx),%r11
2857	mov    %r11,0x5(%rcx)
2858L(bkP5Q0): # copy the trailing sub-quadword bytes, highest offsets first
2859	mov    0x1(%rdx),%r9d
2860	mov    %r9d,0x1(%rcx)
2861	mov    (%rdx),%r10b
2862	mov    %r10b,(%rcx)
2863	ret
2864
2865	.balign 16
2866L(bkP6QI):
2867	mov    0x8e(%rdx),%r10
2868	mov    %r10,0x8e(%rcx)
2869L(bkP6QH):
2870	mov    0x86(%rdx),%r11
2871	mov    %r11,0x86(%rcx)
2872L(bkP6QG):
2873	mov    0x7e(%rdx),%r10
2874	mov    %r10,0x7e(%rcx)
2875L(bkP6QF):
2876	mov    0x76(%rdx),%r9
2877	mov    %r9,0x76(%rcx)
2878L(bkP6QE):
2879	mov    0x6e(%rdx),%r11
2880	mov    %r11,0x6e(%rcx)
2881L(bkP6QD):
2882	mov    0x66(%rdx),%r10
2883	mov    %r10,0x66(%rcx)
2884L(bkP6QC):
2885	mov    0x5e(%rdx),%r9
2886	mov    %r9,0x5e(%rcx)
2887L(bkP6QB):
2888	mov    0x56(%rdx),%r11
2889	mov    %r11,0x56(%rcx)
2890L(bkP6QA):
2891	mov    0x4e(%rdx),%r10
2892	mov    %r10,0x4e(%rcx)
2893L(bkP6Q9):
2894	mov    0x46(%rdx),%r9
2895	mov    %r9,0x46(%rcx)
2896L(bkP6Q8):
2897	mov    0x3e(%rdx),%r11
2898	mov    %r11,0x3e(%rcx)
2899L(bkP6Q7):
2900	mov    0x36(%rdx),%r10
2901	mov    %r10,0x36(%rcx)
2902L(bkP6Q6):
2903	mov    0x2e(%rdx),%r9
2904	mov    %r9,0x2e(%rcx)
2905L(bkP6Q5):
2906	mov    0x26(%rdx),%r11
2907	mov    %r11,0x26(%rcx)
2908L(bkP6Q4):
2909	mov    0x1e(%rdx),%r10
2910	mov    %r10,0x1e(%rcx)
2911L(bkP6Q3):
2912	mov    0x16(%rdx),%r9
2913	mov    %r9,0x16(%rcx)
2914L(bkP6Q2):
2915	mov    0xe(%rdx),%r11
2916	mov    %r11,0xe(%rcx)
2917L(bkP6Q1):
2918	mov    0x6(%rdx),%r10
2919	mov    %r10,0x6(%rcx)
2920L(bkP6Q0): # copy the trailing sub-quadword bytes, highest offsets first
2921	mov    0x2(%rdx),%r9d
2922	mov    %r9d,0x2(%rcx)
2923	mov    (%rdx),%r10w
2924	mov    %r10w,(%rcx)
2925	ret
2926
2927	.balign 16
2928L(bkP7QI):
2929	mov    0x8f(%rdx),%r10
2930	mov    %r10,0x8f(%rcx)
2931L(bkP7QH):
2932	mov    0x87(%rdx),%r11
2933	mov    %r11,0x87(%rcx)
2934L(bkP7QG):
2935	mov    0x7f(%rdx),%r10
2936	mov    %r10,0x7f(%rcx)
2937L(bkP7QF):
2938	mov    0x77(%rdx),%r9
2939	mov    %r9,0x77(%rcx)
2940L(bkP7QE):
2941	mov    0x6f(%rdx),%r11
2942	mov    %r11,0x6f(%rcx)
2943L(bkP7QD):
2944	mov    0x67(%rdx),%r10
2945	mov    %r10,0x67(%rcx)
2946L(bkP7QC):
2947	mov    0x5f(%rdx),%r9
2948	mov    %r9,0x5f(%rcx)
2949L(bkP7QB):
2950	mov    0x57(%rdx),%r11
2951	mov    %r11,0x57(%rcx)
2952L(bkP7QA):
2953	mov    0x4f(%rdx),%r10
2954	mov    %r10,0x4f(%rcx)
2955L(bkP7Q9):
2956	mov    0x47(%rdx),%r9
2957	mov    %r9,0x47(%rcx)
2958L(bkP7Q8):
2959	mov    0x3f(%rdx),%r11
2960	mov    %r11,0x3f(%rcx)
2961L(bkP7Q7):
2962	mov    0x37(%rdx),%r10
2963	mov    %r10,0x37(%rcx)
2964L(bkP7Q6):
2965	mov    0x2f(%rdx),%r9
2966	mov    %r9,0x2f(%rcx)
2967L(bkP7Q5):
2968	mov    0x27(%rdx),%r11
2969	mov    %r11,0x27(%rcx)
2970L(bkP7Q4):
2971	mov    0x1f(%rdx),%r10
2972	mov    %r10,0x1f(%rcx)
2973L(bkP7Q3):
2974	mov    0x17(%rdx),%r9
2975	mov    %r9,0x17(%rcx)
2976L(bkP7Q2):
2977	mov    0xf(%rdx),%r11
2978	mov    %r11,0xf(%rcx)
2979L(bkP7Q1):
2980	mov    0x7(%rdx),%r10
2981	mov    %r10,0x7(%rcx)
2982L(bkP7Q0): # copy the trailing sub-quadword bytes, highest offsets first
2983	mov    0x3(%rdx),%r9d
2984	mov    %r9d,0x3(%rcx)
2985	mov    0x1(%rdx),%r10w
2986	mov    %r10w,0x1(%rcx)
2987	mov    (%rdx),%r11b
2988	mov    %r11b,(%rcx)
2989	ret
2990
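	/*
	 * Backward dispatch table: indexed by the number of bytes
	 * remaining (0 through 0x90), each 4-byte entry is the
	 * displacement from L(bkPxQx) to the fragment that copies
	 * exactly that many bytes (x = count % 8 trailing bytes,
	 * y = count / 8 quadwords).
	 */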
2991		.balign 16
2992L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
2993		.int L(bkP1Q0)-L(bkPxQx)
2994		.int L(bkP2Q0)-L(bkPxQx)
2995		.int L(bkP3Q0)-L(bkPxQx)
2996		.int L(bkP4Q0)-L(bkPxQx)
2997		.int L(bkP5Q0)-L(bkPxQx)
2998		.int L(bkP6Q0)-L(bkPxQx)
2999		.int L(bkP7Q0)-L(bkPxQx)
3000
3001		.int L(bkP0Q1)-L(bkPxQx)
3002		.int L(bkP1Q1)-L(bkPxQx)
3003		.int L(bkP2Q1)-L(bkPxQx)
3004		.int L(bkP3Q1)-L(bkPxQx)
3005		.int L(bkP4Q1)-L(bkPxQx)
3006		.int L(bkP5Q1)-L(bkPxQx)
3007		.int L(bkP6Q1)-L(bkPxQx)
3008		.int L(bkP7Q1)-L(bkPxQx)
3009
3010		.int L(bkP0Q2)-L(bkPxQx)
3011		.int L(bkP1Q2)-L(bkPxQx)
3012		.int L(bkP2Q2)-L(bkPxQx)
3013		.int L(bkP3Q2)-L(bkPxQx)
3014		.int L(bkP4Q2)-L(bkPxQx)
3015		.int L(bkP5Q2)-L(bkPxQx)
3016		.int L(bkP6Q2)-L(bkPxQx)
3017		.int L(bkP7Q2)-L(bkPxQx)
3018
3019		.int L(bkP0Q3)-L(bkPxQx)
3020		.int L(bkP1Q3)-L(bkPxQx)
3021		.int L(bkP2Q3)-L(bkPxQx)
3022		.int L(bkP3Q3)-L(bkPxQx)
3023		.int L(bkP4Q3)-L(bkPxQx)
3024		.int L(bkP5Q3)-L(bkPxQx)
3025		.int L(bkP6Q3)-L(bkPxQx)
3026		.int L(bkP7Q3)-L(bkPxQx)
3027
3028		.int L(bkP0Q4)-L(bkPxQx)
3029		.int L(bkP1Q4)-L(bkPxQx)
3030		.int L(bkP2Q4)-L(bkPxQx)
3031		.int L(bkP3Q4)-L(bkPxQx)
3032		.int L(bkP4Q4)-L(bkPxQx)
3033		.int L(bkP5Q4)-L(bkPxQx)
3034		.int L(bkP6Q4)-L(bkPxQx)
3035		.int L(bkP7Q4)-L(bkPxQx)
3036
3037		.int L(bkP0Q5)-L(bkPxQx)
3038		.int L(bkP1Q5)-L(bkPxQx)
3039		.int L(bkP2Q5)-L(bkPxQx)
3040		.int L(bkP3Q5)-L(bkPxQx)
3041		.int L(bkP4Q5)-L(bkPxQx)
3042		.int L(bkP5Q5)-L(bkPxQx)
3043		.int L(bkP6Q5)-L(bkPxQx)
3044		.int L(bkP7Q5)-L(bkPxQx)
3045
3046		.int L(bkP0Q6)-L(bkPxQx)
3047		.int L(bkP1Q6)-L(bkPxQx)
3048		.int L(bkP2Q6)-L(bkPxQx)
3049		.int L(bkP3Q6)-L(bkPxQx)
3050		.int L(bkP4Q6)-L(bkPxQx)
3051		.int L(bkP5Q6)-L(bkPxQx)
3052		.int L(bkP6Q6)-L(bkPxQx)
3053		.int L(bkP7Q6)-L(bkPxQx)
3054
3055		.int L(bkP0Q7)-L(bkPxQx)
3056		.int L(bkP1Q7)-L(bkPxQx)
3057		.int L(bkP2Q7)-L(bkPxQx)
3058		.int L(bkP3Q7)-L(bkPxQx)
3059		.int L(bkP4Q7)-L(bkPxQx)
3060		.int L(bkP5Q7)-L(bkPxQx)
3061		.int L(bkP6Q7)-L(bkPxQx)
3062		.int L(bkP7Q7)-L(bkPxQx)
3063
3064		.int L(bkP0Q8)-L(bkPxQx)
3065		.int L(bkP1Q8)-L(bkPxQx)
3066		.int L(bkP2Q8)-L(bkPxQx)
3067		.int L(bkP3Q8)-L(bkPxQx)
3068		.int L(bkP4Q8)-L(bkPxQx)
3069		.int L(bkP5Q8)-L(bkPxQx)
3070		.int L(bkP6Q8)-L(bkPxQx)
3071		.int L(bkP7Q8)-L(bkPxQx)
3072
3073		.int L(bkP0Q9)-L(bkPxQx)
3074		.int L(bkP1Q9)-L(bkPxQx)
3075		.int L(bkP2Q9)-L(bkPxQx)
3076		.int L(bkP3Q9)-L(bkPxQx)
3077		.int L(bkP4Q9)-L(bkPxQx)
3078		.int L(bkP5Q9)-L(bkPxQx)
3079		.int L(bkP6Q9)-L(bkPxQx)
3080		.int L(bkP7Q9)-L(bkPxQx)
3081
3082		.int L(bkP0QA)-L(bkPxQx)
3083		.int L(bkP1QA)-L(bkPxQx)
3084		.int L(bkP2QA)-L(bkPxQx)
3085		.int L(bkP3QA)-L(bkPxQx)
3086		.int L(bkP4QA)-L(bkPxQx)
3087		.int L(bkP5QA)-L(bkPxQx)
3088		.int L(bkP6QA)-L(bkPxQx)
3089		.int L(bkP7QA)-L(bkPxQx)
3090
3091		.int L(bkP0QB)-L(bkPxQx)
3092		.int L(bkP1QB)-L(bkPxQx)
3093		.int L(bkP2QB)-L(bkPxQx)
3094		.int L(bkP3QB)-L(bkPxQx)
3095		.int L(bkP4QB)-L(bkPxQx)
3096		.int L(bkP5QB)-L(bkPxQx)
3097		.int L(bkP6QB)-L(bkPxQx)
3098		.int L(bkP7QB)-L(bkPxQx)
3099
3100		.int L(bkP0QC)-L(bkPxQx)
3101		.int L(bkP1QC)-L(bkPxQx)
3102		.int L(bkP2QC)-L(bkPxQx)
3103		.int L(bkP3QC)-L(bkPxQx)
3104		.int L(bkP4QC)-L(bkPxQx)
3105		.int L(bkP5QC)-L(bkPxQx)
3106		.int L(bkP6QC)-L(bkPxQx)
3107		.int L(bkP7QC)-L(bkPxQx)
3108
3109		.int L(bkP0QD)-L(bkPxQx)
3110		.int L(bkP1QD)-L(bkPxQx)
3111		.int L(bkP2QD)-L(bkPxQx)
3112		.int L(bkP3QD)-L(bkPxQx)
3113		.int L(bkP4QD)-L(bkPxQx)
3114		.int L(bkP5QD)-L(bkPxQx)
3115		.int L(bkP6QD)-L(bkPxQx)
3116		.int L(bkP7QD)-L(bkPxQx)
3117
3118		.int L(bkP0QE)-L(bkPxQx)
3119		.int L(bkP1QE)-L(bkPxQx)
3120		.int L(bkP2QE)-L(bkPxQx)
3121		.int L(bkP3QE)-L(bkPxQx)
3122		.int L(bkP4QE)-L(bkPxQx)
3123		.int L(bkP5QE)-L(bkPxQx)
3124		.int L(bkP6QE)-L(bkPxQx)
3125		.int L(bkP7QE)-L(bkPxQx)
3126
3127		.int L(bkP0QF)-L(bkPxQx)
3128		.int L(bkP1QF)-L(bkPxQx)
3129		.int L(bkP2QF)-L(bkPxQx)
3130		.int L(bkP3QF)-L(bkPxQx)
3131		.int L(bkP4QF)-L(bkPxQx)
3132		.int L(bkP5QF)-L(bkPxQx)
3133		.int L(bkP6QF)-L(bkPxQx)
3134		.int L(bkP7QF)-L(bkPxQx)
3135
3136		.int L(bkP0QG)-L(bkPxQx)
3137		.int L(bkP1QG)-L(bkPxQx)
3138		.int L(bkP2QG)-L(bkPxQx)
3139		.int L(bkP3QG)-L(bkPxQx)
3140		.int L(bkP4QG)-L(bkPxQx)
3141		.int L(bkP5QG)-L(bkPxQx)
3142		.int L(bkP6QG)-L(bkPxQx)
3143		.int L(bkP7QG)-L(bkPxQx)
3144
3145		.int L(bkP0QH)-L(bkPxQx)
3146		.int L(bkP1QH)-L(bkPxQx)
3147		.int L(bkP2QH)-L(bkPxQx)
3148		.int L(bkP3QH)-L(bkPxQx)
3149		.int L(bkP4QH)-L(bkPxQx)
3150		.int L(bkP5QH)-L(bkPxQx)
3151		.int L(bkP6QH)-L(bkPxQx)
3152		.int L(bkP7QH)-L(bkPxQx)
3153
3154		.int L(bkP0QI)-L(bkPxQx)
3155		.int L(bkP1QI)-L(bkPxQx)
3156		.int L(bkP2QI)-L(bkPxQx)
3157		.int L(bkP3QI)-L(bkPxQx)
3158		.int L(bkP4QI)-L(bkPxQx)
3159		.int L(bkP5QI)-L(bkPxQx)
3160		.int L(bkP6QI)-L(bkPxQx)
3161		.int L(bkP7QI)-L(bkPxQx)
3162
3163	SET_SIZE(memmove)
3164