xref: /titanic_50/usr/src/lib/libc/amd64/gen/memcpy.s (revision 66ea84940ca8687745ad2a165ef9bf49ec13996f)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2008, Intel Corporation
29 * All rights reserved.
30 */
31
32/*
33 * memcpy.s - copies one block of memory to another
34 *	Implements memcpy() and memmove() libc primitives.
35 */
36
37	.file	"memcpy.s"
38
39#include <sys/asm_linkage.h>
40
41	ANSI_PRAGMA_WEAK(memmove,function)
42	ANSI_PRAGMA_WEAK(memcpy,function)
43
44#include "cache.h"
45#include "proc64_id.h"
46
47#define L(s) .memcpy/**/s
48
49/*
50 * memcpy algorithm overview:
51 *
52 * Thresholds used below were determined experimentally.
53 *
54 * Pseudo code:
55 *
56 * If (size <= 128 bytes) {
57 *	do unrolled code (primarily 8-byte loads/stores) regardless of
58 *	alignment.
59 * } else {
60 *	Align destination to 16-byte boundary
61 *
62 *      if (NO_SSE) {
63 *		If (size > half of the largest level cache) {
64 *			Use 8-byte non-temporal stores (64-bytes/loop)
65 *		} else {
66 *			if (size > 4K && size <= half l1 cache size) {
67 *				Use rep movsq
68 *			} else {
69 *				Use 8-byte loads/stores (64 bytes per loop)
70 *			}
71 *		}
72 *
73 *	} else { **USE SSE**
74 *		If (size > half of the largest level cache) {
75 *			Use 16-byte non-temporal stores (128-bytes per loop)
76 *		} else {
77 *			If (both source and destination are aligned) {
78 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
79 *			} else {
80 *			    use pairs of xmm registers with SSE2 or SSSE3
81 *			    instructions to concatenate and shift appropriately
82 *			    to account for source unalignment. This enables
83 *			    16-byte aligned loads to be done.
84 *			}
85 *		}
86 *	}
87 *
88 *	Finish any remaining bytes via unrolled code above.
89 * }
90 *
91 * memmove overview:
92 *	memmove is the same as memcpy except one case where copy needs to be
93 *	done backwards. The copy backwards code is done in a similar manner.
94 */
95
96	ENTRY(memmove)
	/*
	 * memmove(dst, src, len)
	 * In:	%rdi = dst, %rsi = src, %rdx = len
	 *
	 * Only a destination that starts strictly inside the source range
	 * (src < dst < src + len) needs the backwards copy.  Every other
	 * layout is handed to the forward path shared with memcpy.
	 * %r9 is used as scratch for the end-of-source address.
	 */
97	cmp	%rsi,%rdi		# dst at or below src?
98	jbe	L(CopyForward)		# yes: a forward copy is safe
99	mov	%rdx,%r9		# r9 = len
100	add	%rsi,%r9		# r9 = src + len, one past end of src
101	cmp	%r9,%rdi		# dst at or beyond end of src?
102	jae	L(CopyForward)		# yes: no overlap, copy forward
103	jmp	L(CopyBackwards)	# dst inside src range: copy backwards
104
105	ENTRY (memcpy)
106L(CopyForward):
	/*
	 * memcpy(dst, src, len); also the forward path of memmove.
	 * In:	%rdi = dst, %rsi = src, %rdx = len
	 * Out:	%rax = dst
	 *
	 * Register roles set up here and used by the code that follows:
	 *	%r8  = remaining byte count
	 *	%rcx = destination pointer
	 *	%rdx = source pointer
	 *
	 * Lengths of 0..128 are dispatched through L(fwdPxQx), which is
	 * indexed by the byte count.  For that path both pointers are first
	 * moved to one past the end of their buffers, because the unrolled
	 * copy code addresses with negative offsets.
	 *
	 * The length is unsigned, so the range check must use an unsigned
	 * branch (ja).  With a signed branch a length that has its top bit
	 * set would be treated as "small" and used as an index far outside
	 * the 129-entry table.
	 */
107	mov    %rdx,%r8
108	mov    %rdi,%rcx
109	mov    %rsi,%rdx
110	mov    %rdi,%rax
111	lea    L(fwdPxQx)(%rip),%r11
112	cmp    $0x80,%r8		# 128
113	ja     L(ck_use_sse2)
114	add    %r8,%rcx
115	add    %r8,%rdx
116
	/* table entries are 32-bit offsets relative to the table base */
117	movslq (%r11,%r8,4),%r10
118	lea    (%r10,%r11,1),%r11
119	jmpq   *%r11
120
121	.balign 16
122L(ShrtAlignNew):
	/*
	 * Destination (%rcx) is not 16-byte aligned.  Use its low four bits
	 * as the slot number in L(AliPxQx) and jump to the stub that copies
	 * just enough leading bytes to reach the next 16-byte boundary.
	 * Scratch: %r9 (slot), %r10 (table offset), %r11 (target address).
	 */
123	mov    %ecx,%r9d
124	and    $0xf,%r9d		# r9 = dest & 0xf (upper bits zeroed)
125	lea    L(AliPxQx)(%rip),%r11
126
127	movslq (%r11,%r9,4),%r10	# signed 32-bit offset from table base
128	lea    (%r11,%r10,1),%r11
129	jmpq   *%r11
130
131	.balign 16
	/*
	 * Forward small-copy dispatch table, indexed by byte count 0..128
	 * (see the movslq/lea/jmpq sequence in L(CopyForward)).
	 * Each entry is the 32-bit offset of its target from the table base.
	 * Slot n points at L(PxQy) with x = n % 8 trailing bytes and
	 * y = n / 8 quadwords (y written as a hex digit, G = 16), so the
	 * order of the entries must not be changed.
	 */
132L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
133           .int        L(P1Q0)-L(fwdPxQx)
134           .int        L(P2Q0)-L(fwdPxQx)
135           .int        L(P3Q0)-L(fwdPxQx)
136           .int        L(P4Q0)-L(fwdPxQx)
137           .int        L(P5Q0)-L(fwdPxQx)
138           .int        L(P6Q0)-L(fwdPxQx)
139           .int        L(P7Q0)-L(fwdPxQx)
140
141           .int        L(P0Q1)-L(fwdPxQx)
142           .int        L(P1Q1)-L(fwdPxQx)
143           .int        L(P2Q1)-L(fwdPxQx)
144           .int        L(P3Q1)-L(fwdPxQx)
145           .int        L(P4Q1)-L(fwdPxQx)
146           .int        L(P5Q1)-L(fwdPxQx)
147           .int        L(P6Q1)-L(fwdPxQx)
148           .int        L(P7Q1)-L(fwdPxQx)
149
150           .int        L(P0Q2)-L(fwdPxQx)
151           .int        L(P1Q2)-L(fwdPxQx)
152           .int        L(P2Q2)-L(fwdPxQx)
153           .int        L(P3Q2)-L(fwdPxQx)
154           .int        L(P4Q2)-L(fwdPxQx)
155           .int        L(P5Q2)-L(fwdPxQx)
156           .int        L(P6Q2)-L(fwdPxQx)
157           .int        L(P7Q2)-L(fwdPxQx)
158
159           .int        L(P0Q3)-L(fwdPxQx)
160           .int        L(P1Q3)-L(fwdPxQx)
161           .int        L(P2Q3)-L(fwdPxQx)
162           .int        L(P3Q3)-L(fwdPxQx)
163           .int        L(P4Q3)-L(fwdPxQx)
164           .int        L(P5Q3)-L(fwdPxQx)
165           .int        L(P6Q3)-L(fwdPxQx)
166           .int        L(P7Q3)-L(fwdPxQx)
167
168           .int        L(P0Q4)-L(fwdPxQx)
169           .int        L(P1Q4)-L(fwdPxQx)
170           .int        L(P2Q4)-L(fwdPxQx)
171           .int        L(P3Q4)-L(fwdPxQx)
172           .int        L(P4Q4)-L(fwdPxQx)
173           .int        L(P5Q4)-L(fwdPxQx)
174           .int        L(P6Q4)-L(fwdPxQx)
175           .int        L(P7Q4)-L(fwdPxQx)
176
177           .int        L(P0Q5)-L(fwdPxQx)
178           .int        L(P1Q5)-L(fwdPxQx)
179           .int        L(P2Q5)-L(fwdPxQx)
180           .int        L(P3Q5)-L(fwdPxQx)
181           .int        L(P4Q5)-L(fwdPxQx)
182           .int        L(P5Q5)-L(fwdPxQx)
183           .int        L(P6Q5)-L(fwdPxQx)
184           .int        L(P7Q5)-L(fwdPxQx)
185
186           .int        L(P0Q6)-L(fwdPxQx)
187           .int        L(P1Q6)-L(fwdPxQx)
188           .int        L(P2Q6)-L(fwdPxQx)
189           .int        L(P3Q6)-L(fwdPxQx)
190           .int        L(P4Q6)-L(fwdPxQx)
191           .int        L(P5Q6)-L(fwdPxQx)
192           .int        L(P6Q6)-L(fwdPxQx)
193           .int        L(P7Q6)-L(fwdPxQx)
194
195           .int        L(P0Q7)-L(fwdPxQx)
196           .int        L(P1Q7)-L(fwdPxQx)
197           .int        L(P2Q7)-L(fwdPxQx)
198           .int        L(P3Q7)-L(fwdPxQx)
199           .int        L(P4Q7)-L(fwdPxQx)
200           .int        L(P5Q7)-L(fwdPxQx)
201           .int        L(P6Q7)-L(fwdPxQx)
202           .int        L(P7Q7)-L(fwdPxQx)
203
204           .int        L(P0Q8)-L(fwdPxQx)
205           .int        L(P1Q8)-L(fwdPxQx)
206           .int        L(P2Q8)-L(fwdPxQx)
207           .int        L(P3Q8)-L(fwdPxQx)
208           .int        L(P4Q8)-L(fwdPxQx)
209           .int        L(P5Q8)-L(fwdPxQx)
210           .int        L(P6Q8)-L(fwdPxQx)
211           .int        L(P7Q8)-L(fwdPxQx)
212
213           .int        L(P0Q9)-L(fwdPxQx)
214           .int        L(P1Q9)-L(fwdPxQx)
215           .int        L(P2Q9)-L(fwdPxQx)
216           .int        L(P3Q9)-L(fwdPxQx)
217           .int        L(P4Q9)-L(fwdPxQx)
218           .int        L(P5Q9)-L(fwdPxQx)
219           .int        L(P6Q9)-L(fwdPxQx)
220           .int        L(P7Q9)-L(fwdPxQx)
221
222           .int        L(P0QA)-L(fwdPxQx)
223           .int        L(P1QA)-L(fwdPxQx)
224           .int        L(P2QA)-L(fwdPxQx)
225           .int        L(P3QA)-L(fwdPxQx)
226           .int        L(P4QA)-L(fwdPxQx)
227           .int        L(P5QA)-L(fwdPxQx)
228           .int        L(P6QA)-L(fwdPxQx)
229           .int        L(P7QA)-L(fwdPxQx)
230
231           .int        L(P0QB)-L(fwdPxQx)
232           .int        L(P1QB)-L(fwdPxQx)
233           .int        L(P2QB)-L(fwdPxQx)
234           .int        L(P3QB)-L(fwdPxQx)
235           .int        L(P4QB)-L(fwdPxQx)
236           .int        L(P5QB)-L(fwdPxQx)
237           .int        L(P6QB)-L(fwdPxQx)
238           .int        L(P7QB)-L(fwdPxQx)
239
240           .int        L(P0QC)-L(fwdPxQx)
241           .int        L(P1QC)-L(fwdPxQx)
242           .int        L(P2QC)-L(fwdPxQx)
243           .int        L(P3QC)-L(fwdPxQx)
244           .int        L(P4QC)-L(fwdPxQx)
245           .int        L(P5QC)-L(fwdPxQx)
246           .int        L(P6QC)-L(fwdPxQx)
247           .int        L(P7QC)-L(fwdPxQx)
248
249           .int        L(P0QD)-L(fwdPxQx)
250           .int        L(P1QD)-L(fwdPxQx)
251           .int        L(P2QD)-L(fwdPxQx)
252           .int        L(P3QD)-L(fwdPxQx)
253           .int        L(P4QD)-L(fwdPxQx)
254           .int        L(P5QD)-L(fwdPxQx)
255           .int        L(P6QD)-L(fwdPxQx)
256           .int        L(P7QD)-L(fwdPxQx)
257
258           .int        L(P0QE)-L(fwdPxQx)
259           .int        L(P1QE)-L(fwdPxQx)
260           .int        L(P2QE)-L(fwdPxQx)
261           .int        L(P3QE)-L(fwdPxQx)
262           .int        L(P4QE)-L(fwdPxQx)
263           .int        L(P5QE)-L(fwdPxQx)
264           .int        L(P6QE)-L(fwdPxQx)
265           .int        L(P7QE)-L(fwdPxQx)
266
267           .int        L(P0QF)-L(fwdPxQx)
268           .int        L(P1QF)-L(fwdPxQx)
269           .int        L(P2QF)-L(fwdPxQx)
270           .int        L(P3QF)-L(fwdPxQx)
271           .int        L(P4QF)-L(fwdPxQx)
272           .int        L(P5QF)-L(fwdPxQx)
273           .int        L(P6QF)-L(fwdPxQx)
274           .int        L(P7QF)-L(fwdPxQx)
275
276           .int        L(P0QG)-L(fwdPxQx)	# 0x80
277
278	   .balign 16
	/*
	 * Destination alignment dispatch table, indexed by (dest & 0xf) in
	 * L(ShrtAlignNew).  Each entry is a 32-bit offset from the table
	 * base.  Slot k (k = 1..15) points at the stub that copies 16 - k
	 * bytes so that dest ends up on a 16-byte boundary; slot 0 goes
	 * straight to L(now_qw_aligned).  Entry order must not be changed.
	 */
279L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
280           .int        L(A1Q0)-L(AliPxQx)
281           .int        L(A2Q0)-L(AliPxQx)
282           .int        L(A3Q0)-L(AliPxQx)
283           .int        L(A4Q0)-L(AliPxQx)
284           .int        L(A5Q0)-L(AliPxQx)
285           .int        L(A6Q0)-L(AliPxQx)
286           .int        L(A7Q0)-L(AliPxQx)
287           .int        L(A0Q1)-L(AliPxQx)
288           .int        L(A1Q1)-L(AliPxQx)
289           .int        L(A2Q1)-L(AliPxQx)
290           .int        L(A3Q1)-L(AliPxQx)
291           .int        L(A4Q1)-L(AliPxQx)
292           .int        L(A5Q1)-L(AliPxQx)
293           .int        L(A6Q1)-L(AliPxQx)
294           .int        L(A7Q1)-L(AliPxQx)
295
296	.balign 16
297L(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
	/*
	 * L(AliPxQx) slot 1 (dest & 0xf == 1): copy 15 bytes as 1+2+4+8,
	 * lowest address first, then advance src (%rdx) and dest (%rcx)
	 * by 15 and shrink the remaining count (%r8) by 15, which leaves
	 * dest on a 16-byte boundary.
	 */
298	movzbq (%rdx),%r11
299	sub    $0xf,%r8
300	mov    %r11b,(%rcx)
301
302	movzwq 0x1(%rdx),%r10
303	mov    %r10w,0x1(%rcx)
304
305	mov    0x3(%rdx),%r9d
306	mov    %r9d,0x3(%rcx)
307
308	mov    0x7(%rdx),%r11
309	add    $0xf,%rdx
310	mov    %r11,0x7(%rcx)
311
312	add    $0xf,%rcx
313	jmp    L(now_qw_aligned)
314
315	.balign 16
316L(A2Q0):			# ; need to move 8+ 6=2+4 bytes
	/*
	 * L(AliPxQx) slot 2 (dest & 0xf == 2): copy 14 bytes as 2+4+8,
	 * lowest address first; %rdx/%rcx advance by 14 and %r8 drops by
	 * 14, leaving dest on a 16-byte boundary.
	 */
317	movzwq (%rdx),%r10
318	sub    $0xe,%r8
319	mov    %r10w,(%rcx)
320
321	mov    0x2(%rdx),%r9d
322	mov    %r9d,0x2(%rcx)
323
324	mov    0x6(%rdx),%r11
325	add    $0xe,%rdx
326	mov    %r11,0x6(%rcx)
327	add    $0xe,%rcx
328	jmp    L(now_qw_aligned)
329
330	.balign 16
331L(A3Q0):			# ; need to move 8+ 5=1+4 bytes
	/*
	 * L(AliPxQx) slot 3 (dest & 0xf == 3): copy 13 bytes as 1+4+8,
	 * lowest address first; %rdx/%rcx advance by 13 and %r8 drops by
	 * 13, leaving dest on a 16-byte boundary.
	 */
332	movzbq (%rdx),%r11
333	sub    $0xd,%r8
334	mov    %r11b,(%rcx)
335
336	mov    0x1(%rdx),%r9d
337	mov    %r9d,0x1(%rcx)
338
339	mov    0x5(%rdx),%r10
340	add    $0xd,%rdx
341	mov    %r10,0x5(%rcx)
342
343	add    $0xd,%rcx
344	jmp    L(now_qw_aligned)
345
346	.balign 16
347L(A4Q0):			# ; need to move 8+4 bytes
	/*
	 * L(AliPxQx) slot 4 (dest & 0xf == 4): copy 12 bytes as 4+8,
	 * lowest address first; %rdx/%rcx advance by 12 and %r8 drops by
	 * 12, leaving dest on a 16-byte boundary.
	 */
348	mov    (%rdx),%r9d
349	sub    $0xc,%r8
350	mov    %r9d,(%rcx)
351
352	mov    0x4(%rdx),%r10
353	add    $0xc,%rdx
354	mov    %r10,0x4(%rcx)
355
356	add    $0xc,%rcx
357	jmp    L(now_qw_aligned)
358
359	.balign 16
360L(A5Q0):			# ; need to move 8+ 3=1+2 bytes
	/*
	 * L(AliPxQx) slot 5 (dest & 0xf == 5): copy 11 bytes as 1+2+8,
	 * lowest address first; %rdx/%rcx advance by 11 and %r8 drops by
	 * 11, leaving dest on a 16-byte boundary.
	 */
361	movzbq (%rdx),%r11
362	sub    $0xb,%r8
363	mov    %r11b,(%rcx)
364
365	movzwq 0x1(%rdx),%r10
366	mov    %r10w,0x1(%rcx)
367
368	mov    0x3(%rdx),%r9
369	add    $0xb,%rdx
370	mov    %r9,0x3(%rcx)
371
372	add    $0xb,%rcx
373	jmp    L(now_qw_aligned)
374
375	.balign 16
376L(A6Q0):			# ; need to move 8+2 bytes
	/*
	 * L(AliPxQx) slot 6 (dest & 0xf == 6): copy 10 bytes as 2+8,
	 * lowest address first; %rdx/%rcx advance by 10 and %r8 drops by
	 * 10, leaving dest on a 16-byte boundary.
	 */
377	movzwq (%rdx),%r10
378	sub    $0xa,%r8
379	mov    %r10w,(%rcx)
380
381	mov    0x2(%rdx),%r9
382	add    $0xa,%rdx
383	mov    %r9,0x2(%rcx)
384
385	add    $0xa,%rcx
386	jmp    L(now_qw_aligned)
387
388	.balign 16
389L(A7Q0):			# ; need to move 8+1 byte
	/*
	 * L(AliPxQx) slot 7 (dest & 0xf == 7): copy 9 bytes as 1+8,
	 * lowest address first; %rdx/%rcx advance by 9 and %r8 drops by
	 * 9, leaving dest on a 16-byte boundary.
	 */
390	movzbq (%rdx),%r11
391	sub    $0x9,%r8
392	mov    %r11b,(%rcx)
393
394	mov    0x1(%rdx),%r10
395	add    $0x9,%rdx
396	mov    %r10,0x1(%rcx)
397
398	add    $0x9,%rcx
399	jmp    L(now_qw_aligned)
400
401	.balign 16
402L(A0Q1):			# ; need to move 8 bytes
	/*
	 * L(AliPxQx) slot 8 (dest & 0xf == 8): copy one quadword;
	 * %rdx/%rcx advance by 8 and %r8 drops by 8, leaving dest on a
	 * 16-byte boundary.
	 */
403
404	mov    (%rdx),%r10
405	add    $0x8,%rdx
406	sub    $0x8,%r8
407	mov    %r10,(%rcx)
408
409	add    $0x8,%rcx
410	jmp    L(now_qw_aligned)
411
412	.balign 16
413L(A1Q1):			# ; need to move 7=1+2+4 bytes
	/*
	 * L(AliPxQx) slot 9 (dest & 0xf == 9): copy 7 bytes as 1+2+4,
	 * lowest address first; %rdx/%rcx advance by 7 and %r8 drops by
	 * 7, leaving dest on a 16-byte boundary.
	 */
414	movzbq (%rdx),%r11
415	sub    $0x7,%r8
416	mov    %r11b,(%rcx)
417
418	movzwq 0x1(%rdx),%r10
419	mov    %r10w,0x1(%rcx)
420
421	mov    0x3(%rdx),%r9d
422	add    $0x7,%rdx
423	mov    %r9d,0x3(%rcx)
424	add    $0x7,%rcx
425	jmp    L(now_qw_aligned)
426
427	.balign 16
428L(A2Q1):			# ; need to move 6=2+4 bytes
	/*
	 * L(AliPxQx) slot 10 (dest & 0xf == 0xa): copy 6 bytes as 2+4,
	 * lowest address first; %rdx/%rcx advance by 6 and %r8 drops by
	 * 6, leaving dest on a 16-byte boundary.
	 */
429	movzwq (%rdx),%r10
430	sub    $0x6,%r8
431	mov    %r10w,(%rcx)
432	mov    0x2(%rdx),%r9d
433	add    $0x6,%rdx
434	mov    %r9d,0x2(%rcx)
435	add    $0x6,%rcx
436	jmp    L(now_qw_aligned)
437
438	.balign 16
439L(A3Q1):			# ; need to move 5=1+4 bytes
	/*
	 * L(AliPxQx) slot 11 (dest & 0xf == 0xb): copy 5 bytes as 1+4,
	 * lowest address first; %rdx/%rcx advance by 5 and %r8 drops by
	 * 5, leaving dest on a 16-byte boundary.
	 */
440	movzbq (%rdx),%r11
441	sub    $0x5,%r8
442	mov    %r11b,(%rcx)
443	mov    0x1(%rdx),%r9d
444	add    $0x5,%rdx
445	mov    %r9d,0x1(%rcx)
446	add    $0x5,%rcx
447	jmp    L(now_qw_aligned)
448
449	.balign 16
450L(A4Q1):			# ; need to move 4 bytes
	/*
	 * L(AliPxQx) slot 12 (dest & 0xf == 0xc): copy one 4-byte word;
	 * %rdx/%rcx advance by 4 and %r8 drops by 4, leaving dest on a
	 * 16-byte boundary.
	 */
451	mov    (%rdx),%r9d
452	sub    $0x4,%r8
453	add    $0x4,%rdx
454	mov    %r9d,(%rcx)
455	add    $0x4,%rcx
456	jmp    L(now_qw_aligned)
457
458	.balign 16
459L(A5Q1):			# ; need to move 3=1+2 bytes
	/*
	 * L(AliPxQx) slot 13 (dest & 0xf == 0xd): copy 3 bytes as 1+2,
	 * lowest address first; %rdx/%rcx advance by 3 and %r8 drops by
	 * 3, leaving dest on a 16-byte boundary.
	 */
460	movzbq (%rdx),%r11
461	sub    $0x3,%r8
462	mov    %r11b,(%rcx)
463
464	movzwq 0x1(%rdx),%r10
465	add    $0x3,%rdx
466	mov    %r10w,0x1(%rcx)
467
468	add    $0x3,%rcx
469	jmp    L(now_qw_aligned)
470
471	.balign 16
472L(A6Q1):			# ; need to move 2 bytes
	/*
	 * L(AliPxQx) slot 14 (dest & 0xf == 0xe): copy one 2-byte word;
	 * %rdx/%rcx advance by 2 and %r8 drops by 2, leaving dest on a
	 * 16-byte boundary.
	 */
473	movzwq (%rdx),%r10
474	sub    $0x2,%r8
475	add    $0x2,%rdx
476	mov    %r10w,(%rcx)
477	add    $0x2,%rcx
478	jmp    L(now_qw_aligned)
479
480	.balign 16
481L(A7Q1):			# ; need to move 1 byte
	/*
	 * L(AliPxQx) slot 15 (dest & 0xf == 0xf): copy a single byte;
	 * %rdx/%rcx advance by 1 and %r8 drops by 1, leaving dest on a
	 * 16-byte boundary.
	 */
482	movzbq (%rdx),%r11
483	dec    %r8
484	inc    %rdx
485	mov    %r11b,(%rcx)
486	inc    %rcx
487	jmp    L(now_qw_aligned)
488
489
490	.balign 16
	/*
	 * Unrolled forward copy for sizes that are a multiple of 8
	 * (0..128 bytes).  On entry %rdx and %rcx point one past the end
	 * of the source and destination, so every access uses a negative
	 * offset.  L(P0Qy) copies y quadwords (G = 16), lowest address
	 * first, by falling through the labels below it.  Each quadword is
	 * loaded and then stored before the next one is touched; %r9 and
	 * %r10 are the only scratch registers.  L(P0Q0) (size 0) just
	 * returns.
	 */
491L(P0QG):
492	mov    -0x80(%rdx),%r9
493	mov    %r9,-0x80(%rcx)
494L(P0QF):
495	mov    -0x78(%rdx),%r10
496	mov    %r10,-0x78(%rcx)
497L(P0QE):
498	mov    -0x70(%rdx),%r9
499	mov    %r9,-0x70(%rcx)
500L(P0QD):
501	mov    -0x68(%rdx),%r10
502	mov    %r10,-0x68(%rcx)
503L(P0QC):
504	mov    -0x60(%rdx),%r9
505	mov    %r9,-0x60(%rcx)
506L(P0QB):
507	mov    -0x58(%rdx),%r10
508	mov    %r10,-0x58(%rcx)
509L(P0QA):
510	mov    -0x50(%rdx),%r9
511	mov    %r9,-0x50(%rcx)
512L(P0Q9):
513	mov    -0x48(%rdx),%r10
514	mov    %r10,-0x48(%rcx)
515L(P0Q8):
516	mov    -0x40(%rdx),%r9
517	mov    %r9,-0x40(%rcx)
518L(P0Q7):
519	mov    -0x38(%rdx),%r10
520	mov    %r10,-0x38(%rcx)
521L(P0Q6):
522	mov    -0x30(%rdx),%r9
523	mov    %r9,-0x30(%rcx)
524L(P0Q5):
525	mov    -0x28(%rdx),%r10
526	mov    %r10,-0x28(%rcx)
527L(P0Q4):
528	mov    -0x20(%rdx),%r9
529	mov    %r9,-0x20(%rcx)
530L(P0Q3):
531	mov    -0x18(%rdx),%r10
532	mov    %r10,-0x18(%rcx)
533L(P0Q2):
534	mov    -0x10(%rdx),%r9
535	mov    %r9,-0x10(%rcx)
536L(P0Q1):
537	mov    -0x8(%rdx),%r10
538	mov    %r10,-0x8(%rcx)
539L(P0Q0):
540	ret
541
542	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 1 bytes (1..121).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P1Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P1Q0) then copies the final
	 * byte and returns.  Scratch: %r9, %r10, %r11.
	 */
543L(P1QF):
544	mov    -0x79(%rdx),%r9
545	mov    %r9,-0x79(%rcx)
546L(P1QE):
547	mov    -0x71(%rdx),%r11
548	mov    %r11,-0x71(%rcx)
549L(P1QD):
550	mov    -0x69(%rdx),%r10
551	mov    %r10,-0x69(%rcx)
552L(P1QC):
553	mov    -0x61(%rdx),%r9
554	mov    %r9,-0x61(%rcx)
555L(P1QB):
556	mov    -0x59(%rdx),%r11
557	mov    %r11,-0x59(%rcx)
558L(P1QA):
559	mov    -0x51(%rdx),%r10
560	mov    %r10,-0x51(%rcx)
561L(P1Q9):
562	mov    -0x49(%rdx),%r9
563	mov    %r9,-0x49(%rcx)
564L(P1Q8):
565	mov    -0x41(%rdx),%r11
566	mov    %r11,-0x41(%rcx)
567L(P1Q7):
568	mov    -0x39(%rdx),%r10
569	mov    %r10,-0x39(%rcx)
570L(P1Q6):
571	mov    -0x31(%rdx),%r9
572	mov    %r9,-0x31(%rcx)
573L(P1Q5):
574	mov    -0x29(%rdx),%r11
575	mov    %r11,-0x29(%rcx)
576L(P1Q4):
577	mov    -0x21(%rdx),%r10
578	mov    %r10,-0x21(%rcx)
579L(P1Q3):
580	mov    -0x19(%rdx),%r9
581	mov    %r9,-0x19(%rcx)
582L(P1Q2):
583	mov    -0x11(%rdx),%r11
584	mov    %r11,-0x11(%rcx)
585L(P1Q1):
586	mov    -0x9(%rdx),%r10
587	mov    %r10,-0x9(%rcx)
588L(P1Q0):
589	movzbq -0x1(%rdx),%r9
590	mov    %r9b,-0x1(%rcx)
591	ret
592
593	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 2 bytes (2..122).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P2Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P2Q0) then copies the final
	 * 2-byte word and returns.  Scratch: %r9, %r10, %r11.
	 */
594L(P2QF):
595	mov    -0x7a(%rdx),%r9
596	mov    %r9,-0x7a(%rcx)
597L(P2QE):
598	mov    -0x72(%rdx),%r11
599	mov    %r11,-0x72(%rcx)
600L(P2QD):
601	mov    -0x6a(%rdx),%r10
602	mov    %r10,-0x6a(%rcx)
603L(P2QC):
604	mov    -0x62(%rdx),%r9
605	mov    %r9,-0x62(%rcx)
606L(P2QB):
607	mov    -0x5a(%rdx),%r11
608	mov    %r11,-0x5a(%rcx)
609L(P2QA):
610	mov    -0x52(%rdx),%r10
611	mov    %r10,-0x52(%rcx)
612L(P2Q9):
613	mov    -0x4a(%rdx),%r9
614	mov    %r9,-0x4a(%rcx)
615L(P2Q8):
616	mov    -0x42(%rdx),%r11
617	mov    %r11,-0x42(%rcx)
618L(P2Q7):
619	mov    -0x3a(%rdx),%r10
620	mov    %r10,-0x3a(%rcx)
621L(P2Q6):
622	mov    -0x32(%rdx),%r9
623	mov    %r9,-0x32(%rcx)
624L(P2Q5):
625	mov    -0x2a(%rdx),%r11
626	mov    %r11,-0x2a(%rcx)
627L(P2Q4):
628	mov    -0x22(%rdx),%r10
629	mov    %r10,-0x22(%rcx)
630L(P2Q3):
631	mov    -0x1a(%rdx),%r9
632	mov    %r9,-0x1a(%rcx)
633L(P2Q2):
634	mov    -0x12(%rdx),%r11
635	mov    %r11,-0x12(%rcx)
636L(P2Q1):
637	mov    -0xa(%rdx),%r10
638	mov    %r10,-0xa(%rcx)
639L(P2Q0):
640	movzwq -0x2(%rdx),%r9
641	mov    %r9w,-0x2(%rcx)
642	ret
643
644	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 3 bytes (3..123).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P3Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P3Q0) then copies the final
	 * 3 bytes as 2+1 and returns.  Scratch: %r9, %r10, %r11.
	 */
645L(P3QF):
646	mov    -0x7b(%rdx),%r9
647	mov    %r9,-0x7b(%rcx)
648L(P3QE):
649	mov    -0x73(%rdx),%r11
650	mov    %r11,-0x73(%rcx)
651L(P3QD):
652	mov    -0x6b(%rdx),%r10
653	mov    %r10,-0x6b(%rcx)
654L(P3QC):
655	mov    -0x63(%rdx),%r9
656	mov    %r9,-0x63(%rcx)
657L(P3QB):
658	mov    -0x5b(%rdx),%r11
659	mov    %r11,-0x5b(%rcx)
660L(P3QA):
661	mov    -0x53(%rdx),%r10
662	mov    %r10,-0x53(%rcx)
663L(P3Q9):
664	mov    -0x4b(%rdx),%r9
665	mov    %r9,-0x4b(%rcx)
666L(P3Q8):
667	mov    -0x43(%rdx),%r11
668	mov    %r11,-0x43(%rcx)
669L(P3Q7):
670	mov    -0x3b(%rdx),%r10
671	mov    %r10,-0x3b(%rcx)
672L(P3Q6):
673	mov    -0x33(%rdx),%r9
674	mov    %r9,-0x33(%rcx)
675L(P3Q5):
676	mov    -0x2b(%rdx),%r11
677	mov    %r11,-0x2b(%rcx)
678L(P3Q4):
679	mov    -0x23(%rdx),%r10
680	mov    %r10,-0x23(%rcx)
681L(P3Q3):
682	mov    -0x1b(%rdx),%r9
683	mov    %r9,-0x1b(%rcx)
684L(P3Q2):
685	mov    -0x13(%rdx),%r11
686	mov    %r11,-0x13(%rcx)
687L(P3Q1):
688	mov    -0xb(%rdx),%r10
689	mov    %r10,-0xb(%rcx)
690	/*
691	 * These trailing loads/stores have to do all their loads 1st,
692	 * then do the stores.
693	 */
694L(P3Q0):
695	movzwq -0x3(%rdx),%r9
696	movzbq -0x1(%rdx),%r10
697	mov    %r9w,-0x3(%rcx)
698	mov    %r10b,-0x1(%rcx)
699	ret
700
701	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 4 bytes (4..124).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P4Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P4Q0) then copies the final
	 * 4-byte word and returns.  Scratch: %r9, %r10, %r11.
	 */
702L(P4QF):
703	mov    -0x7c(%rdx),%r9
704	mov    %r9,-0x7c(%rcx)
705L(P4QE):
706	mov    -0x74(%rdx),%r11
707	mov    %r11,-0x74(%rcx)
708L(P4QD):
709	mov    -0x6c(%rdx),%r10
710	mov    %r10,-0x6c(%rcx)
711L(P4QC):
712	mov    -0x64(%rdx),%r9
713	mov    %r9,-0x64(%rcx)
714L(P4QB):
715	mov    -0x5c(%rdx),%r11
716	mov    %r11,-0x5c(%rcx)
717L(P4QA):
718	mov    -0x54(%rdx),%r10
719	mov    %r10,-0x54(%rcx)
720L(P4Q9):
721	mov    -0x4c(%rdx),%r9
722	mov    %r9,-0x4c(%rcx)
723L(P4Q8):
724	mov    -0x44(%rdx),%r11
725	mov    %r11,-0x44(%rcx)
726L(P4Q7):
727	mov    -0x3c(%rdx),%r10
728	mov    %r10,-0x3c(%rcx)
729L(P4Q6):
730	mov    -0x34(%rdx),%r9
731	mov    %r9,-0x34(%rcx)
732L(P4Q5):
733	mov    -0x2c(%rdx),%r11
734	mov    %r11,-0x2c(%rcx)
735L(P4Q4):
736	mov    -0x24(%rdx),%r10
737	mov    %r10,-0x24(%rcx)
738L(P4Q3):
739	mov    -0x1c(%rdx),%r9
740	mov    %r9,-0x1c(%rcx)
741L(P4Q2):
742	mov    -0x14(%rdx),%r11
743	mov    %r11,-0x14(%rcx)
744L(P4Q1):
745	mov    -0xc(%rdx),%r10
746	mov    %r10,-0xc(%rcx)
747L(P4Q0):
748	mov    -0x4(%rdx),%r9d
749	mov    %r9d,-0x4(%rcx)
750	ret
751
752	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 5 bytes (5..125).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P5Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P5Q0) then copies the final
	 * 5 bytes as 4+1 and returns.  Scratch: %r9, %r10, %r11.
	 */
753L(P5QF):
754	mov    -0x7d(%rdx),%r9
755	mov    %r9,-0x7d(%rcx)
756L(P5QE):
757	mov    -0x75(%rdx),%r11
758	mov    %r11,-0x75(%rcx)
759L(P5QD):
760	mov    -0x6d(%rdx),%r10
761	mov    %r10,-0x6d(%rcx)
762L(P5QC):
763	mov    -0x65(%rdx),%r9
764	mov    %r9,-0x65(%rcx)
765L(P5QB):
766	mov    -0x5d(%rdx),%r11
767	mov    %r11,-0x5d(%rcx)
768L(P5QA):
769	mov    -0x55(%rdx),%r10
770	mov    %r10,-0x55(%rcx)
771L(P5Q9):
772	mov    -0x4d(%rdx),%r9
773	mov    %r9,-0x4d(%rcx)
774L(P5Q8):
775	mov    -0x45(%rdx),%r11
776	mov    %r11,-0x45(%rcx)
777L(P5Q7):
778	mov    -0x3d(%rdx),%r10
779	mov    %r10,-0x3d(%rcx)
780L(P5Q6):
781	mov    -0x35(%rdx),%r9
782	mov    %r9,-0x35(%rcx)
783L(P5Q5):
784	mov    -0x2d(%rdx),%r11
785	mov    %r11,-0x2d(%rcx)
786L(P5Q4):
787	mov    -0x25(%rdx),%r10
788	mov    %r10,-0x25(%rcx)
789L(P5Q3):
790	mov    -0x1d(%rdx),%r9
791	mov    %r9,-0x1d(%rcx)
792L(P5Q2):
793	mov    -0x15(%rdx),%r11
794	mov    %r11,-0x15(%rcx)
795L(P5Q1):
796	mov    -0xd(%rdx),%r10
797	mov    %r10,-0xd(%rcx)
798	/*
799	 * These trailing loads/stores have to do all their loads 1st,
800	 * then do the stores.
801	 */
802L(P5Q0):
803	mov    -0x5(%rdx),%r9d
804	movzbq -0x1(%rdx),%r10
805	mov    %r9d,-0x5(%rcx)
806	mov    %r10b,-0x1(%rcx)
807	ret
808
809	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 6 bytes (6..126).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P6Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P6Q0) then copies the final
	 * 6 bytes as 4+2 and returns.  Scratch: %r9, %r10, %r11.
	 */
810L(P6QF):
811	mov    -0x7e(%rdx),%r9
812	mov    %r9,-0x7e(%rcx)
813L(P6QE):
814	mov    -0x76(%rdx),%r11
815	mov    %r11,-0x76(%rcx)
816L(P6QD):
817	mov    -0x6e(%rdx),%r10
818	mov    %r10,-0x6e(%rcx)
819L(P6QC):
820	mov    -0x66(%rdx),%r9
821	mov    %r9,-0x66(%rcx)
822L(P6QB):
823	mov    -0x5e(%rdx),%r11
824	mov    %r11,-0x5e(%rcx)
825L(P6QA):
826	mov    -0x56(%rdx),%r10
827	mov    %r10,-0x56(%rcx)
828L(P6Q9):
829	mov    -0x4e(%rdx),%r9
830	mov    %r9,-0x4e(%rcx)
831L(P6Q8):
832	mov    -0x46(%rdx),%r11
833	mov    %r11,-0x46(%rcx)
834L(P6Q7):
835	mov    -0x3e(%rdx),%r10
836	mov    %r10,-0x3e(%rcx)
837L(P6Q6):
838	mov    -0x36(%rdx),%r9
839	mov    %r9,-0x36(%rcx)
840L(P6Q5):
841	mov    -0x2e(%rdx),%r11
842	mov    %r11,-0x2e(%rcx)
843L(P6Q4):
844	mov    -0x26(%rdx),%r10
845	mov    %r10,-0x26(%rcx)
846L(P6Q3):
847	mov    -0x1e(%rdx),%r9
848	mov    %r9,-0x1e(%rcx)
849L(P6Q2):
850	mov    -0x16(%rdx),%r11
851	mov    %r11,-0x16(%rcx)
852L(P6Q1):
853	mov    -0xe(%rdx),%r10
854	mov    %r10,-0xe(%rcx)
855	/*
856	 * These trailing loads/stores have to do all their loads 1st,
857	 * then do the stores.
858	 */
859L(P6Q0):
860	mov    -0x6(%rdx),%r9d
861	movzwq -0x2(%rdx),%r10
862	mov    %r9d,-0x6(%rcx)
863	mov    %r10w,-0x2(%rcx)
864	ret
865
866	.balign 16
	/*
	 * Unrolled forward copy for sizes of 8*y + 7 bytes (7..127).
	 * %rdx/%rcx point one past the end of source/destination.
	 * L(P7Qy) copies y quadwords, lowest address first, by falling
	 * through the labels below it; L(P7Q0) then copies the final
	 * 7 bytes as 4+2+1 and returns.  Scratch: %r9, %r10, %r11.
	 */
867L(P7QF):
868	mov    -0x7f(%rdx),%r9
869	mov    %r9,-0x7f(%rcx)
870L(P7QE):
871	mov    -0x77(%rdx),%r11
872	mov    %r11,-0x77(%rcx)
873L(P7QD):
874	mov    -0x6f(%rdx),%r10
875	mov    %r10,-0x6f(%rcx)
876L(P7QC):
877	mov    -0x67(%rdx),%r9
878	mov    %r9,-0x67(%rcx)
879L(P7QB):
880	mov    -0x5f(%rdx),%r11
881	mov    %r11,-0x5f(%rcx)
882L(P7QA):
883	mov    -0x57(%rdx),%r10
884	mov    %r10,-0x57(%rcx)
885L(P7Q9):
886	mov    -0x4f(%rdx),%r9
887	mov    %r9,-0x4f(%rcx)
888L(P7Q8):
889	mov    -0x47(%rdx),%r11
890	mov    %r11,-0x47(%rcx)
891L(P7Q7):
892	mov    -0x3f(%rdx),%r10
893	mov    %r10,-0x3f(%rcx)
894L(P7Q6):
895	mov    -0x37(%rdx),%r9
896	mov    %r9,-0x37(%rcx)
897L(P7Q5):
898	mov    -0x2f(%rdx),%r11
899	mov    %r11,-0x2f(%rcx)
900L(P7Q4):
901	mov    -0x27(%rdx),%r10
902	mov    %r10,-0x27(%rcx)
903L(P7Q3):
904	mov    -0x1f(%rdx),%r9
905	mov    %r9,-0x1f(%rcx)
906L(P7Q2):
907	mov    -0x17(%rdx),%r11
908	mov    %r11,-0x17(%rcx)
909L(P7Q1):
910	mov    -0xf(%rdx),%r10
911	mov    %r10,-0xf(%rcx)
912	/*
913	 * These trailing loads/stores have to do all their loads 1st,
914	 * then do the stores.
915	 */
916L(P7Q0):
917	mov    -0x7(%rdx),%r9d
918	movzwq -0x3(%rdx),%r10
919	movzbq -0x1(%rdx),%r11
920	mov    %r9d,-0x7(%rcx)
921	mov    %r10w,-0x3(%rcx)
922	mov    %r11b,-0x1(%rcx)
923	ret
924
925	.balign 16
	/*
	 * Large-copy entry, reached from L(CopyForward) when the length is
	 * above 128.  %rcx = dest, %rdx = src, %r8 = remaining bytes.
	 * An unaligned dest is first sent through L(ShrtAlignNew), whose
	 * stubs come back in at L(now_qw_aligned).
	 */
926L(ck_use_sse2):
927	/*
928	 * Align dest to 16 byte boundary.
929	 */
930	test   $0xf,%rcx
931	jnz    L(ShrtAlignNew)
932
933L(now_qw_aligned):
	/* NO_SSE method: use the 8-byte loop, which is outside this view */
934	cmpl   $NO_SSE,.memops_method(%rip)
935	je     L(Loop8byte_pre)
936
937	/*
938	 * The fall-through path is to do SSE2 16-byte load/stores
939	 */
940
941	/*
942	 * If current move size is larger than half of the highest level cache
943	 * size, then do non-temporal moves.
944	 */
	/*
	 * NOTE(review): jg is a signed comparison although %r8 is a byte
	 * count; confirm whether an unsigned branch was intended.
	 */
945	mov    .largest_level_cache_size(%rip),%r9d
946	shr    %r9		# take half of it
947	cmp    %r9,%r8
948	jg     L(sse2_nt_move)
949
950	/*
951	 * If both the source and dest are aligned, then use the both aligned
952	 * logic. Well aligned data should reap the rewards.
953	 */
954	test   $0xf,%rdx
955	jz     L(pre_both_aligned)
956
	/* pick the dispatch table: SSE2 unless the USE_SSSE3 bit is set */
957	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
958	testl  $USE_SSSE3,.memops_method(%rip)
959	jz     1f
960	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
961
9621:
963	/*
964	 * if the src is not 16 byte aligned...
965	 */
	/*
	 * %r11 = src & 0xf selects the loop.  The first 16 bytes are copied
	 * with an unaligned load, then %rdx is moved up to the next 16-byte
	 * boundary (src + 16 - %r11) while %rcx advances by a full 16 and
	 * %r8 drops by 16.  The aligned block at the new %rdx is preloaded
	 * into %xmm1 after that first store; the loops splice it with the
	 * blocks that follow.
	 */
966	mov    %rdx,%r11
967	and    $0xf,%r11
968	movdqu (%rdx),%xmm0
969	movdqa %xmm0,(%rcx)
970	add    $0x10,%rdx
971	sub    %r11,%rdx
972	add    $0x10,%rcx
973	sub    $0x10,%r8
974	movdqa (%rdx),%xmm1
975
	/* table entries are 32-bit offsets relative to the table base */
976	movslq (%r10,%r11,4),%r9
977	lea    (%r9,%r10,1),%r10
978	jmpq   *%r10
979
980	    .balign 16
	/*
	 * Dispatch tables for an unaligned source, indexed by (src & 0xf).
	 * Entries are 32-bit offsets from the start of their own table.
	 * L(SSSE3_src) is used when the USE_SSSE3 bit is set in
	 * .memops_method, L(SSE_src) otherwise.  Slot 8 of both tables goes
	 * to L(movdqa8).  The L(mov3dqaN) targets are outside this view.
	 * Entry order must not be changed.
	 */
981L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
982	    .int        L(mov3dqa1) -L(SSSE3_src)
983	    .int        L(mov3dqa2) -L(SSSE3_src)
984	    .int        L(mov3dqa3) -L(SSSE3_src)
985	    .int        L(mov3dqa4) -L(SSSE3_src)
986	    .int        L(mov3dqa5) -L(SSSE3_src)
987	    .int        L(mov3dqa6) -L(SSSE3_src)
988	    .int        L(mov3dqa7) -L(SSSE3_src)
989	    .int        L(movdqa8)  -L(SSSE3_src)
990	    .int        L(mov3dqa9) -L(SSSE3_src)
991	    .int        L(mov3dqa10)-L(SSSE3_src)
992	    .int        L(mov3dqa11)-L(SSSE3_src)
993	    .int        L(mov3dqa12)-L(SSSE3_src)
994	    .int        L(mov3dqa13)-L(SSSE3_src)
995	    .int        L(mov3dqa14)-L(SSSE3_src)
996	    .int        L(mov3dqa15)-L(SSSE3_src)
997L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)
998	    .int        L(movdqa1) -L(SSE_src)
999	    .int        L(movdqa2) -L(SSE_src)
1000	    .int        L(movdqa3) -L(SSE_src)
1001	    .int        L(movdqa4) -L(SSE_src)
1002	    .int        L(movdqa5) -L(SSE_src)
1003	    .int        L(movdqa6) -L(SSE_src)
1004	    .int        L(movdqa7) -L(SSE_src)
1005	    .int        L(movdqa8) -L(SSE_src)
1006	    .int        L(movdqa9) -L(SSE_src)
1007	    .int        L(movdqa10)-L(SSE_src)
1008	    .int        L(movdqa11)-L(SSE_src)
1009	    .int        L(movdqa12)-L(SSE_src)
1010	    .int        L(movdqa13)-L(SSE_src)
1011	    .int        L(movdqa14)-L(SSE_src)
1012	    .int        L(movdqa15)-L(SSE_src)
1013
1014	.balign 16
	/*
	 * SSE2 loop for a source 1 byte past a 16-byte boundary
	 * (L(SSE_src) slot 1).  %xmm1 carries the previous aligned source
	 * block.  Each pass loads two more aligned blocks, splices
	 * neighbouring blocks with psrldq $1 / pslldq $15 / por, and stores
	 * 32 bytes to the aligned dest.  Both loads are done before either
	 * store.  Pointer and count updates use lea, which leaves the flags
	 * from the cmp intact for the jge at the bottom.  The loop repeats
	 * while at least 0x20 bytes remain in %r8, then hands over to
	 * L(movdqa_epi), which is outside this view.
	 */
1015L(movdqa1):
1016	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
1017	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
1018	lea    0x20(%rdx),%rdx
1019	lea    -0x20(%r8),%r8
1020
1021	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
1022	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
1023	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
1024	por    %xmm1,%xmm3 # OR them together
1025	cmp    $0x20,%r8
1026
1027	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
1028	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
1029	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
1030	por    %xmm2,%xmm0 # OR them together
1031	movdqa %xmm3,(%rcx)     # store it
1032	movdqa %xmm0,0x10(%rcx) # store it
1033	lea    0x20(%rcx),%rcx
1034
1035	jge    L(movdqa1)
1036	jmp    L(movdqa_epi)
1037
1038	.balign 16
	/*
	 * SSE2 loop for a source 2 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 2).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $2 / pslldq $14 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1039L(movdqa2):
1040	sub    $0x20,%r8
1041	movdqa 0x10(%rdx),%xmm3
1042	movdqa 0x20(%rdx),%xmm0
1043	add    $0x20,%rdx
1044
1045	psrldq $0x2,%xmm1
1046	movdqa %xmm3,%xmm2
1047	pslldq $0xe,%xmm3
1048	por    %xmm1,%xmm3
1049
1050	psrldq $0x2,%xmm2
1051	movdqa %xmm0,%xmm1
1052	pslldq $0xe,%xmm0
1053	por    %xmm2,%xmm0
1054	movdqa %xmm3,(%rcx)
1055	movdqa %xmm0,0x10(%rcx)
1056
1057	add    $0x20,%rcx
1058	cmp    $0x20,%r8
1059	jge    L(movdqa2)
1060	jmp    L(movdqa_epi)
1061
1062	.balign 16
	/*
	 * SSE2 loop for a source 3 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 3).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $3 / pslldq $13 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1063L(movdqa3):
1064	sub    $0x20,%r8
1065	movdqa 0x10(%rdx),%xmm3
1066	movdqa 0x20(%rdx),%xmm0
1067	add    $0x20,%rdx
1068
1069	psrldq $0x3,%xmm1
1070	movdqa %xmm3,%xmm2
1071	pslldq $0xd,%xmm3
1072	por    %xmm1,%xmm3
1073
1074	psrldq $0x3,%xmm2
1075	movdqa %xmm0,%xmm1
1076	pslldq $0xd,%xmm0
1077	por    %xmm2,%xmm0
1078	movdqa %xmm3,(%rcx)
1079	movdqa %xmm0,0x10(%rcx)
1080
1081	add    $0x20,%rcx
1082	cmp    $0x20,%r8
1083	jge    L(movdqa3)
1084	jmp    L(movdqa_epi)
1085
1086	.balign 16
	/*
	 * SSE2 loop for a source 4 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 4).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $4 / pslldq $12 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1087L(movdqa4):
1088	sub    $0x20,%r8
1089	movdqa 0x10(%rdx),%xmm3
1090	movdqa 0x20(%rdx),%xmm0
1091	add    $0x20,%rdx
1092
1093	psrldq $0x4,%xmm1
1094	movdqa %xmm3,%xmm2
1095	pslldq $0xc,%xmm3
1096	por    %xmm1,%xmm3
1097
1098	psrldq $0x4,%xmm2
1099	movdqa %xmm0,%xmm1
1100	pslldq $0xc,%xmm0
1101	por    %xmm2,%xmm0
1102
1103	movdqa %xmm3,(%rcx)
1104	movdqa %xmm0,0x10(%rcx)
1105
1106	add    $0x20,%rcx
1107	cmp    $0x20,%r8
1108	jge    L(movdqa4)
1109	jmp    L(movdqa_epi)
1110
1111	.balign 16
	/*
	 * SSE2 loop for a source 5 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 5).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $5 / pslldq $11 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1112L(movdqa5):
1113	sub    $0x20,%r8
1114	movdqa 0x10(%rdx),%xmm3
1115	movdqa 0x20(%rdx),%xmm0
1116	add    $0x20,%rdx
1117
1118	psrldq $0x5,%xmm1
1119	movdqa %xmm3,%xmm2
1120	pslldq $0xb,%xmm3
1121	por    %xmm1,%xmm3
1122
1123	psrldq $0x5,%xmm2
1124	movdqa %xmm0,%xmm1
1125	pslldq $0xb,%xmm0
1126	por    %xmm2,%xmm0
1127
1128	movdqa %xmm3,(%rcx)
1129	movdqa %xmm0,0x10(%rcx)
1130
1131	add    $0x20,%rcx
1132	cmp    $0x20,%r8
1133	jge    L(movdqa5)
1134	jmp    L(movdqa_epi)
1135
1136	.balign 16
	/*
	 * SSE2 loop for a source 6 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 6).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $6 / pslldq $10 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1137L(movdqa6):
1138	sub    $0x20,%r8
1139	movdqa 0x10(%rdx),%xmm3
1140	movdqa 0x20(%rdx),%xmm0
1141	add    $0x20,%rdx
1142
1143	psrldq $0x6,%xmm1
1144	movdqa %xmm3,%xmm2
1145	pslldq $0xa,%xmm3
1146	por    %xmm1,%xmm3
1147
1148	psrldq $0x6,%xmm2
1149	movdqa %xmm0,%xmm1
1150	pslldq $0xa,%xmm0
1151	por    %xmm2,%xmm0
1152	movdqa %xmm3,(%rcx)
1153	movdqa %xmm0,0x10(%rcx)
1154
1155	add    $0x20,%rcx
1156	cmp    $0x20,%r8
1157	jge    L(movdqa6)
1158	jmp    L(movdqa_epi)
1159
1160	.balign 16
	/*
	 * SSE2 loop for a source 7 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 7).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $7 / pslldq $9 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1161L(movdqa7):
1162	sub    $0x20,%r8
1163	movdqa 0x10(%rdx),%xmm3
1164	movdqa 0x20(%rdx),%xmm0
1165	add    $0x20,%rdx
1166
1167	psrldq $0x7,%xmm1
1168	movdqa %xmm3,%xmm2
1169	pslldq $0x9,%xmm3
1170	por    %xmm1,%xmm3
1171
1172	psrldq $0x7,%xmm2
1173	movdqa %xmm0,%xmm1
1174	pslldq $0x9,%xmm0
1175	por    %xmm2,%xmm0
1176	movdqa %xmm3,(%rcx)
1177	movdqa %xmm0,0x10(%rcx)
1178
1179	add    $0x20,%rcx
1180	cmp    $0x20,%r8
1181	jge    L(movdqa7)
1182	jmp    L(movdqa_epi)
1183
1184	.balign 16
	/*
	 * Loop for a source 8 bytes past a 16-byte boundary (slot 8 of both
	 * L(SSE_src) and L(SSSE3_src)).  %xmm1 carries the previous aligned
	 * source block.  Each pass loads three more aligned blocks and
	 * stores 48 bytes to the aligned dest.  shufpd $0x1 puts the high
	 * quadword of the older block (destination operand) in the low half
	 * and the low quadword of the newer block (source operand) in the
	 * high half, which is the splice needed for an 8-byte offset.
	 * All three loads are done before the first store.  The count is
	 * compared early; the SSE instructions and lea that follow leave
	 * the flags intact for the jge.  Repeats while %r8 >= 0x30, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1185L(movdqa8):
1186	movdqa 0x10(%rdx),%xmm3
1187	sub    $0x30,%r8
1188	movdqa 0x20(%rdx),%xmm0
1189	movdqa 0x30(%rdx),%xmm5
1190	lea    0x30(%rdx),%rdx
1191
1192	shufpd $0x1,%xmm3,%xmm1
1193	movdqa %xmm1,(%rcx)
1194
1195	cmp    $0x30,%r8
1196
1197	shufpd $0x1,%xmm0,%xmm3
1198	movdqa %xmm3,0x10(%rcx)
1199
1200	movdqa %xmm5,%xmm1
1201	shufpd $0x1,%xmm5,%xmm0
1202	movdqa %xmm0,0x20(%rcx)
1203
1204	lea    0x30(%rcx),%rcx
1205
1206	jge    L(movdqa8)
1207	jmp    L(movdqa_epi)
1208
1209	.balign 16
	/*
	 * SSE2 loop for a source 9 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 9).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $9 / pslldq $7 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1210L(movdqa9):
1211	sub    $0x20,%r8
1212	movdqa 0x10(%rdx),%xmm3
1213	movdqa 0x20(%rdx),%xmm0
1214	add    $0x20,%rdx
1215
1216	psrldq $0x9,%xmm1
1217	movdqa %xmm3,%xmm2
1218	pslldq $0x7,%xmm3
1219	por    %xmm1,%xmm3
1220
1221	psrldq $0x9,%xmm2
1222	movdqa %xmm0,%xmm1
1223	pslldq $0x7,%xmm0
1224	por    %xmm2,%xmm0
1225	movdqa %xmm3,(%rcx)
1226	movdqa %xmm0,0x10(%rcx)
1227
1228	add    $0x20,%rcx
1229	cmp    $0x20,%r8
1230	jge    L(movdqa9)
1231	jmp    L(movdqa_epi)
1232
1233	.balign 16
	/*
	 * SSE2 loop for a source 10 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 10).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $10 / pslldq $6 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1234L(movdqa10):
1235	sub    $0x20,%r8
1236	movdqa 0x10(%rdx),%xmm3
1237	movdqa 0x20(%rdx),%xmm0
1238	add    $0x20,%rdx
1239
1240	psrldq $0xa,%xmm1
1241	movdqa %xmm3,%xmm2
1242	pslldq $0x6,%xmm3
1243	por    %xmm1,%xmm3
1244
1245	psrldq $0xa,%xmm2
1246	movdqa %xmm0,%xmm1
1247	pslldq $0x6,%xmm0
1248	por    %xmm2,%xmm0
1249	movdqa %xmm3,(%rcx)
1250	movdqa %xmm0,0x10(%rcx)
1251
1252	add    $0x20,%rcx
1253	cmp    $0x20,%r8
1254	jge    L(movdqa10)
1255	jmp    L(movdqa_epi)
1256
1257	.balign 16
	/*
	 * SSE2 loop for a source 11 bytes past a 16-byte boundary
	 * (L(SSE_src) slot 11).  %xmm1 carries the previous aligned source
	 * block; each pass loads two more, splices neighbours with
	 * psrldq $11 / pslldq $5 / por, and stores 32 bytes to the aligned
	 * dest.  Loads precede stores.  Repeats while %r8 >= 0x20, then
	 * continues at L(movdqa_epi) (outside this view).
	 */
1258L(movdqa11):
1259	sub    $0x20,%r8
1260	movdqa 0x10(%rdx),%xmm3
1261	movdqa 0x20(%rdx),%xmm0
1262	add    $0x20,%rdx
1263
1264	psrldq $0xb,%xmm1
1265	movdqa %xmm3,%xmm2
1266	pslldq $0x5,%xmm3
1267	por    %xmm1,%xmm3
1268
1269	psrldq $0xb,%xmm2
1270	movdqa %xmm0,%xmm1
1271	pslldq $0x5,%xmm0
1272	por    %xmm2,%xmm0
1273	movdqa %xmm3,(%rcx)
1274	movdqa %xmm0,0x10(%rcx)
1275
1276	add    $0x20,%rcx
1277	cmp    $0x20,%r8
1278	jge    L(movdqa11)
1279	jmp    L(movdqa_epi)
1280
1281	.balign 16
/*
 * L(movdqa12): forward copy loop, 32 bytes per iteration, using only
 * aligned 16-byte loads and stores.  Each stored chunk is built from two
 * neighbouring source chunks: the older one shifted right by 12 bytes
 * (psrldq $0xc) OR-ed with the newer one shifted left by 4 bytes
 * (pslldq $0x4).
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * Repeats while %r8 >= 0x20 (signed), then hands over to L(movdqa_epi).
 */
1282L(movdqa12):
1283	sub    $0x20,%r8
1284	movdqa 0x10(%rdx),%xmm3
1285	movdqa 0x20(%rdx),%xmm0
1286	add    $0x20,%rdx
1287
1288	psrldq $0xc,%xmm1
1289	movdqa %xmm3,%xmm2	# unshifted copy, consumed by the second output chunk
1290	pslldq $0x4,%xmm3
1291	por    %xmm1,%xmm3
1292
1293	psrldq $0xc,%xmm2
1294	movdqa %xmm0,%xmm1	# unshifted copy, carried into the next iteration
1295	pslldq $0x4,%xmm0
1296	por    %xmm2,%xmm0
1297	movdqa %xmm3,(%rcx)
1298	movdqa %xmm0,0x10(%rcx)
1299
1300	add    $0x20,%rcx
1301	cmp    $0x20,%r8
1302	jge    L(movdqa12)
1303	jmp    L(movdqa_epi)
1304
1305	.balign 16
/*
 * L(movdqa13): forward copy loop, 32 bytes per iteration, using only
 * aligned 16-byte loads and stores.  Each stored chunk is built from two
 * neighbouring source chunks: the older one shifted right by 13 bytes
 * (psrldq $0xd) OR-ed with the newer one shifted left by 3 bytes
 * (pslldq $0x3).
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * Repeats while %r8 >= 0x20 (signed), then hands over to L(movdqa_epi).
 */
1306L(movdqa13):
1307	sub    $0x20,%r8
1308	movdqa 0x10(%rdx),%xmm3
1309	movdqa 0x20(%rdx),%xmm0
1310	add    $0x20,%rdx
1311
1312	psrldq $0xd,%xmm1
1313	movdqa %xmm3,%xmm2	# unshifted copy, consumed by the second output chunk
1314	pslldq $0x3,%xmm3
1315	por    %xmm1,%xmm3
1316
1317	psrldq $0xd,%xmm2
1318	movdqa %xmm0,%xmm1	# unshifted copy, carried into the next iteration
1319	pslldq $0x3,%xmm0
1320	por    %xmm2,%xmm0
1321	movdqa %xmm3,(%rcx)
1322	movdqa %xmm0,0x10(%rcx)
1323
1324	add    $0x20,%rcx
1325	cmp    $0x20,%r8
1326	jge    L(movdqa13)
1327	jmp    L(movdqa_epi)
1328
1329	.balign 16
/*
 * L(movdqa14): forward copy loop, 32 bytes per iteration, using only
 * aligned 16-byte loads and stores.  Each stored chunk is built from two
 * neighbouring source chunks: the older one shifted right by 14 bytes
 * (psrldq $0xe) OR-ed with the newer one shifted left by 2 bytes
 * (pslldq $0x2).
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * Repeats while %r8 >= 0x20 (signed), then hands over to L(movdqa_epi).
 */
1330L(movdqa14):
1331	sub    $0x20,%r8
1332	movdqa 0x10(%rdx),%xmm3
1333	movdqa 0x20(%rdx),%xmm0
1334	add    $0x20,%rdx
1335
1336	psrldq $0xe,%xmm1
1337	movdqa %xmm3,%xmm2	# unshifted copy, consumed by the second output chunk
1338	pslldq $0x2,%xmm3
1339	por    %xmm1,%xmm3
1340
1341	psrldq $0xe,%xmm2
1342	movdqa %xmm0,%xmm1	# unshifted copy, carried into the next iteration
1343	pslldq $0x2,%xmm0
1344	por    %xmm2,%xmm0
1345	movdqa %xmm3,(%rcx)
1346	movdqa %xmm0,0x10(%rcx)
1347
1348	add    $0x20,%rcx
1349	cmp    $0x20,%r8
1350	jge    L(movdqa14)
1351	jmp    L(movdqa_epi)
1352
1353	.balign 16
/*
 * L(movdqa15): forward copy loop, 32 bytes per iteration, using only
 * aligned 16-byte loads and stores.  Each stored chunk is built from two
 * neighbouring source chunks: the older one shifted right by 15 bytes
 * (psrldq $0xf) OR-ed with the newer one shifted left by 1 byte
 * (pslldq $0x1).
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * Repeats while %r8 >= 0x20 (signed).  The closing jmp is commented out:
 * this loop is laid out directly ahead of L(movdqa_epi) and reaches it by
 * falling through the alignment padding.
 */
1354L(movdqa15):
1355	sub    $0x20,%r8
1356	movdqa 0x10(%rdx),%xmm3
1357	movdqa 0x20(%rdx),%xmm0
1358	add    $0x20,%rdx
1359
1360	psrldq $0xf,%xmm1
1361	movdqa %xmm3,%xmm2	# unshifted copy, consumed by the second output chunk
1362	pslldq $0x1,%xmm3
1363	por    %xmm1,%xmm3
1364
1365	psrldq $0xf,%xmm2
1366	movdqa %xmm0,%xmm1	# unshifted copy, carried into the next iteration
1367	pslldq $0x1,%xmm0
1368	por    %xmm2,%xmm0
1369	movdqa %xmm3,(%rcx)
1370	movdqa %xmm0,0x10(%rcx)
1371
1372	add    $0x20,%rcx
1373	cmp    $0x20,%r8
1374	jge    L(movdqa15)
1375	#jmp   L(movdqa_epi)
1376
1377	.balign 16
/*
 * L(movdqa_epi): shared exit of the L(movdqaN) and L(mov3dqaN) loops.
 * Adds %r11 to the source cursor, then moves both cursors forward by the
 * leftover byte count in %r8 and jumps through L(fwdPxQx), a table of
 * 32-bit offsets relative to the table itself, indexed by %r8.
 *
 * The table, its targets and the value placed in %r11 are defined outside
 * this part of the file; presumably the targets copy the final %r8 bytes
 * that end at the advanced cursors -- confirm there.
 */
1378L(movdqa_epi):
1379	lea    L(fwdPxQx)(%rip),%r10
1380	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
1381	add    %r8,%rcx
1382	add    %r8,%rdx
1383
1384	movslq (%r10,%r8,4),%r9	# sign-extended 32-bit table entry
1385	lea    (%r9,%r10,1),%r10	# entry + table base = handler address
1386	jmpq   *%r10
1387
1388	.balign 16
/*
 * L(mov3dqa1): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 1..16 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x1.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1389L(mov3dqa1):
1390	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
1391	sub	$0x30,%r8
1392	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
1393	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
1394	lea	0x30(%rdx),%rdx
1395	cmp	$0x30,%r8
1396
1397	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
1398	#palignr	$0x1,%xmm1,%xmm3
1399	.byte	0x66,0x0f,0x3a,0x0f
1400	.byte	0xd9,0x01
1401	movdqa	%xmm3,(%rcx)      # store it
1402
1403	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
1404	#palignr	$0x1,%xmm2,%xmm0
1405	.byte	0x66,0x0f,0x3a,0x0f
1406	.byte	0xc2,0x01
1407	movdqa	%xmm0,0x10(%rcx)  # store it
1408
1409	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
1410	#palignr	$0x1,%xmm4,%xmm5
1411	.byte	0x66,0x0f,0x3a,0x0f
1412	.byte	0xec,0x01
1413	movdqa	%xmm5,0x20(%rcx)  # store it
1414
1415	lea	0x30(%rcx),%rcx
1416	jge	L(mov3dqa1)
1417
1418	cmp	$0x10,%r8
1419	jl	L(movdqa_epi)
1420	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1421	sub	$0x10,%r8
1422	lea	0x10(%rdx),%rdx
1423	movdqa	%xmm3,%xmm2		# save for use next concat
1424	#palignr	$0x1,%xmm1,%xmm3
1425	.byte	0x66,0x0f,0x3a,0x0f
1426	.byte	0xd9,0x01
1427
1428	cmp	$0x10,%r8
1429	movdqa	%xmm3,(%rcx)      	# store it
1430	lea	0x10(%rcx),%rcx
1431	jl	L(movdqa_epi)
1432
1433	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1434	sub	$0x10,%r8
1435	lea	0x10(%rdx),%rdx
1436	#palignr	$0x1,%xmm2,%xmm0
1437	.byte	0x66,0x0f,0x3a,0x0f
1438	.byte	0xc2,0x01
1439	movdqa	%xmm0,(%rcx)      	# store it
1440	lea	0x10(%rcx),%rcx
1441	jmp	L(movdqa_epi)
1442
1443	.balign 16
/*
 * L(mov3dqa2): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 2..17 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x2.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1444L(mov3dqa2):
1445	movdqa	0x10(%rdx),%xmm3
1446	sub	$0x30,%r8
1447	movdqa	0x20(%rdx),%xmm0
1448	movdqa	0x30(%rdx),%xmm5
1449	lea	0x30(%rdx),%rdx
1450	cmp	$0x30,%r8
1451
1452	movdqa	%xmm3,%xmm2
1453	#palignr	$0x2,%xmm1,%xmm3
1454	.byte	0x66,0x0f,0x3a,0x0f
1455	.byte	0xd9,0x02
1456	movdqa	%xmm3,(%rcx)
1457
1458	movdqa	%xmm0,%xmm4
1459	#palignr	$0x2,%xmm2,%xmm0
1460	.byte	0x66,0x0f,0x3a,0x0f
1461	.byte	0xc2,0x02
1462	movdqa	%xmm0,0x10(%rcx)
1463
1464	movdqa	%xmm5,%xmm1
1465	#palignr	$0x2,%xmm4,%xmm5
1466	.byte	0x66,0x0f,0x3a,0x0f
1467	.byte	0xec,0x02
1468	movdqa	%xmm5,0x20(%rcx)
1469
1470	lea	0x30(%rcx),%rcx
1471	jge	L(mov3dqa2)
1472
1473	cmp	$0x10,%r8
1474	jl	L(movdqa_epi)
1475	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1476	sub	$0x10,%r8
1477	lea	0x10(%rdx),%rdx
1478	movdqa	%xmm3,%xmm2		# save for use next concat
1479	#palignr	$0x2,%xmm1,%xmm3
1480	.byte	0x66,0x0f,0x3a,0x0f
1481	.byte	0xd9,0x02
1482
1483	cmp	$0x10,%r8
1484	movdqa	%xmm3,(%rcx)      	# store it
1485	lea	0x10(%rcx),%rcx
1486	jl	L(movdqa_epi)
1487
1488	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1489	sub	$0x10,%r8
1490	lea	0x10(%rdx),%rdx
1491	#palignr	$0x2,%xmm2,%xmm0
1492	.byte	0x66,0x0f,0x3a,0x0f
1493	.byte	0xc2,0x02
1494	movdqa	%xmm0,(%rcx)      	# store it
1495	lea	0x10(%rcx),%rcx
1496	jmp	L(movdqa_epi)
1497
1498	.balign 16
/*
 * L(mov3dqa3): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 3..18 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x3.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1499L(mov3dqa3):
1500	movdqa	0x10(%rdx),%xmm3
1501	sub	$0x30,%r8
1502	movdqa	0x20(%rdx),%xmm0
1503	movdqa	0x30(%rdx),%xmm5
1504	lea	0x30(%rdx),%rdx
1505	cmp	$0x30,%r8
1506
1507	movdqa	%xmm3,%xmm2
1508	#palignr	$0x3,%xmm1,%xmm3
1509	.byte	0x66,0x0f,0x3a,0x0f
1510	.byte	0xd9,0x03
1511	movdqa	%xmm3,(%rcx)
1512
1513	movdqa	%xmm0,%xmm4
1514	#palignr	$0x3,%xmm2,%xmm0
1515	.byte	0x66,0x0f,0x3a,0x0f
1516	.byte	0xc2,0x03
1517	movdqa	%xmm0,0x10(%rcx)
1518
1519	movdqa	%xmm5,%xmm1
1520	#palignr	$0x3,%xmm4,%xmm5
1521	.byte	0x66,0x0f,0x3a,0x0f
1522	.byte	0xec,0x03
1523	movdqa	%xmm5,0x20(%rcx)
1524
1525	lea	0x30(%rcx),%rcx
1526	jge	L(mov3dqa3)
1527
1528	cmp	$0x10,%r8
1529	jl	L(movdqa_epi)
1530	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1531	sub	$0x10,%r8
1532	lea	0x10(%rdx),%rdx
1533	movdqa	%xmm3,%xmm2		# save for use next concat
1534	#palignr	$0x3,%xmm1,%xmm3
1535	.byte	0x66,0x0f,0x3a,0x0f
1536	.byte	0xd9,0x03
1537
1538	cmp	$0x10,%r8
1539	movdqa	%xmm3,(%rcx)      	# store it
1540	lea	0x10(%rcx),%rcx
1541	jl	L(movdqa_epi)
1542
1543	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1544	sub	$0x10,%r8
1545	lea	0x10(%rdx),%rdx
1546	#palignr	$0x3,%xmm2,%xmm0
1547	.byte	0x66,0x0f,0x3a,0x0f
1548	.byte	0xc2,0x03
1549	movdqa	%xmm0,(%rcx)      	# store it
1550	lea	0x10(%rcx),%rcx
1551	jmp	L(movdqa_epi)
1552
1553	.balign 16
/*
 * L(mov3dqa4): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 4..19 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x4.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1554L(mov3dqa4):
1555	movdqa	0x10(%rdx),%xmm3
1556	sub	$0x30,%r8
1557	movdqa	0x20(%rdx),%xmm0
1558	movdqa	0x30(%rdx),%xmm5
1559	lea	0x30(%rdx),%rdx
1560	cmp	$0x30,%r8
1561
1562	movdqa	%xmm3,%xmm2
1563	#palignr	$0x4,%xmm1,%xmm3
1564	.byte	0x66,0x0f,0x3a,0x0f
1565	.byte	0xd9,0x04
1566	movdqa	%xmm3,(%rcx)
1567
1568	movdqa	%xmm0,%xmm4
1569	#palignr	$0x4,%xmm2,%xmm0
1570	.byte	0x66,0x0f,0x3a,0x0f
1571	.byte	0xc2,0x04
1572	movdqa	%xmm0,0x10(%rcx)
1573
1574	movdqa	%xmm5,%xmm1
1575	#palignr	$0x4,%xmm4,%xmm5
1576	.byte	0x66,0x0f,0x3a,0x0f
1577	.byte	0xec,0x04
1578	movdqa	%xmm5,0x20(%rcx)
1579
1580	lea	0x30(%rcx),%rcx
1581	jge	L(mov3dqa4)
1582
1583	cmp	$0x10,%r8
1584	jl	L(movdqa_epi)
1585	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1586	sub	$0x10,%r8
1587	lea	0x10(%rdx),%rdx
1588	movdqa	%xmm3,%xmm2		# save for use next concat
1589	#palignr	$0x4,%xmm1,%xmm3
1590	.byte	0x66,0x0f,0x3a,0x0f
1591	.byte	0xd9,0x04
1592
1593	cmp	$0x10,%r8
1594	movdqa	%xmm3,(%rcx)      	# store it
1595	lea	0x10(%rcx),%rcx
1596	jl	L(movdqa_epi)
1597
1598	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1599	sub	$0x10,%r8
1600	lea	0x10(%rdx),%rdx
1601	#palignr	$0x4,%xmm2,%xmm0
1602	.byte	0x66,0x0f,0x3a,0x0f
1603	.byte	0xc2,0x04
1604	movdqa	%xmm0,(%rcx)      	# store it
1605	lea	0x10(%rcx),%rcx
1606	jmp	L(movdqa_epi)
1607
1608	.balign 16
/*
 * L(mov3dqa5): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 5..20 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x5.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1609L(mov3dqa5):
1610	movdqa	0x10(%rdx),%xmm3
1611	sub	$0x30,%r8
1612	movdqa	0x20(%rdx),%xmm0
1613	movdqa	0x30(%rdx),%xmm5
1614	lea	0x30(%rdx),%rdx
1615	cmp	$0x30,%r8
1616
1617	movdqa	%xmm3,%xmm2
1618	#palignr	$0x5,%xmm1,%xmm3
1619	.byte	0x66,0x0f,0x3a,0x0f
1620	.byte	0xd9,0x05
1621	movdqa	%xmm3,(%rcx)
1622
1623	movdqa	%xmm0,%xmm4
1624	#palignr	$0x5,%xmm2,%xmm0
1625	.byte	0x66,0x0f,0x3a,0x0f
1626	.byte	0xc2,0x05
1627	movdqa	%xmm0,0x10(%rcx)
1628
1629	movdqa	%xmm5,%xmm1
1630	#palignr	$0x5,%xmm4,%xmm5
1631	.byte	0x66,0x0f,0x3a,0x0f
1632	.byte	0xec,0x05
1633	movdqa	%xmm5,0x20(%rcx)
1634
1635	lea	0x30(%rcx),%rcx
1636	jge	L(mov3dqa5)
1637
1638	cmp	$0x10,%r8
1639	jl	L(movdqa_epi)
1640	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1641	sub	$0x10,%r8
1642	lea	0x10(%rdx),%rdx
1643	movdqa	%xmm3,%xmm2		# save for use next concat
1644	#palignr	$0x5,%xmm1,%xmm3
1645	.byte	0x66,0x0f,0x3a,0x0f
1646	.byte	0xd9,0x05
1647
1648	cmp	$0x10,%r8
1649	movdqa	%xmm3,(%rcx)      	# store it
1650	lea	0x10(%rcx),%rcx
1651	jl	L(movdqa_epi)
1652
1653	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1654	sub	$0x10,%r8
1655	lea	0x10(%rdx),%rdx
1656	#palignr	$0x5,%xmm2,%xmm0
1657	.byte	0x66,0x0f,0x3a,0x0f
1658	.byte	0xc2,0x05
1659	movdqa	%xmm0,(%rcx)      	# store it
1660	lea	0x10(%rcx),%rcx
1661	jmp	L(movdqa_epi)
1662
1663	.balign 16
/*
 * L(mov3dqa6): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 6..21 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x6.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1664L(mov3dqa6):
1665	movdqa	0x10(%rdx),%xmm3
1666	sub	$0x30,%r8
1667	movdqa	0x20(%rdx),%xmm0
1668	movdqa	0x30(%rdx),%xmm5
1669	lea	0x30(%rdx),%rdx
1670	cmp	$0x30,%r8
1671
1672	movdqa	%xmm3,%xmm2
1673	#palignr	$0x6,%xmm1,%xmm3
1674	.byte	0x66,0x0f,0x3a,0x0f
1675	.byte	0xd9,0x06
1676	movdqa	%xmm3,(%rcx)
1677
1678	movdqa	%xmm0,%xmm4
1679	#palignr	$0x6,%xmm2,%xmm0
1680	.byte	0x66,0x0f,0x3a,0x0f
1681	.byte	0xc2,0x06
1682	movdqa	%xmm0,0x10(%rcx)
1683
1684	movdqa	%xmm5,%xmm1
1685	#palignr	$0x6,%xmm4,%xmm5
1686	.byte	0x66,0x0f,0x3a,0x0f
1687	.byte	0xec,0x06
1688	movdqa	%xmm5,0x20(%rcx)
1689
1690	lea	0x30(%rcx),%rcx
1691	jge	L(mov3dqa6)
1692
1693	cmp	$0x10,%r8
1694	jl	L(movdqa_epi)
1695	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1696	sub	$0x10,%r8
1697	lea	0x10(%rdx),%rdx
1698	movdqa	%xmm3,%xmm2		# save for use next concat
1699	#palignr	$0x6,%xmm1,%xmm3
1700	.byte	0x66,0x0f,0x3a,0x0f
1701	.byte	0xd9,0x06
1702
1703	cmp	$0x10,%r8
1704	movdqa	%xmm3,(%rcx)      	# store it
1705	lea	0x10(%rcx),%rcx
1706	jl	L(movdqa_epi)
1707
1708	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1709	sub	$0x10,%r8
1710	lea	0x10(%rdx),%rdx
1711	#palignr	$0x6,%xmm2,%xmm0
1712	.byte	0x66,0x0f,0x3a,0x0f
1713	.byte	0xc2,0x06
1714	movdqa	%xmm0,(%rcx)      	# store it
1715	lea	0x10(%rcx),%rcx
1716	jmp	L(movdqa_epi)
1717
1718	.balign 16
/*
 * L(mov3dqa7): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 7..22 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x7.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1719L(mov3dqa7):
1720	movdqa	0x10(%rdx),%xmm3
1721	sub	$0x30,%r8
1722	movdqa	0x20(%rdx),%xmm0
1723	movdqa	0x30(%rdx),%xmm5
1724	lea	0x30(%rdx),%rdx
1725	cmp	$0x30,%r8
1726
1727	movdqa	%xmm3,%xmm2
1728	#palignr	$0x7,%xmm1,%xmm3
1729	.byte	0x66,0x0f,0x3a,0x0f
1730	.byte	0xd9,0x07
1731	movdqa	%xmm3,(%rcx)
1732
1733	movdqa	%xmm0,%xmm4
1734	#palignr	$0x7,%xmm2,%xmm0
1735	.byte	0x66,0x0f,0x3a,0x0f
1736	.byte	0xc2,0x07
1737	movdqa	%xmm0,0x10(%rcx)
1738
1739	movdqa	%xmm5,%xmm1
1740	#palignr	$0x7,%xmm4,%xmm5
1741	.byte	0x66,0x0f,0x3a,0x0f
1742	.byte	0xec,0x07
1743	movdqa	%xmm5,0x20(%rcx)
1744
1745	lea	0x30(%rcx),%rcx
1746	jge	L(mov3dqa7)
1747
1748	cmp	$0x10,%r8
1749	jl	L(movdqa_epi)
1750	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1751	sub	$0x10,%r8
1752	lea	0x10(%rdx),%rdx
1753	movdqa	%xmm3,%xmm2		# save for use next concat
1754	#palignr	$0x7,%xmm1,%xmm3
1755	.byte	0x66,0x0f,0x3a,0x0f
1756	.byte	0xd9,0x07
1757
1758	cmp	$0x10,%r8
1759	movdqa	%xmm3,(%rcx)      	# store it
1760	lea	0x10(%rcx),%rcx
1761	jl	L(movdqa_epi)
1762
1763	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1764	sub	$0x10,%r8
1765	lea	0x10(%rdx),%rdx
1766	#palignr	$0x7,%xmm2,%xmm0
1767	.byte	0x66,0x0f,0x3a,0x0f
1768	.byte	0xc2,0x07
1769	movdqa	%xmm0,(%rcx)      	# store it
1770	lea	0x10(%rcx),%rcx
1771	jmp	L(movdqa_epi)
1772
1773	.balign 16
/*
 * L(mov3dqa9): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 9..24 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0x9.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1774L(mov3dqa9):
1775	movdqa	0x10(%rdx),%xmm3
1776	sub	$0x30,%r8
1777	movdqa	0x20(%rdx),%xmm0
1778	movdqa	0x30(%rdx),%xmm5
1779	lea	0x30(%rdx),%rdx
1780	cmp	$0x30,%r8
1781
1782	movdqa	%xmm3,%xmm2
1783	#palignr	$0x9,%xmm1,%xmm3
1784	.byte	0x66,0x0f,0x3a,0x0f
1785	.byte	0xd9,0x09
1786	movdqa	%xmm3,(%rcx)
1787
1788	movdqa	%xmm0,%xmm4
1789	#palignr	$0x9,%xmm2,%xmm0
1790	.byte	0x66,0x0f,0x3a,0x0f
1791	.byte	0xc2,0x09
1792	movdqa	%xmm0,0x10(%rcx)
1793
1794	movdqa	%xmm5,%xmm1
1795	#palignr	$0x9,%xmm4,%xmm5
1796	.byte	0x66,0x0f,0x3a,0x0f
1797	.byte	0xec,0x09
1798	movdqa	%xmm5,0x20(%rcx)
1799
1800	lea	0x30(%rcx),%rcx
1801	jge	L(mov3dqa9)
1802
1803	cmp	$0x10,%r8
1804	jl	L(movdqa_epi)
1805	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1806	sub	$0x10,%r8
1807	lea	0x10(%rdx),%rdx
1808	movdqa	%xmm3,%xmm2		# save for use next concat
1809	#palignr	$0x9,%xmm1,%xmm3
1810	.byte	0x66,0x0f,0x3a,0x0f
1811	.byte	0xd9,0x09
1812
1813	cmp	$0x10,%r8
1814	movdqa	%xmm3,(%rcx)      	# store it
1815	lea	0x10(%rcx),%rcx
1816	jl	L(movdqa_epi)
1817
1818	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1819	sub	$0x10,%r8
1820	lea	0x10(%rdx),%rdx
1821	#palignr	$0x9,%xmm2,%xmm0
1822	.byte	0x66,0x0f,0x3a,0x0f
1823	.byte	0xc2,0x09
1824	movdqa	%xmm0,(%rcx)      	# store it
1825	lea	0x10(%rcx),%rcx
1826	jmp	L(movdqa_epi)
1827
1828	.balign 16
/*
 * L(mov3dqa10): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 10..25 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0xa.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1829L(mov3dqa10):
1830	movdqa	0x10(%rdx),%xmm3
1831	sub	$0x30,%r8
1832	movdqa	0x20(%rdx),%xmm0
1833	movdqa	0x30(%rdx),%xmm5
1834	lea	0x30(%rdx),%rdx
1835	cmp	$0x30,%r8
1836
1837	movdqa	%xmm3,%xmm2
1838	#palignr	$0xa,%xmm1,%xmm3
1839	.byte	0x66,0x0f,0x3a,0x0f
1840	.byte	0xd9,0x0a
1841	movdqa	%xmm3,(%rcx)
1842
1843	movdqa	%xmm0,%xmm4
1844	#palignr	$0xa,%xmm2,%xmm0
1845	.byte	0x66,0x0f,0x3a,0x0f
1846	.byte	0xc2,0x0a
1847	movdqa	%xmm0,0x10(%rcx)
1848
1849	movdqa	%xmm5,%xmm1
1850	#palignr	$0xa,%xmm4,%xmm5
1851	.byte	0x66,0x0f,0x3a,0x0f
1852	.byte	0xec,0x0a
1853	movdqa	%xmm5,0x20(%rcx)
1854
1855	lea	0x30(%rcx),%rcx
1856	jge	L(mov3dqa10)
1857
1858	cmp	$0x10,%r8
1859	jl	L(movdqa_epi)
1860	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1861	sub	$0x10,%r8
1862	lea	0x10(%rdx),%rdx
1863	movdqa	%xmm3,%xmm2		# save for use next concat
1864	#palignr	$0xa,%xmm1,%xmm3
1865	.byte	0x66,0x0f,0x3a,0x0f
1866	.byte	0xd9,0x0a
1867
1868	cmp	$0x10,%r8
1869	movdqa	%xmm3,(%rcx)      	# store it
1870	lea	0x10(%rcx),%rcx
1871	jl	L(movdqa_epi)
1872
1873	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1874	sub	$0x10,%r8
1875	lea	0x10(%rdx),%rdx
1876	#palignr	$0xa,%xmm2,%xmm0
1877	.byte	0x66,0x0f,0x3a,0x0f
1878	.byte	0xc2,0x0a
1879	movdqa	%xmm0,(%rcx)      	# store it
1880	lea	0x10(%rcx),%rcx
1881	jmp	L(movdqa_epi)
1882
1883	.balign 16
/*
 * L(mov3dqa11): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 11..26 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0xb.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1884L(mov3dqa11):
1885	movdqa	0x10(%rdx),%xmm3
1886	sub	$0x30,%r8
1887	movdqa	0x20(%rdx),%xmm0
1888	movdqa	0x30(%rdx),%xmm5
1889	lea	0x30(%rdx),%rdx
1890	cmp	$0x30,%r8
1891
1892	movdqa	%xmm3,%xmm2
1893	#palignr	$0xb,%xmm1,%xmm3
1894	.byte	0x66,0x0f,0x3a,0x0f
1895	.byte	0xd9,0x0b
1896	movdqa	%xmm3,(%rcx)
1897
1898	movdqa	%xmm0,%xmm4
1899	#palignr	$0xb,%xmm2,%xmm0
1900	.byte	0x66,0x0f,0x3a,0x0f
1901	.byte	0xc2,0x0b
1902	movdqa	%xmm0,0x10(%rcx)
1903
1904	movdqa	%xmm5,%xmm1
1905	#palignr	$0xb,%xmm4,%xmm5
1906	.byte	0x66,0x0f,0x3a,0x0f
1907	.byte	0xec,0x0b
1908	movdqa	%xmm5,0x20(%rcx)
1909
1910	lea	0x30(%rcx),%rcx
1911	jge	L(mov3dqa11)
1912
1913	cmp	$0x10,%r8
1914	jl	L(movdqa_epi)
1915	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1916	sub	$0x10,%r8
1917	lea	0x10(%rdx),%rdx
1918	movdqa	%xmm3,%xmm2		# save for use next concat
1919	#palignr	$0xb,%xmm1,%xmm3
1920	.byte	0x66,0x0f,0x3a,0x0f
1921	.byte	0xd9,0x0b
1922
1923	cmp	$0x10,%r8
1924	movdqa	%xmm3,(%rcx)      	# store it
1925	lea	0x10(%rcx),%rcx
1926	jl	L(movdqa_epi)
1927
1928	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1929	sub	$0x10,%r8
1930	lea	0x10(%rdx),%rdx
1931	#palignr	$0xb,%xmm2,%xmm0
1932	.byte	0x66,0x0f,0x3a,0x0f
1933	.byte	0xc2,0x0b
1934	movdqa	%xmm0,(%rcx)      	# store it
1935	lea	0x10(%rcx),%rcx
1936	jmp	L(movdqa_epi)
1937
1938	.balign 16
/*
 * L(mov3dqa12): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 12..27 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0xc.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1939L(mov3dqa12):
1940	movdqa	0x10(%rdx),%xmm3
1941	sub	$0x30,%r8
1942	movdqa	0x20(%rdx),%xmm0
1943	movdqa	0x30(%rdx),%xmm5
1944	lea	0x30(%rdx),%rdx
1945	cmp	$0x30,%r8
1946
1947	movdqa	%xmm3,%xmm2
1948	#palignr	$0xc,%xmm1,%xmm3
1949	.byte	0x66,0x0f,0x3a,0x0f
1950	.byte	0xd9,0x0c
1951	movdqa	%xmm3,(%rcx)
1952
1953	movdqa	%xmm0,%xmm4
1954	#palignr	$0xc,%xmm2,%xmm0
1955	.byte	0x66,0x0f,0x3a,0x0f
1956	.byte	0xc2,0x0c
1957	movdqa	%xmm0,0x10(%rcx)
1958
1959	movdqa	%xmm5,%xmm1
1960	#palignr	$0xc,%xmm4,%xmm5
1961	.byte	0x66,0x0f,0x3a,0x0f
1962	.byte	0xec,0x0c
1963	movdqa	%xmm5,0x20(%rcx)
1964
1965	lea	0x30(%rcx),%rcx
1966	jge	L(mov3dqa12)
1967
1968	cmp	$0x10,%r8
1969	jl	L(movdqa_epi)
1970	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1971	sub	$0x10,%r8
1972	lea	0x10(%rdx),%rdx
1973	movdqa	%xmm3,%xmm2		# save for use next concat
1974	#palignr	$0xc,%xmm1,%xmm3
1975	.byte	0x66,0x0f,0x3a,0x0f
1976	.byte	0xd9,0x0c
1977
1978	cmp	$0x10,%r8
1979	movdqa	%xmm3,(%rcx)      	# store it
1980	lea	0x10(%rcx),%rcx
1981	jl	L(movdqa_epi)
1982
1983	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1984	sub	$0x10,%r8
1985	lea	0x10(%rdx),%rdx
1986	#palignr	$0xc,%xmm2,%xmm0
1987	.byte	0x66,0x0f,0x3a,0x0f
1988	.byte	0xc2,0x0c
1989	movdqa	%xmm0,(%rcx)      	# store it
1990	lea	0x10(%rcx),%rcx
1991	jmp	L(movdqa_epi)
1992
1993	.balign 16
/*
 * L(mov3dqa13): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 13..28 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0xd.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
1994L(mov3dqa13):
1995	movdqa	0x10(%rdx),%xmm3
1996	sub	$0x30,%r8
1997	movdqa	0x20(%rdx),%xmm0
1998	movdqa	0x30(%rdx),%xmm5
1999	lea	0x30(%rdx),%rdx
2000	cmp	$0x30,%r8
2001
2002	movdqa	%xmm3,%xmm2
2003	#palignr	$0xd,%xmm1,%xmm3
2004	.byte	0x66,0x0f,0x3a,0x0f
2005	.byte	0xd9,0x0d
2006	movdqa	%xmm3,(%rcx)
2007
2008	movdqa	%xmm0,%xmm4
2009	#palignr	$0xd,%xmm2,%xmm0
2010	.byte	0x66,0x0f,0x3a,0x0f
2011	.byte	0xc2,0x0d
2012	movdqa	%xmm0,0x10(%rcx)
2013
2014	movdqa	%xmm5,%xmm1
2015	#palignr	$0xd,%xmm4,%xmm5
2016	.byte	0x66,0x0f,0x3a,0x0f
2017	.byte	0xec,0x0d
2018	movdqa	%xmm5,0x20(%rcx)
2019
2020	lea	0x30(%rcx),%rcx
2021	jge	L(mov3dqa13)
2022
2023	cmp	$0x10,%r8
2024	jl	L(movdqa_epi)
2025	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2026	sub	$0x10,%r8
2027	lea	0x10(%rdx),%rdx
2028	movdqa	%xmm3,%xmm2		# save for use next concat
2029	#palignr	$0xd,%xmm1,%xmm3
2030	.byte	0x66,0x0f,0x3a,0x0f
2031	.byte	0xd9,0x0d
2032
2033	cmp	$0x10,%r8
2034	movdqa	%xmm3,(%rcx)      	# store it
2035	lea	0x10(%rcx),%rcx
2036	jl	L(movdqa_epi)
2037
2038	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2039	sub	$0x10,%r8
2040	lea	0x10(%rdx),%rdx
2041	#palignr	$0xd,%xmm2,%xmm0
2042	.byte	0x66,0x0f,0x3a,0x0f
2043	.byte	0xc2,0x0d
2044	movdqa	%xmm0,(%rcx)      	# store it
2045	lea	0x10(%rcx),%rcx
2046	jmp	L(movdqa_epi)
2047
2048	.balign 16
/*
 * L(mov3dqa14): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 14..29 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0xe.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
2049L(mov3dqa14):
2050	movdqa	0x10(%rdx),%xmm3
2051	sub	$0x30,%r8
2052	movdqa	0x20(%rdx),%xmm0
2053	movdqa	0x30(%rdx),%xmm5
2054	lea	0x30(%rdx),%rdx
2055	cmp	$0x30,%r8
2056
2057	movdqa	%xmm3,%xmm2
2058	#palignr	$0xe,%xmm1,%xmm3
2059	.byte	0x66,0x0f,0x3a,0x0f
2060	.byte	0xd9,0x0e
2061	movdqa	%xmm3,(%rcx)
2062
2063	movdqa	%xmm0,%xmm4
2064	#palignr	$0xe,%xmm2,%xmm0
2065	.byte	0x66,0x0f,0x3a,0x0f
2066	.byte	0xc2,0x0e
2067	movdqa	%xmm0,0x10(%rcx)
2068
2069	movdqa	%xmm5,%xmm1
2070	#palignr	$0xe,%xmm4,%xmm5
2071	.byte	0x66,0x0f,0x3a,0x0f
2072	.byte	0xec,0x0e
2073	movdqa	%xmm5,0x20(%rcx)
2074
2075	lea	0x30(%rcx),%rcx
2076	jge	L(mov3dqa14)
2077
2078	cmp	$0x10,%r8
2079	jl	L(movdqa_epi)
2080	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2081	sub	$0x10,%r8
2082	lea	0x10(%rdx),%rdx
2083	movdqa	%xmm3,%xmm2		# save for use next concat
2084	#palignr	$0xe,%xmm1,%xmm3
2085	.byte	0x66,0x0f,0x3a,0x0f
2086	.byte	0xd9,0x0e
2087
2088	cmp	$0x10,%r8
2089	movdqa	%xmm3,(%rcx)      	# store it
2090	lea	0x10(%rcx),%rcx
2091	jl	L(movdqa_epi)
2092
2093	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2094	sub	$0x10,%r8
2095	lea	0x10(%rdx),%rdx
2096	#palignr	$0xe,%xmm2,%xmm0
2097	.byte	0x66,0x0f,0x3a,0x0f
2098	.byte	0xc2,0x0e
2099	movdqa	%xmm0,(%rcx)      	# store it
2100	lea	0x10(%rcx),%rcx
2101	jmp	L(movdqa_epi)
2102
2103	.balign 16
/*
 * L(mov3dqa15): forward copy loop, 48 bytes per iteration, using aligned
 * 16-byte loads and stores.  Every stored chunk is bytes 15..30 of the
 * 32-byte pair {older chunk, newer chunk}, extracted with palignr $0xf.
 * palignr is emitted as raw .byte opcodes; the commented-out mnemonic
 * above each sequence names the instruction that is encoded.
 *
 * %rdx = aligned source cursor, %rcx = destination cursor, %r8 = bytes
 * still to copy.  %xmm1 holds the older chunk at the top of every
 * iteration; its first value is presumably loaded by the dispatching
 * code, which is outside this part of the file -- confirm there.
 *
 * The cmp against 0x30 is issued early; the movdqa/palignr/lea
 * instructions after it do not write the flags, so the jge at the loop
 * bottom still tests that compare.  Once fewer than 0x30 bytes remain, up
 * to two more 16-byte chunks are copied while %r8 >= 0x10, and the rest
 * is left to L(movdqa_epi).
 */
2104L(mov3dqa15):
2105	movdqa	0x10(%rdx),%xmm3
2106	sub	$0x30,%r8
2107	movdqa	0x20(%rdx),%xmm0
2108	movdqa	0x30(%rdx),%xmm5
2109	lea	0x30(%rdx),%rdx
2110	cmp	$0x30,%r8
2111
2112	movdqa	%xmm3,%xmm2
2113	#palignr	$0xf,%xmm1,%xmm3
2114	.byte	0x66,0x0f,0x3a,0x0f
2115	.byte	0xd9,0x0f
2116	movdqa	%xmm3,(%rcx)
2117
2118	movdqa	%xmm0,%xmm4
2119	#palignr	$0xf,%xmm2,%xmm0
2120	.byte	0x66,0x0f,0x3a,0x0f
2121	.byte	0xc2,0x0f
2122	movdqa	%xmm0,0x10(%rcx)
2123
2124	movdqa	%xmm5,%xmm1
2125	#palignr	$0xf,%xmm4,%xmm5
2126	.byte	0x66,0x0f,0x3a,0x0f
2127	.byte	0xec,0x0f
2128	movdqa	%xmm5,0x20(%rcx)
2129
2130	lea	0x30(%rcx),%rcx
2131	jge	L(mov3dqa15)
2132
2133	cmp	$0x10,%r8
2134	jl	L(movdqa_epi)
2135	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2136	sub	$0x10,%r8
2137	lea	0x10(%rdx),%rdx
2138	movdqa	%xmm3,%xmm2		# save for use next concat
2139	#palignr	$0xf,%xmm1,%xmm3
2140	.byte	0x66,0x0f,0x3a,0x0f
2141	.byte	0xd9,0x0f
2142
2143	cmp	$0x10,%r8
2144	movdqa	%xmm3,(%rcx)      	# store it
2145	lea	0x10(%rcx),%rcx
2146	jl	L(movdqa_epi)
2147
2148	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2149	sub	$0x10,%r8
2150	lea	0x10(%rdx),%rdx
2151	#palignr	$0xf,%xmm2,%xmm0
2152	.byte	0x66,0x0f,0x3a,0x0f
2153	.byte	0xc2,0x0f
2154	movdqa	%xmm0,(%rcx)      	# store it
2155	lea	0x10(%rcx),%rcx
2156	jmp	L(movdqa_epi)
2157
2158	.balign 16
/*
 * L(sse2_nt_move): forward copy loop, 64 bytes per iteration, using
 * unaligned 16-byte loads (movdqu) and non-temporal 16-byte stores
 * (movntdq).  Both cursors are advanced at the top of the iteration, so
 * the body addresses the data with negative displacements.
 *
 * %rdx = source cursor, %rcx = destination cursor (movntdq requires it
 * to be 16-byte aligned), %r8 = bytes still to copy.
 *
 * The cmp against 0x40 is issued mid-loop; the movdqu/movntdq
 * instructions after it do not write the flags, so the jge at the bottom
 * still tests that compare.
 *
 * After the loop %r9 = %r8 rounded down to a multiple of 16: both cursors
 * are advanced by that amount, %r8 is reduced to the sub-16 remainder and
 * %r9 >> 4 (the number of whole 16-byte chunks left) selects the entry
 * point from L(Fix16EndTable).
 */
2159L(sse2_nt_move):
2160	lea	0x40(%rcx),%rcx
2161	lea	0x40(%rdx),%rdx
2162	lea	-0x40(%r8),%r8
2163
2164	/*
2165	 * doesn't matter if source is aligned for stuff out of cache.
2166	 * the mis-aligned penalty is masked by the slowness of main memory.
2167	 */
2168	prefetchnta 0x180(%rdx)
2169	movdqu	-0x40(%rdx),%xmm0
2170	movdqu	-0x30(%rdx),%xmm1
2171
2172	cmp	$0x40,%r8
2173	movntdq	%xmm0,-0x40(%rcx)
2174	movntdq	%xmm1,-0x30(%rcx)
2175
2176	movdqu	-0x20(%rdx),%xmm2
2177	movdqu	-0x10(%rdx),%xmm3
2178
2179	movntdq	%xmm2,-0x20(%rcx)
2180	movntdq	%xmm3,-0x10(%rcx)
2181
2182	jge	L(sse2_nt_move)
2183
2184	lea	L(Fix16EndTable)(%rip),%r10
2185	mov	%r8,%r9
2186	and	$0xFFFFFFFFFFFFFFF0,%r9	# whole 16-byte chunks, in bytes
2187	add	%r9,%rcx
2188	add	%r9,%rdx
2189	sub	%r9,%r8			# remainder below 16
2190	shr	$0x4,%r9		# chunk count = table index
2191	sfence				# fence after the non-temporal stores above
2192
2193	movslq	(%r10,%r9,4),%r11
2194	lea	(%r11,%r10,1),%r10
2195	jmpq	*%r10
2196
2197	.balign 16
/*
 * L(Fix16EndTable): jump table used at the end of L(sse2_nt_move).
 * Entry k is the 32-bit offset, relative to the start of this table, of
 * L(fix16_k), the entry point that copies k trailing 16-byte chunks.
 */
2198L(Fix16EndTable):
2199	.int    L(fix16_0)-L(Fix16EndTable)
2200	.int    L(fix16_1)-L(Fix16EndTable)
2201	.int    L(fix16_2)-L(Fix16EndTable)
2202	.int    L(fix16_3)-L(Fix16EndTable)
2203
2204	.balign 16
/*
 * L(fix16_3) .. L(fix16_0): fall-through ladder reached through
 * L(Fix16EndTable).  Entering at L(fix16_k) copies the k 16-byte chunks
 * that end at the cursors, using unaligned loads (movdqu) and aligned
 * stores (movdqa, so %rcx must be 16-byte aligned here).
 *
 * L(fix16_0) then advances both cursors by the remaining byte count in
 * %r8 and jumps through L(fwdPxQx), a table of 32-bit table-relative
 * offsets indexed by %r8.  That table and its targets are defined outside
 * this part of the file; presumably they copy the last %r8 bytes ending
 * at the advanced cursors -- confirm there.
 */
2205L(fix16_3):
2206	movdqu -0x30(%rdx),%xmm1
2207	movdqa %xmm1,-0x30(%rcx)
2208L(fix16_2):
2209	movdqu -0x20(%rdx),%xmm2
2210	movdqa %xmm2,-0x20(%rcx)
2211L(fix16_1):
2212	movdqu -0x10(%rdx),%xmm3
2213	movdqa %xmm3,-0x10(%rcx)
2214L(fix16_0):
2215	lea    L(fwdPxQx)(%rip),%r10
2216	add    %r8,%rdx
2217	add    %r8,%rcx
2218
2219	movslq (%r10,%r8,4),%r9
2220	lea    (%r9,%r10,1),%r10
2221	jmpq   *%r10
2222
2223	.balign 16
/*
 * L(pre_both_aligned): entry check in front of L(both_aligned).  With
 * fewer than 0x80 bytes in %r8 (signed compare) the 128-byte loop is
 * skipped and control goes straight to L(fix_16b); otherwise execution
 * falls through into L(both_aligned).
 */
2224L(pre_both_aligned):
2225	cmp    $0x80,%r8
2226	jl     L(fix_16b)
2227
2228	.balign 16
/*
 * L(both_aligned): forward copy loop, 128 bytes per iteration.  Loads and
 * stores are all movdqa, so both %rdx (source) and %rcx (destination)
 * must be 16-byte aligned.  %r8 = bytes still to copy.
 *
 * %r8 is decremented with lea and the cmp against 0x80 is issued
 * mid-loop; the movdqa/lea instructions after the cmp do not write the
 * flags, so the jge at the bottom still tests that compare.  When fewer
 * than 0x80 bytes remain, execution falls through into L(fix_16b).
 */
2229L(both_aligned):
2230
2231	/*
2232	 * this 'paired' load/load/store/store seems to do best.
2233	 */
2234	movdqa (%rdx),%xmm0
2235	movdqa 0x10(%rdx),%xmm1
2236
2237	movdqa %xmm0,(%rcx)
2238	movdqa %xmm1,0x10(%rcx)
2239	lea    -0x80(%r8),%r8
2240
2241	movdqa 0x20(%rdx),%xmm2
2242	movdqa 0x30(%rdx),%xmm3
2243
2244	movdqa %xmm2,0x20(%rcx)
2245	movdqa %xmm3,0x30(%rcx)
2246
2247	movdqa 0x40(%rdx),%xmm0
2248	movdqa 0x50(%rdx),%xmm1
2249	cmp    $0x80,%r8
2250
2251	movdqa %xmm0,0x40(%rcx)
2252	movdqa %xmm1,0x50(%rcx)
2253
2254	movdqa 0x60(%rdx),%xmm2
2255	movdqa 0x70(%rdx),%xmm3
2256	lea    0x80(%rdx),%rdx
2257	movdqa %xmm2,0x60(%rcx)
2258	movdqa %xmm3,0x70(%rcx)
2259	lea    0x80(%rcx),%rcx
2260	jge    L(both_aligned)
2261
/*
 * L(fix_16b): tail dispatch for L(pre_both_aligned) / L(both_aligned).
 * Advances both cursors by the remaining byte count in %r8 (below 0x80 on
 * both paths that reach here) and jumps through L(fwdPxQx), a table of
 * 32-bit table-relative offsets indexed by %r8.  The table and its
 * targets are defined outside this part of the file; presumably they copy
 * the last %r8 bytes ending at the advanced cursors -- confirm there.
 */
2262L(fix_16b):
2263	add    %r8,%rcx
2264	lea    L(fwdPxQx)(%rip),%r10
2265	add    %r8,%rdx
2266
2267	movslq (%r10,%r8,4),%r9
2268	lea    (%r9,%r10,1),%r10
2269	jmpq   *%r10
2270
2271	.balign 16
/*
 * L(Loop8byte_pre): picks one of the three 8-byte copy strategies from
 * the byte count in %r8 (all compares are signed):
 *
 *   %r8 >  .largest_level_cache_size / 2       -> L(byte8_nt_top)
 *   %r8 <= 4096                                -> L(byte8_top)
 *   %r8 <= .amd64cache1half                    -> L(use_rep)
 *   otherwise                                  -> falls into L(byte8_top)
 *
 * Both cache variables are read as 32-bit values into %r9d, which
 * zero-extends them into %r9.
 */
2272L(Loop8byte_pre):
2273	# Use 8-byte moves
2274	mov    .largest_level_cache_size(%rip),%r9d
2275	shr    %r9		# take half of it
2276	cmp    %r9,%r8
2277	jg     L(byte8_nt_top)
2278	# Find out whether to use rep movsq
2279	cmp    $4096,%r8
2280	jle    L(byte8_top)
2281	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2282	cmp    %r9,%r8
2283	jle    L(use_rep)
2284
2285	.balign     16
/*
 * L(byte8_top): forward copy loop, 64 bytes per iteration, as eight
 * 8-byte loads and stores through %r9, %r10 and %r11.
 *
 * %rdx = source cursor, %rcx = destination cursor, %r8 = bytes still to
 * copy.  %r8 is decremented with lea and the cmp against 0x40 is issued
 * mid-loop; the mov/lea instructions after it do not write the flags, so
 * the jg at the bottom still tests that compare.  The loop repeats only
 * while %r8 > 0x40 (strictly greater), so up to 0x40 bytes can be left
 * for L(byte8_end), which follows directly.
 */
2286L(byte8_top):
2287	mov    (%rdx),%r9
2288	mov    0x8(%rdx),%r10
2289	lea    -0x40(%r8),%r8
2290	mov    %r9,(%rcx)
2291	mov    %r10,0x8(%rcx)
2292	mov    0x10(%rdx),%r11
2293	mov    0x18(%rdx),%r9
2294	mov    %r11,0x10(%rcx)
2295	mov    %r9,0x18(%rcx)
2296
2297	cmp    $0x40,%r8
2298	mov    0x20(%rdx),%r10
2299	mov    0x28(%rdx),%r11
2300	mov    %r10,0x20(%rcx)
2301	mov    %r11,0x28(%rcx)
2302	mov    0x30(%rdx),%r9
2303	mov    0x38(%rdx),%r10
2304	lea    0x40(%rdx),%rdx
2305	mov    %r9,0x30(%rcx)
2306	mov    %r10,0x38(%rcx)
2307	lea    0x40(%rcx),%rcx
2308	jg     L(byte8_top)
2309
/*
 * L(byte8_end): tail dispatch shared by L(byte8_top), L(use_rep) and
 * L(byte8_nt_top).  Advances both cursors by the remaining byte count in
 * %r8 and jumps through L(fwdPxQx), a table of 32-bit table-relative
 * offsets indexed by %r8.  The table and its targets are defined outside
 * this part of the file; presumably they copy the last %r8 bytes ending
 * at the advanced cursors -- confirm there.
 */
2310L(byte8_end):
2311	lea    L(fwdPxQx)(%rip),%r10
2312	lea    (%rdx,%r8,1),%rdx
2313	lea    (%rcx,%r8,1),%rcx
2314
2315	movslq (%r10,%r8,4),%r9
2316	lea    (%r9,%r10,1),%r10
2317	jmpq   *%r10
2318
2319	.balign	16
/*
 * L(use_rep): forward copy with rep movsq.  Moves the cursors into the
 * registers the string instruction uses (%rsi, %rdi, %rcx), copies
 * %r8 / 8 quadwords, then moves the advanced pointers back into
 * %rdx / %rcx.  The %r8 & 7 leftover bytes, if any, are handled by
 * L(byte8_end); with no leftover the routine returns directly.
 *
 * Nothing here sets the direction flag; it is assumed to be clear on
 * entry, as the calling convention requires.  %rax is not written here;
 * the return value is presumably set up before this point -- confirm at
 * the function entry, which is outside this part of the file.
 */
2320L(use_rep):
2321	mov    %rdx,%rsi		# %rsi = source
2322	mov    %rcx,%rdi		# %rdi = destination
2323	mov    %r8,%rcx			# %rcx = count
2324	shrq   $3,%rcx			# 8-byte word count
2325	rep
2326	  movsq
2327	mov    %rsi,%rdx		# source
2328	mov    %rdi,%rcx		# destination
2329	andq   $7,%r8			# remainder
2330	jnz    L(byte8_end)
2331	ret
2332
2333	.balign 16
/*
 * L(byte8_nt_top): forward copy loop, 64 bytes per iteration, as eight
 * 8-byte loads each followed by a non-temporal 8-byte store (movnti).
 * The source is prefetched 0x180 bytes ahead with prefetchnta.
 *
 * %rdx = source cursor, %rcx = destination cursor, %r8 = bytes still to
 * copy.  Repeats while %r8 >= 0x40 (signed); an sfence then follows the
 * non-temporal stores before the tail is handed to L(byte8_end).
 */
2334L(byte8_nt_top):
2335	sub    $0x40,%r8
2336	prefetchnta 0x180(%rdx)
2337	mov    (%rdx),%r9
2338	movnti %r9,(%rcx)
2339	mov    0x8(%rdx),%r10
2340	movnti %r10,0x8(%rcx)
2341	mov    0x10(%rdx),%r11
2342	movnti %r11,0x10(%rcx)
2343	mov    0x18(%rdx),%r9
2344	movnti %r9,0x18(%rcx)
2345	mov    0x20(%rdx),%r10
2346	movnti %r10,0x20(%rcx)
2347	mov    0x28(%rdx),%r11
2348	movnti %r11,0x28(%rcx)
2349	mov    0x30(%rdx),%r9
2350	movnti %r9,0x30(%rcx)
2351	mov    0x38(%rdx),%r10
2352	movnti %r10,0x38(%rcx)
2353
2354	lea    0x40(%rdx),%rdx
2355	lea    0x40(%rcx),%rcx
2356	cmp    $0x40,%r8
2357	jge    L(byte8_nt_top)
2358	sfence
2359	jmp    L(byte8_end)
2360
2361	SET_SIZE(memcpy)
2362
2363	.balign 16
/*
 * L(CopyBackwards): start of the high-to-low copy path.  Rearranges the
 * incoming registers into the layout used by the rest of this path:
 *
 *   %r8  = byte count        (from %rdx)
 *   %rcx = destination       (from %rdi)
 *   %rdx = source            (from %rsi)
 *   %rax = destination, kept as the return value
 *
 * Both cursors are then moved to one byte past the end of their buffers.
 * The test of the low three bits of the destination end sets the flags;
 * the lea after it does not write them, so the jne still acts on that
 * test.  An end address that is not 8-byte aligned goes to L(bk_align);
 * otherwise execution falls through into L(bk_qw_aligned).
 */
2364L(CopyBackwards):
2365	mov    %rdx,%r8
2366	mov    %rdi,%rcx
2367	mov    %rsi,%rdx
2368	mov    %rdi,%rax		# return value
2369
2370	# ck alignment of last byte
2371	lea    (%rcx,%r8,1),%rcx
2372	test   $0x7,%rcx
2373	lea    (%rdx,%r8,1),%rdx
2374	jne    L(bk_align)
2375
/*
 * L(bk_qw_aligned): size dispatch for the backward copy.  %rcx / %rdx
 * point one past the end of the bytes still to copy and %r8 holds their
 * count.
 *
 * More than 0x90 (144) bytes: continue at L(bk_ck_sse2_alignment), with
 * %r10 already holding the address of L(bkPxQx).  Otherwise both cursors
 * are moved back to the start of the remaining region and control jumps
 * through L(bkPxQx), a table of 32-bit table-relative offsets indexed by
 * %r8.  The table itself is defined outside this part of the file.
 */
2376L(bk_qw_aligned):
2377	lea    L(bkPxQx)(%rip),%r10
2378
2379	cmp    $0x90,%r8		# 144
2380	jg     L(bk_ck_sse2_alignment)
2381
2382	sub    %r8,%rcx
2383	sub    %r8,%rdx
2384
2385	movslq (%r10,%r8,4),%r9
2386	lea    (%r9,%r10,1),%r10
2387	jmpq   *%r10
2388
2389	.balign 16
/*
 * L(bk_align): brings the destination end pointer (%rcx) down to an
 * 8-byte boundary before the main backward copy.  Skipped entirely when
 * the count in %r8 is 8 or less.
 *
 * Depending on bits 0, 1 and 2 of %rcx it copies 1, 2 and/or 4 bytes from
 * just below the source end to just below the destination end, lowering
 * %rcx, %rdx and %r8 by the same amount each time.  Every test is made on
 * the already-updated %rcx.  All paths end in L(bk_qw_aligned).
 */
2390L(bk_align):
2391	# only align if len > 8
2392	cmp    $8,%r8
2393	jle    L(bk_qw_aligned)
2394	test   $0x1,%rcx
2395	je     L(bk_tst2)
2396	dec    %rcx
2397	dec    %rdx
2398	dec    %r8
2399	mov    (%rdx),%r9b
2400	mov    %r9b,(%rcx)
2401
2402L(bk_tst2):
2403	test   $0x2,%rcx
2404	je     L(bk_tst3)
2405
2406L(bk_got2):
2407	sub    $0x2,%rcx
2408	sub    $0x2,%rdx
2409	sub    $0x2,%r8
2410	movzwq (%rdx),%r9
2411	mov    %r9w,(%rcx)
2412
2413L(bk_tst3):
2414	test   $0x4,%rcx
2415	je     L(bk_qw_aligned)
2416
2417L(bk_got3):
2418	sub    $0x4,%rcx
2419	sub    $0x4,%rdx
2420	sub    $0x4,%r8
2421	mov    (%rdx),%r9d
2422	mov    %r9d,(%rcx)
2423	jmp    L(bk_qw_aligned)
2424
2425	.balign 16
/*
 * L(bk_ck_sse2_alignment): chooses the bulk strategy for a backward copy
 * of more than 0x90 bytes.
 *
 * If .memops_method equals NO_SSE the copy is done by L(bk_use_rep).
 * Otherwise the destination end pointer (%rcx) must be 16-byte aligned
 * for L(bk_sse2_cpy): if it already is, jump there; if not,
 * L(bk_sse2_align) copies one quadword from below the source end to
 * below the destination end, lowers %rcx, %rdx and %r8 by 8, and falls
 * through the alignment padding into L(bk_sse2_cpy).
 */
2426L(bk_ck_sse2_alignment):
2427	cmpl   $NO_SSE,.memops_method(%rip)
2428	je     L(bk_use_rep)
2429	# check alignment of last byte
2430	test   $0xf,%rcx
2431	jz     L(bk_sse2_cpy)
2432
2433L(bk_sse2_align):
2434	# only here if already aligned on at least a qword bndry
2435	sub    $0x8,%rcx
2436	sub    $0x8,%rdx
2437	sub    $0x8,%r8
2438	mov    (%rdx),%r9
2439	mov    %r9,(%rcx)
2440	#jmp   L(bk_sse2_cpy)
2441
2442	.balign 16
/*
 * L(bk_sse2_cpy): backward copy loop, 128 bytes per iteration.  Both
 * cursors are lowered by 0x80 first and the block is then copied from its
 * highest 16-byte chunk down to its lowest, with unaligned loads (movdqu)
 * and aligned stores (movdqa, so %rcx must be 16-byte aligned).
 *
 * %rdx = source cursor, %rcx = destination cursor, %r8 = bytes still to
 * copy.  The cmp against 0x80 is issued mid-loop; the movdqu/movdqa
 * instructions after it do not write the flags, so the jge at the bottom
 * still tests that compare.
 *
 * L(bk_sse2_cpy_end) moves both cursors back to the start of the
 * remaining region and jumps through L(bkPxQx), a table of 32-bit
 * table-relative offsets indexed by %r8.  The table itself is defined
 * outside this part of the file.
 */
2443L(bk_sse2_cpy):
2444	sub    $0x80,%rcx		# 128
2445	sub    $0x80,%rdx
2446	movdqu 0x70(%rdx),%xmm3
2447	movdqu 0x60(%rdx),%xmm2
2448	movdqa %xmm3,0x70(%rcx)
2449	movdqa %xmm2,0x60(%rcx)
2450	sub    $0x80,%r8
2451	movdqu 0x50(%rdx),%xmm1
2452	movdqu 0x40(%rdx),%xmm0
2453	movdqa %xmm1,0x50(%rcx)
2454	movdqa %xmm0,0x40(%rcx)
2455
2456	cmp    $0x80,%r8
2457	movdqu 0x30(%rdx),%xmm3
2458	movdqu 0x20(%rdx),%xmm2
2459	movdqa %xmm3,0x30(%rcx)
2460	movdqa %xmm2,0x20(%rcx)
2461	movdqu 0x10(%rdx),%xmm1
2462	movdqu (%rdx),%xmm0
2463	movdqa %xmm1,0x10(%rcx)
2464	movdqa %xmm0,(%rcx)
2465	jge    L(bk_sse2_cpy)
2466
2467L(bk_sse2_cpy_end):
2468	lea    L(bkPxQx)(%rip),%r10
2469	sub    %r8,%rdx
2470	sub    %r8,%rcx
2471	movslq (%r10,%r8,4),%r9
2472	lea    (%r9,%r10,1),%r10
2473	jmpq   *%r10
2474
2475	.balign 16
/*
 * L(bk_use_rep): backward copy with rep movsq and the direction flag set.
 *
 * On entry %rcx / %rdx point one past the end of the destination / source
 * and %r8 is the byte count.  The destination end is parked in %r9 (xchg)
 * because %rcx is needed as the repeat count.  %rsi / %rdi are set to the
 * last quadword of each buffer (end - 8) and %r8 / 8 quadwords are copied
 * from high addresses to low; cld then restores the forward direction.
 *
 * Afterwards %rcx is restored from %r9 and both cursors are moved back by
 * the full byte count, i.e. to the start of the buffers.  If %r8 & 7 is
 * non-zero those leading bytes are copied by jumping through L(bkPxQx), a
 * table of 32-bit table-relative offsets indexed by the remainder and
 * defined outside this part of the file; otherwise the routine returns.
 */
2476L(bk_use_rep):
2477	xchg   %rcx,%r9
2478	mov    %rdx,%rsi		# source
2479	mov    %r9,%rdi			# destination
2480	mov    %r8,%rcx			# count
2481	sub    $8,%rsi
2482	sub    $8,%rdi
2483	shr    $3,%rcx
2484	std				# reverse direction
2485	rep
2486	  movsq
2487	cld				# reset direction flag
2488
2489	xchg   %rcx,%r9
2490	lea    L(bkPxQx)(%rip),%r10
2491	sub    %r8,%rdx
2492	sub    %r8,%rcx
2493	andq   $7,%r8			# remainder
2494	jz     2f
2495	movslq (%r10,%r8,4),%r9
2496	lea    (%r9,%r10,1),%r10
2497	jmpq   *%r10
24982:
2499	ret
2500
2501	.balign 16
/*
 * L(bkP0QI) .. L(bkP0Q0): fall-through ladder for the end of a backward
 * copy.  %rdx / %rcx point at the start of the source / destination
 * region.  Each rung copies one quadword, from offset 0x88 at L(bkP0QI)
 * down to offset 0 at L(bkP0Q1), so higher addresses are always written
 * before lower ones.  Entering at L(bkP0QI) copies 18 quadwords (144
 * bytes); each later label copies one quadword fewer; L(bkP0Q0) copies
 * nothing and just returns.
 */
2502L(bkP0QI):
2503	mov    0x88(%rdx),%r10
2504	mov    %r10,0x88(%rcx)
2505L(bkP0QH):
2506	mov    0x80(%rdx),%r10
2507	mov    %r10,0x80(%rcx)
2508L(bkP0QG):
2509	mov    0x78(%rdx),%r9
2510	mov    %r9,0x78(%rcx)
2511L(bkP0QF):
2512	mov    0x70(%rdx),%r11
2513	mov    %r11,0x70(%rcx)
2514L(bkP0QE):
2515	mov    0x68(%rdx),%r10
2516	mov    %r10,0x68(%rcx)
2517L(bkP0QD):
2518	mov    0x60(%rdx),%r9
2519	mov    %r9,0x60(%rcx)
2520L(bkP0QC):
2521	mov    0x58(%rdx),%r11
2522	mov    %r11,0x58(%rcx)
2523L(bkP0QB):
2524	mov    0x50(%rdx),%r10
2525	mov    %r10,0x50(%rcx)
2526L(bkP0QA):
2527	mov    0x48(%rdx),%r9
2528	mov    %r9,0x48(%rcx)
2529L(bkP0Q9):
2530	mov    0x40(%rdx),%r11
2531	mov    %r11,0x40(%rcx)
2532L(bkP0Q8):
2533	mov    0x38(%rdx),%r10
2534	mov    %r10,0x38(%rcx)
2535L(bkP0Q7):
2536	mov    0x30(%rdx),%r9
2537	mov    %r9,0x30(%rcx)
2538L(bkP0Q6):
2539	mov    0x28(%rdx),%r11
2540	mov    %r11,0x28(%rcx)
2541L(bkP0Q5):
2542	mov    0x20(%rdx),%r10
2543	mov    %r10,0x20(%rcx)
2544L(bkP0Q4):
2545	mov    0x18(%rdx),%r9
2546	mov    %r9,0x18(%rcx)
2547L(bkP0Q3):
2548	mov    0x10(%rdx),%r11
2549	mov    %r11,0x10(%rcx)
2550L(bkP0Q2):
2551	mov    0x8(%rdx),%r10
2552	mov    %r10,0x8(%rcx)
2553L(bkP0Q1):
2554	mov    (%rdx),%r9
2555	mov    %r9,(%rcx)
2556L(bkP0Q0):
2557	ret
2558
2559	.balign 16
/*
 * L(bkP1QI) .. L(bkP1Q0): fall-through ladder for the end of a backward
 * copy whose length is a whole number of quadwords plus one byte.
 * %rdx / %rcx point at the start of the source / destination region.
 * Each rung copies one quadword, from offset 0x89 at L(bkP1QI) down to
 * offset 0x1 at L(bkP1Q1); L(bkP1Q0) then copies the single byte at
 * offset 0 and returns.  Higher addresses are always written before lower
 * ones.  Entering at L(bkP1QI) copies 18 quadwords plus the byte (145
 * bytes); each later label copies one quadword fewer.
 */
2560L(bkP1QI):
2561	mov    0x89(%rdx),%r10
2562	mov    %r10,0x89(%rcx)
2563L(bkP1QH):
2564	mov    0x81(%rdx),%r11
2565	mov    %r11,0x81(%rcx)
2566L(bkP1QG):
2567	mov    0x79(%rdx),%r10
2568	mov    %r10,0x79(%rcx)
2569L(bkP1QF):
2570	mov    0x71(%rdx),%r9
2571	mov    %r9,0x71(%rcx)
2572L(bkP1QE):
2573	mov    0x69(%rdx),%r11
2574	mov    %r11,0x69(%rcx)
2575L(bkP1QD):
2576	mov    0x61(%rdx),%r10
2577	mov    %r10,0x61(%rcx)
2578L(bkP1QC):
2579	mov    0x59(%rdx),%r9
2580	mov    %r9,0x59(%rcx)
2581L(bkP1QB):
2582	mov    0x51(%rdx),%r11
2583	mov    %r11,0x51(%rcx)
2584L(bkP1QA):
2585	mov    0x49(%rdx),%r10
2586	mov    %r10,0x49(%rcx)
2587L(bkP1Q9):
2588	mov    0x41(%rdx),%r9
2589	mov    %r9,0x41(%rcx)
2590L(bkP1Q8):
2591	mov    0x39(%rdx),%r11
2592	mov    %r11,0x39(%rcx)
2593L(bkP1Q7):
2594	mov    0x31(%rdx),%r10
2595	mov    %r10,0x31(%rcx)
2596L(bkP1Q6):
2597	mov    0x29(%rdx),%r9
2598	mov    %r9,0x29(%rcx)
2599L(bkP1Q5):
2600	mov    0x21(%rdx),%r11
2601	mov    %r11,0x21(%rcx)
2602L(bkP1Q4):
2603	mov    0x19(%rdx),%r10
2604	mov    %r10,0x19(%rcx)
2605L(bkP1Q3):
2606	mov    0x11(%rdx),%r9
2607	mov    %r9,0x11(%rcx)
2608L(bkP1Q2):
2609	mov    0x9(%rdx),%r11
2610	mov    %r11,0x9(%rcx)
2611L(bkP1Q1):
2612	mov    0x1(%rdx),%r10
2613	mov    %r10,0x1(%rcx)
2614L(bkP1Q0):
2615	mov    (%rdx),%r9b
2616	mov    %r9b,(%rcx)
2617	ret
2618
2619	.balign 16
/*
 * L(bkP2Qn) ladder: copy n quadwords plus 2 trailing bytes.
 *
 * %rdx is the source base and %rcx the destination base.  Entering at
 * L(bkP2Qn) falls through n 8-byte load/store pairs, highest offset
 * first, then copies the 16-bit word at offset 0 and returns.  The top
 * entry L(bkP2QI) moves 18 quadwords + 2 bytes (0x92 bytes).
 * %r9, %r10 and %r11 are the only registers written.
 *
 * Every entry label is listed in the L(bkPxQx) offset table, so labels
 * must stay attached to the load/store pair they currently precede.
 */
2620L(bkP2QI):
2621	mov    0x8a(%rdx),%r10
2622	mov    %r10,0x8a(%rcx)
2623L(bkP2QH):
2624	mov    0x82(%rdx),%r11
2625	mov    %r11,0x82(%rcx)
2626L(bkP2QG):
2627	mov    0x7a(%rdx),%r10
2628	mov    %r10,0x7a(%rcx)
2629L(bkP2QF):
2630	mov    0x72(%rdx),%r9
2631	mov    %r9,0x72(%rcx)
2632L(bkP2QE):
2633	mov    0x6a(%rdx),%r11
2634	mov    %r11,0x6a(%rcx)
2635L(bkP2QD):
2636	mov    0x62(%rdx),%r10
2637	mov    %r10,0x62(%rcx)
2638L(bkP2QC):
2639	mov    0x5a(%rdx),%r9
2640	mov    %r9,0x5a(%rcx)
2641L(bkP2QB):
2642	mov    0x52(%rdx),%r11
2643	mov    %r11,0x52(%rcx)
2644L(bkP2QA):
2645	mov    0x4a(%rdx),%r10
2646	mov    %r10,0x4a(%rcx)
2647L(bkP2Q9):
2648	mov    0x42(%rdx),%r9
2649	mov    %r9,0x42(%rcx)
2650L(bkP2Q8):
2651	mov    0x3a(%rdx),%r11
2652	mov    %r11,0x3a(%rcx)
2653L(bkP2Q7):
2654	mov    0x32(%rdx),%r10
2655	mov    %r10,0x32(%rcx)
2656L(bkP2Q6):
2657	mov    0x2a(%rdx),%r9
2658	mov    %r9,0x2a(%rcx)
2659L(bkP2Q5):
2660	mov    0x22(%rdx),%r11
2661	mov    %r11,0x22(%rcx)
2662L(bkP2Q4):
2663	mov    0x1a(%rdx),%r10
2664	mov    %r10,0x1a(%rcx)
2665L(bkP2Q3):
2666	mov    0x12(%rdx),%r9
2667	mov    %r9,0x12(%rcx)
2668L(bkP2Q2):
2669	mov    0xa(%rdx),%r11
2670	mov    %r11,0xa(%rcx)
2671L(bkP2Q1):
2672	mov    0x2(%rdx),%r10
2673	mov    %r10,0x2(%rcx)
2674L(bkP2Q0):	/* trailing 2 bytes at offset 0 */
2675	mov    (%rdx),%r9w
2676	mov    %r9w,(%rcx)
2677	ret
2678
2679	.balign 16
/*
 * L(bkP3Qn) ladder: copy n quadwords plus 3 trailing bytes.
 *
 * %rdx is the source base and %rcx the destination base.  Entering at
 * L(bkP3Qn) falls through n 8-byte load/store pairs, highest offset
 * first, then copies the last 3 bytes (a word at offset 1 and a byte
 * at offset 0) and returns.  The top entry L(bkP3QI) moves
 * 18 quadwords + 3 bytes (0x93 bytes).  %r9, %r10 and %r11 are the
 * only registers written.
 *
 * The trailing pieces are both loaded before either is stored, so a
 * store can never overwrite a source byte that has not been read yet,
 * whatever the overlap between the two buffers.
 *
 * Every entry label is listed in the L(bkPxQx) offset table, so labels
 * must stay attached to the load/store pair they currently precede.
 */
2680L(bkP3QI):
2681	mov    0x8b(%rdx),%r10
2682	mov    %r10,0x8b(%rcx)
2683L(bkP3QH):
2684	mov    0x83(%rdx),%r11
2685	mov    %r11,0x83(%rcx)
2686L(bkP3QG):
2687	mov    0x7b(%rdx),%r10
2688	mov    %r10,0x7b(%rcx)
2689L(bkP3QF):
2690	mov    0x73(%rdx),%r9
2691	mov    %r9,0x73(%rcx)
2692L(bkP3QE):
2693	mov    0x6b(%rdx),%r11
2694	mov    %r11,0x6b(%rcx)
2695L(bkP3QD):
2696	mov    0x63(%rdx),%r10
2697	mov    %r10,0x63(%rcx)
2698L(bkP3QC):
2699	mov    0x5b(%rdx),%r9
2700	mov    %r9,0x5b(%rcx)
2701L(bkP3QB):
2702	mov    0x53(%rdx),%r11
2703	mov    %r11,0x53(%rcx)
2704L(bkP3QA):
2705	mov    0x4b(%rdx),%r10
2706	mov    %r10,0x4b(%rcx)
2707L(bkP3Q9):
2708	mov    0x43(%rdx),%r9
2709	mov    %r9,0x43(%rcx)
2710L(bkP3Q8):
2711	mov    0x3b(%rdx),%r11
2712	mov    %r11,0x3b(%rcx)
2713L(bkP3Q7):
2714	mov    0x33(%rdx),%r10
2715	mov    %r10,0x33(%rcx)
2716L(bkP3Q6):
2717	mov    0x2b(%rdx),%r9
2718	mov    %r9,0x2b(%rcx)
2719L(bkP3Q5):
2720	mov    0x23(%rdx),%r11
2721	mov    %r11,0x23(%rcx)
2722L(bkP3Q4):
2723	mov    0x1b(%rdx),%r10
2724	mov    %r10,0x1b(%rcx)
2725L(bkP3Q3):
2726	mov    0x13(%rdx),%r9
2727	mov    %r9,0x13(%rcx)
2728L(bkP3Q2):
2729	mov    0xb(%rdx),%r11
2730	mov    %r11,0xb(%rcx)
2731L(bkP3Q1):
2732	mov    0x3(%rdx),%r10
2733	mov    %r10,0x3(%rcx)
2734L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
2735	mov    0x1(%rdx),%r9w
2736	mov    (%rdx),%r10b
2737	mov    %r9w,0x1(%rcx)
2738	mov    %r10b,(%rcx)
2739	ret
2740
2741	.balign 16
/*
 * L(bkP4Qn) ladder: copy n quadwords plus 4 trailing bytes.
 *
 * %rdx is the source base and %rcx the destination base.  Entering at
 * L(bkP4Qn) falls through n 8-byte load/store pairs, highest offset
 * first, then copies the 32-bit word at offset 0 and returns.  The top
 * entry L(bkP4QI) moves 18 quadwords + 4 bytes (0x94 bytes).
 * %r9, %r10 and %r11 are the only registers written.
 *
 * Every entry label is listed in the L(bkPxQx) offset table, so labels
 * must stay attached to the load/store pair they currently precede.
 */
2742L(bkP4QI):
2743	mov    0x8c(%rdx),%r10
2744	mov    %r10,0x8c(%rcx)
2745L(bkP4QH):
2746	mov    0x84(%rdx),%r11
2747	mov    %r11,0x84(%rcx)
2748L(bkP4QG):
2749	mov    0x7c(%rdx),%r10
2750	mov    %r10,0x7c(%rcx)
2751L(bkP4QF):
2752	mov    0x74(%rdx),%r9
2753	mov    %r9,0x74(%rcx)
2754L(bkP4QE):
2755	mov    0x6c(%rdx),%r11
2756	mov    %r11,0x6c(%rcx)
2757L(bkP4QD):
2758	mov    0x64(%rdx),%r10
2759	mov    %r10,0x64(%rcx)
2760L(bkP4QC):
2761	mov    0x5c(%rdx),%r9
2762	mov    %r9,0x5c(%rcx)
2763L(bkP4QB):
2764	mov    0x54(%rdx),%r11
2765	mov    %r11,0x54(%rcx)
2766L(bkP4QA):
2767	mov    0x4c(%rdx),%r10
2768	mov    %r10,0x4c(%rcx)
2769L(bkP4Q9):
2770	mov    0x44(%rdx),%r9
2771	mov    %r9,0x44(%rcx)
2772L(bkP4Q8):
2773	mov    0x3c(%rdx),%r11
2774	mov    %r11,0x3c(%rcx)
2775L(bkP4Q7):
2776	mov    0x34(%rdx),%r10
2777	mov    %r10,0x34(%rcx)
2778L(bkP4Q6):
2779	mov    0x2c(%rdx),%r9
2780	mov    %r9,0x2c(%rcx)
2781L(bkP4Q5):
2782	mov    0x24(%rdx),%r11
2783	mov    %r11,0x24(%rcx)
2784L(bkP4Q4):
2785	mov    0x1c(%rdx),%r10
2786	mov    %r10,0x1c(%rcx)
2787L(bkP4Q3):
2788	mov    0x14(%rdx),%r9
2789	mov    %r9,0x14(%rcx)
2790L(bkP4Q2):
2791	mov    0xc(%rdx),%r11
2792	mov    %r11,0xc(%rcx)
2793L(bkP4Q1):
2794	mov    0x4(%rdx),%r10
2795	mov    %r10,0x4(%rcx)
2796L(bkP4Q0):	/* trailing 4 bytes at offset 0 */
2797	mov    (%rdx),%r9d
2798	mov    %r9d,(%rcx)
2799	ret
2800
2801	.balign 16
/*
 * L(bkP5Qn) ladder: copy n quadwords plus 5 trailing bytes.
 *
 * %rdx is the source base and %rcx the destination base.  Entering at
 * L(bkP5Qn) falls through n 8-byte load/store pairs, highest offset
 * first, then copies the last 5 bytes (a dword at offset 1 and a byte
 * at offset 0) and returns.  The top entry L(bkP5QI) moves
 * 18 quadwords + 5 bytes (0x95 bytes).  %r9, %r10 and %r11 are the
 * only registers written.
 *
 * The trailing pieces are both loaded before either is stored, so a
 * store can never overwrite a source byte that has not been read yet,
 * whatever the overlap between the two buffers.
 *
 * Every entry label is listed in the L(bkPxQx) offset table, so labels
 * must stay attached to the load/store pair they currently precede.
 */
2802L(bkP5QI):
2803	mov    0x8d(%rdx),%r10
2804	mov    %r10,0x8d(%rcx)
2805L(bkP5QH):
2806	mov    0x85(%rdx),%r9
2807	mov    %r9,0x85(%rcx)
2808L(bkP5QG):
2809	mov    0x7d(%rdx),%r11
2810	mov    %r11,0x7d(%rcx)
2811L(bkP5QF):
2812	mov    0x75(%rdx),%r10
2813	mov    %r10,0x75(%rcx)
2814L(bkP5QE):
2815	mov    0x6d(%rdx),%r9
2816	mov    %r9,0x6d(%rcx)
2817L(bkP5QD):
2818	mov    0x65(%rdx),%r11
2819	mov    %r11,0x65(%rcx)
2820L(bkP5QC):
2821	mov    0x5d(%rdx),%r10
2822	mov    %r10,0x5d(%rcx)
2823L(bkP5QB):
2824	mov    0x55(%rdx),%r9
2825	mov    %r9,0x55(%rcx)
2826L(bkP5QA):
2827	mov    0x4d(%rdx),%r11
2828	mov    %r11,0x4d(%rcx)
2829L(bkP5Q9):
2830	mov    0x45(%rdx),%r10
2831	mov    %r10,0x45(%rcx)
2832L(bkP5Q8):
2833	mov    0x3d(%rdx),%r9
2834	mov    %r9,0x3d(%rcx)
2835L(bkP5Q7):
2836	mov    0x35(%rdx),%r11
2837	mov    %r11,0x35(%rcx)
2838L(bkP5Q6):
2839	mov    0x2d(%rdx),%r10
2840	mov    %r10,0x2d(%rcx)
2841L(bkP5Q5):
2842	mov    0x25(%rdx),%r9
2843	mov    %r9,0x25(%rcx)
2844L(bkP5Q4):
2845	mov    0x1d(%rdx),%r11
2846	mov    %r11,0x1d(%rcx)
2847L(bkP5Q3):
2848	mov    0x15(%rdx),%r10
2849	mov    %r10,0x15(%rcx)
2850L(bkP5Q2):
2851	mov    0xd(%rdx),%r9
2852	mov    %r9,0xd(%rcx)
2853L(bkP5Q1):
2854	mov    0x5(%rdx),%r11
2855	mov    %r11,0x5(%rcx)
2856L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
2857	mov    0x1(%rdx),%r9d
2858	mov    (%rdx),%r10b
2859	mov    %r9d,0x1(%rcx)
2860	mov    %r10b,(%rcx)
2861	ret
2862
2863	.balign 16
/*
 * L(bkP6Qn) ladder: copy n quadwords plus 6 trailing bytes.
 *
 * %rdx is the source base and %rcx the destination base.  Entering at
 * L(bkP6Qn) falls through n 8-byte load/store pairs, highest offset
 * first, then copies the last 6 bytes (a dword at offset 2 and a word
 * at offset 0) and returns.  The top entry L(bkP6QI) moves
 * 18 quadwords + 6 bytes (0x96 bytes).  %r9, %r10 and %r11 are the
 * only registers written.
 *
 * The trailing pieces are both loaded before either is stored, so a
 * store can never overwrite a source byte that has not been read yet,
 * whatever the overlap between the two buffers.
 *
 * Every entry label is listed in the L(bkPxQx) offset table, so labels
 * must stay attached to the load/store pair they currently precede.
 */
2864L(bkP6QI):
2865	mov    0x8e(%rdx),%r10
2866	mov    %r10,0x8e(%rcx)
2867L(bkP6QH):
2868	mov    0x86(%rdx),%r11
2869	mov    %r11,0x86(%rcx)
2870L(bkP6QG):
2871	mov    0x7e(%rdx),%r10
2872	mov    %r10,0x7e(%rcx)
2873L(bkP6QF):
2874	mov    0x76(%rdx),%r9
2875	mov    %r9,0x76(%rcx)
2876L(bkP6QE):
2877	mov    0x6e(%rdx),%r11
2878	mov    %r11,0x6e(%rcx)
2879L(bkP6QD):
2880	mov    0x66(%rdx),%r10
2881	mov    %r10,0x66(%rcx)
2882L(bkP6QC):
2883	mov    0x5e(%rdx),%r9
2884	mov    %r9,0x5e(%rcx)
2885L(bkP6QB):
2886	mov    0x56(%rdx),%r11
2887	mov    %r11,0x56(%rcx)
2888L(bkP6QA):
2889	mov    0x4e(%rdx),%r10
2890	mov    %r10,0x4e(%rcx)
2891L(bkP6Q9):
2892	mov    0x46(%rdx),%r9
2893	mov    %r9,0x46(%rcx)
2894L(bkP6Q8):
2895	mov    0x3e(%rdx),%r11
2896	mov    %r11,0x3e(%rcx)
2897L(bkP6Q7):
2898	mov    0x36(%rdx),%r10
2899	mov    %r10,0x36(%rcx)
2900L(bkP6Q6):
2901	mov    0x2e(%rdx),%r9
2902	mov    %r9,0x2e(%rcx)
2903L(bkP6Q5):
2904	mov    0x26(%rdx),%r11
2905	mov    %r11,0x26(%rcx)
2906L(bkP6Q4):
2907	mov    0x1e(%rdx),%r10
2908	mov    %r10,0x1e(%rcx)
2909L(bkP6Q3):
2910	mov    0x16(%rdx),%r9
2911	mov    %r9,0x16(%rcx)
2912L(bkP6Q2):
2913	mov    0xe(%rdx),%r11
2914	mov    %r11,0xe(%rcx)
2915L(bkP6Q1):
2916	mov    0x6(%rdx),%r10
2917	mov    %r10,0x6(%rcx)
2918L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
2919	mov    0x2(%rdx),%r9d
2920	mov    (%rdx),%r10w
2921	mov    %r9d,0x2(%rcx)
2922	mov    %r10w,(%rcx)
2923	ret
2924
2925	.balign 16
/*
 * L(bkP7Qn) ladder: copy n quadwords plus 7 trailing bytes.
 *
 * %rdx is the source base and %rcx the destination base.  Entering at
 * L(bkP7Qn) falls through n 8-byte load/store pairs, highest offset
 * first, then copies the last 7 bytes (a dword at offset 3, a word at
 * offset 1 and a byte at offset 0) and returns.  The top entry
 * L(bkP7QI) moves 18 quadwords + 7 bytes (0x97 bytes).  %r9, %r10 and
 * %r11 are the only registers written.
 *
 * The three trailing pieces are all loaded before any of them is
 * stored, so a store can never overwrite a source byte that has not
 * been read yet, whatever the overlap between the two buffers.
 *
 * Every entry label is listed in the L(bkPxQx) offset table, so labels
 * must stay attached to the load/store pair they currently precede.
 */
2926L(bkP7QI):
2927	mov    0x8f(%rdx),%r10
2928	mov    %r10,0x8f(%rcx)
2929L(bkP7QH):
2930	mov    0x87(%rdx),%r11
2931	mov    %r11,0x87(%rcx)
2932L(bkP7QG):
2933	mov    0x7f(%rdx),%r10
2934	mov    %r10,0x7f(%rcx)
2935L(bkP7QF):
2936	mov    0x77(%rdx),%r9
2937	mov    %r9,0x77(%rcx)
2938L(bkP7QE):
2939	mov    0x6f(%rdx),%r11
2940	mov    %r11,0x6f(%rcx)
2941L(bkP7QD):
2942	mov    0x67(%rdx),%r10
2943	mov    %r10,0x67(%rcx)
2944L(bkP7QC):
2945	mov    0x5f(%rdx),%r9
2946	mov    %r9,0x5f(%rcx)
2947L(bkP7QB):
2948	mov    0x57(%rdx),%r11
2949	mov    %r11,0x57(%rcx)
2950L(bkP7QA):
2951	mov    0x4f(%rdx),%r10
2952	mov    %r10,0x4f(%rcx)
2953L(bkP7Q9):
2954	mov    0x47(%rdx),%r9
2955	mov    %r9,0x47(%rcx)
2956L(bkP7Q8):
2957	mov    0x3f(%rdx),%r11
2958	mov    %r11,0x3f(%rcx)
2959L(bkP7Q7):
2960	mov    0x37(%rdx),%r10
2961	mov    %r10,0x37(%rcx)
2962L(bkP7Q6):
2963	mov    0x2f(%rdx),%r9
2964	mov    %r9,0x2f(%rcx)
2965L(bkP7Q5):
2966	mov    0x27(%rdx),%r11
2967	mov    %r11,0x27(%rcx)
2968L(bkP7Q4):
2969	mov    0x1f(%rdx),%r10
2970	mov    %r10,0x1f(%rcx)
2971L(bkP7Q3):
2972	mov    0x17(%rdx),%r9
2973	mov    %r9,0x17(%rcx)
2974L(bkP7Q2):
2975	mov    0xf(%rdx),%r11
2976	mov    %r11,0xf(%rcx)
2977L(bkP7Q1):
2978	mov    0x7(%rdx),%r10
2979	mov    %r10,0x7(%rcx)
2980L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
2981	mov    0x3(%rdx),%r9d
2982	mov    0x1(%rdx),%r10w
2983	mov    (%rdx),%r11b
2984	mov    %r9d,0x3(%rcx)
2985	mov    %r10w,0x1(%rcx)
2986	mov    %r11b,(%rcx)
2987	ret
2988
2989		.balign 16
/*
 * L(bkPxQx): offset table for the L(bkPpQq) copy ladders.
 *
 * Each .int entry holds the distance from the start of this table to
 * one ladder entry point, so the table itself contains no absolute
 * addresses.  Entries are grouped in rows of eight: the row selects
 * the quadword count q (0-9, then A-I for 10-18) and the position in
 * the row selects the trailing byte count p (0-7).  Entry number i
 * therefore names L(bkPpQq) with q = i / 8 and p = i % 8, the ladder
 * entry that copies q quadwords plus p bytes, i.e. i bytes in total.
 * The table has 19 rows, covering 0 through 151 bytes.
 *
 * NOTE(review): the code that indexes this table is outside this
 * excerpt; presumably it loads the entry for the remaining length,
 * adds the table address and jumps there - confirm against the
 * dispatcher before changing the entry size or order.
 */
2990L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)	/* row Q0: 0-7 bytes */
2991		.int L(bkP1Q0)-L(bkPxQx)
2992		.int L(bkP2Q0)-L(bkPxQx)
2993		.int L(bkP3Q0)-L(bkPxQx)
2994		.int L(bkP4Q0)-L(bkPxQx)
2995		.int L(bkP5Q0)-L(bkPxQx)
2996		.int L(bkP6Q0)-L(bkPxQx)
2997		.int L(bkP7Q0)-L(bkPxQx)
2998
2999		.int L(bkP0Q1)-L(bkPxQx)	/* row Q1: 8-15 bytes */
3000		.int L(bkP1Q1)-L(bkPxQx)
3001		.int L(bkP2Q1)-L(bkPxQx)
3002		.int L(bkP3Q1)-L(bkPxQx)
3003		.int L(bkP4Q1)-L(bkPxQx)
3004		.int L(bkP5Q1)-L(bkPxQx)
3005		.int L(bkP6Q1)-L(bkPxQx)
3006		.int L(bkP7Q1)-L(bkPxQx)
3007
3008		.int L(bkP0Q2)-L(bkPxQx)	/* row Q2: 16-23 bytes */
3009		.int L(bkP1Q2)-L(bkPxQx)
3010		.int L(bkP2Q2)-L(bkPxQx)
3011		.int L(bkP3Q2)-L(bkPxQx)
3012		.int L(bkP4Q2)-L(bkPxQx)
3013		.int L(bkP5Q2)-L(bkPxQx)
3014		.int L(bkP6Q2)-L(bkPxQx)
3015		.int L(bkP7Q2)-L(bkPxQx)
3016
3017		.int L(bkP0Q3)-L(bkPxQx)	/* row Q3: 24-31 bytes */
3018		.int L(bkP1Q3)-L(bkPxQx)
3019		.int L(bkP2Q3)-L(bkPxQx)
3020		.int L(bkP3Q3)-L(bkPxQx)
3021		.int L(bkP4Q3)-L(bkPxQx)
3022		.int L(bkP5Q3)-L(bkPxQx)
3023		.int L(bkP6Q3)-L(bkPxQx)
3024		.int L(bkP7Q3)-L(bkPxQx)
3025
3026		.int L(bkP0Q4)-L(bkPxQx)	/* row Q4: 32-39 bytes */
3027		.int L(bkP1Q4)-L(bkPxQx)
3028		.int L(bkP2Q4)-L(bkPxQx)
3029		.int L(bkP3Q4)-L(bkPxQx)
3030		.int L(bkP4Q4)-L(bkPxQx)
3031		.int L(bkP5Q4)-L(bkPxQx)
3032		.int L(bkP6Q4)-L(bkPxQx)
3033		.int L(bkP7Q4)-L(bkPxQx)
3034
3035		.int L(bkP0Q5)-L(bkPxQx)	/* row Q5: 40-47 bytes */
3036		.int L(bkP1Q5)-L(bkPxQx)
3037		.int L(bkP2Q5)-L(bkPxQx)
3038		.int L(bkP3Q5)-L(bkPxQx)
3039		.int L(bkP4Q5)-L(bkPxQx)
3040		.int L(bkP5Q5)-L(bkPxQx)
3041		.int L(bkP6Q5)-L(bkPxQx)
3042		.int L(bkP7Q5)-L(bkPxQx)
3043
3044		.int L(bkP0Q6)-L(bkPxQx)	/* row Q6: 48-55 bytes */
3045		.int L(bkP1Q6)-L(bkPxQx)
3046		.int L(bkP2Q6)-L(bkPxQx)
3047		.int L(bkP3Q6)-L(bkPxQx)
3048		.int L(bkP4Q6)-L(bkPxQx)
3049		.int L(bkP5Q6)-L(bkPxQx)
3050		.int L(bkP6Q6)-L(bkPxQx)
3051		.int L(bkP7Q6)-L(bkPxQx)
3052
3053		.int L(bkP0Q7)-L(bkPxQx)	/* row Q7: 56-63 bytes */
3054		.int L(bkP1Q7)-L(bkPxQx)
3055		.int L(bkP2Q7)-L(bkPxQx)
3056		.int L(bkP3Q7)-L(bkPxQx)
3057		.int L(bkP4Q7)-L(bkPxQx)
3058		.int L(bkP5Q7)-L(bkPxQx)
3059		.int L(bkP6Q7)-L(bkPxQx)
3060		.int L(bkP7Q7)-L(bkPxQx)
3061
3062		.int L(bkP0Q8)-L(bkPxQx)	/* row Q8: 64-71 bytes */
3063		.int L(bkP1Q8)-L(bkPxQx)
3064		.int L(bkP2Q8)-L(bkPxQx)
3065		.int L(bkP3Q8)-L(bkPxQx)
3066		.int L(bkP4Q8)-L(bkPxQx)
3067		.int L(bkP5Q8)-L(bkPxQx)
3068		.int L(bkP6Q8)-L(bkPxQx)
3069		.int L(bkP7Q8)-L(bkPxQx)
3070
3071		.int L(bkP0Q9)-L(bkPxQx)	/* row Q9: 72-79 bytes */
3072		.int L(bkP1Q9)-L(bkPxQx)
3073		.int L(bkP2Q9)-L(bkPxQx)
3074		.int L(bkP3Q9)-L(bkPxQx)
3075		.int L(bkP4Q9)-L(bkPxQx)
3076		.int L(bkP5Q9)-L(bkPxQx)
3077		.int L(bkP6Q9)-L(bkPxQx)
3078		.int L(bkP7Q9)-L(bkPxQx)
3079
3080		.int L(bkP0QA)-L(bkPxQx)	/* row QA: 80-87 bytes */
3081		.int L(bkP1QA)-L(bkPxQx)
3082		.int L(bkP2QA)-L(bkPxQx)
3083		.int L(bkP3QA)-L(bkPxQx)
3084		.int L(bkP4QA)-L(bkPxQx)
3085		.int L(bkP5QA)-L(bkPxQx)
3086		.int L(bkP6QA)-L(bkPxQx)
3087		.int L(bkP7QA)-L(bkPxQx)
3088
3089		.int L(bkP0QB)-L(bkPxQx)	/* row QB: 88-95 bytes */
3090		.int L(bkP1QB)-L(bkPxQx)
3091		.int L(bkP2QB)-L(bkPxQx)
3092		.int L(bkP3QB)-L(bkPxQx)
3093		.int L(bkP4QB)-L(bkPxQx)
3094		.int L(bkP5QB)-L(bkPxQx)
3095		.int L(bkP6QB)-L(bkPxQx)
3096		.int L(bkP7QB)-L(bkPxQx)
3097
3098		.int L(bkP0QC)-L(bkPxQx)	/* row QC: 96-103 bytes */
3099		.int L(bkP1QC)-L(bkPxQx)
3100		.int L(bkP2QC)-L(bkPxQx)
3101		.int L(bkP3QC)-L(bkPxQx)
3102		.int L(bkP4QC)-L(bkPxQx)
3103		.int L(bkP5QC)-L(bkPxQx)
3104		.int L(bkP6QC)-L(bkPxQx)
3105		.int L(bkP7QC)-L(bkPxQx)
3106
3107		.int L(bkP0QD)-L(bkPxQx)	/* row QD: 104-111 bytes */
3108		.int L(bkP1QD)-L(bkPxQx)
3109		.int L(bkP2QD)-L(bkPxQx)
3110		.int L(bkP3QD)-L(bkPxQx)
3111		.int L(bkP4QD)-L(bkPxQx)
3112		.int L(bkP5QD)-L(bkPxQx)
3113		.int L(bkP6QD)-L(bkPxQx)
3114		.int L(bkP7QD)-L(bkPxQx)
3115
3116		.int L(bkP0QE)-L(bkPxQx)	/* row QE: 112-119 bytes */
3117		.int L(bkP1QE)-L(bkPxQx)
3118		.int L(bkP2QE)-L(bkPxQx)
3119		.int L(bkP3QE)-L(bkPxQx)
3120		.int L(bkP4QE)-L(bkPxQx)
3121		.int L(bkP5QE)-L(bkPxQx)
3122		.int L(bkP6QE)-L(bkPxQx)
3123		.int L(bkP7QE)-L(bkPxQx)
3124
3125		.int L(bkP0QF)-L(bkPxQx)	/* row QF: 120-127 bytes */
3126		.int L(bkP1QF)-L(bkPxQx)
3127		.int L(bkP2QF)-L(bkPxQx)
3128		.int L(bkP3QF)-L(bkPxQx)
3129		.int L(bkP4QF)-L(bkPxQx)
3130		.int L(bkP5QF)-L(bkPxQx)
3131		.int L(bkP6QF)-L(bkPxQx)
3132		.int L(bkP7QF)-L(bkPxQx)
3133
3134		.int L(bkP0QG)-L(bkPxQx)	/* row QG: 128-135 bytes */
3135		.int L(bkP1QG)-L(bkPxQx)
3136		.int L(bkP2QG)-L(bkPxQx)
3137		.int L(bkP3QG)-L(bkPxQx)
3138		.int L(bkP4QG)-L(bkPxQx)
3139		.int L(bkP5QG)-L(bkPxQx)
3140		.int L(bkP6QG)-L(bkPxQx)
3141		.int L(bkP7QG)-L(bkPxQx)
3142
3143		.int L(bkP0QH)-L(bkPxQx)	/* row QH: 136-143 bytes */
3144		.int L(bkP1QH)-L(bkPxQx)
3145		.int L(bkP2QH)-L(bkPxQx)
3146		.int L(bkP3QH)-L(bkPxQx)
3147		.int L(bkP4QH)-L(bkPxQx)
3148		.int L(bkP5QH)-L(bkPxQx)
3149		.int L(bkP6QH)-L(bkPxQx)
3150		.int L(bkP7QH)-L(bkPxQx)
3151
3152		.int L(bkP0QI)-L(bkPxQx)	/* row QI: 144-151 bytes */
3153		.int L(bkP1QI)-L(bkPxQx)
3154		.int L(bkP2QI)-L(bkPxQx)
3155		.int L(bkP3QI)-L(bkPxQx)
3156		.int L(bkP4QI)-L(bkPxQx)
3157		.int L(bkP5QI)-L(bkPxQx)
3158		.int L(bkP6QI)-L(bkPxQx)
3159		.int L(bkP7QI)-L(bkPxQx)
3160
3161	SET_SIZE(memmove)
3162