xref: /titanic_44/usr/src/lib/libc/amd64/gen/memcpy.s (revision 6b5ad791879c4fea5b397add82a50aaf0d392b91)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2008, Intel Corporation
24 * All rights reserved.
25 */
26
27/*
28 * memcpy.s - copies a block of memory from a source to a destination
29 *	Implements memcpy() and memmove() libc primitives.
30 */
31	.ident	"%Z%%M%	%I%	%E% SMI"
32
33	.file	"%M%"
34
35#include <sys/asm_linkage.h>
36	ANSI_PRAGMA_WEAK(memmove,function)
37	ANSI_PRAGMA_WEAK(memcpy,function)
38
39#include "synonyms.h"
40#include "cache.h"
41#include "proc64_id.h"
42
43	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)
44
45#define L(s) .memcpy/**/s
46
47/*
48 * memcpy algorithm overview:
49 *
50 * Thresholds used below were determined experimentally.
51 *
52 * Pseudo code:
53 *
54 * If (size <= 128 bytes) {
55 *	do unrolled code (primarily 8-byte loads/stores) regardless of
56 *	alignment.
57 * } else {
58 *	Align destination to 16-byte boundary
59 *
60 *      if (NO_SSE) {
61 *		If (size > half of the largest level cache) {
62 *			Use 8-byte non-temporal stores (64-bytes/loop)
63 *		} else {
64 *			if (size > 4K && size <= half l1 cache size) {
65 *				Use rep movsq
66 *			} else {
67 *				Use 8-byte loads/stores (64 bytes per loop)
68 *			}
69 *		}
70 *
71 *	} else { **USE SSE**
72 *		If (size > half of the largest level cache) {
73 *			Use 16-byte non-temporal stores (128-bytes per loop)
74 *		} else {
75 *			If (both source and destination are aligned) {
76 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
77 *			} else {
78 *			    use pairs of xmm registers with SSE2 or SSSE3
79 *			    instructions to concatenate and shift appropriately
80 *			    to account for source unalignment. This enables
81 *			    16-byte aligned loads to be done.
82 *			}
83 *		}
84 *	}
85 *
86 *	Finish any remaining bytes via unrolled code above.
87 * }
88 *
89 * memmove overview:
90 *	memmove is the same as memcpy except one case where copy needs to be
91 *	done backwards. The copy backwards code is done in a similar manner.
92 */
93
94	ENTRY(memmove)
	/*
	 * memmove entry point.  Per the comments below: %rdi = dst,
	 * %rsi = src, %rdx = len.  A backwards copy is chosen only when
	 * src < dst < src + len; every other case shares the forward
	 * (memcpy) path.  Both compares are unsigned (jbe/jb), which is
	 * what address comparisons need.
	 * NOTE(review): src + len is formed without a carry check.
	 */
95	cmp	%rsi,%rdi		# if dst <= src
96	jbe	L(CopyForward)		# then do copy forward
97	mov	%rsi,%r9		# move src to r9
98	add	%rdx,%r9		# add len to get addr of end of src
99	cmp	%r9,%rdi		# if dst < end of src
100	jb	L(CopyBackwards)	# then do copy backwards
101	jmp	L(CopyForward)
102
103	ENTRY (memcpy)
104L(CopyForward):
	/*
	 * Forward copy, shared by memcpy and the non-overlapping cases of
	 * memmove.  Sets up the working registers used by everything below:
	 *	%r8  = remaining length
	 *	%rcx = destination pointer
	 *	%rdx = source pointer
	 *	%rax = original dst
	 * Lengths of 0..128 are handled by jumping straight into the
	 * unrolled L(PxQy) tails through the L(fwdPxQx) table; the table
	 * holds 32-bit offsets relative to its own base.
	 * NOTE(review): the length test uses jg (signed), so a length with
	 * the top bit set is not sent to the large-copy path and would index
	 * the table out of range - confirm callers cannot pass such values.
	 */
105	mov    %rdx,%r8			# r8  = len
106	mov    %rdi,%rcx		# rcx = dst
107	mov    %rsi,%rdx		# rdx = src
108	mov    %rdi,%rax		# rax = dst; the L(PxQ0) tails ret without touching it
109	lea    L(fwdPxQx)(%rip),%r11	# r11 = base of the small-copy table
110	cmp    $0x80,%r8		# 128
111	jg     L(ck_use_sse2)		# more than 128 bytes: large-copy path
112	add    %r8,%rcx			# rcx = one past the end of dst
113	add    %r8,%rdx			# rdx = one past the end of src
114
115	movslq (%r11,%r8,4),%r10	# r10 = sign-extended table[len]
116	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
117	jmpq   *%r11
118
119	.balign 16
120L(ShrtAlignNew):
	/*
	 * Reached from L(ck_use_sse2) when the destination is not 16-byte
	 * aligned.  Dispatches on dst & 0xf through the L(AliPxQx) table
	 * (32-bit offsets relative to the table base) to the stub that
	 * copies just enough leading bytes to align the destination.
	 */
121	lea    L(AliPxQx)(%rip),%r11
122	mov    %rcx,%r9
123	and    $0xf,%r9			# r9 = dst misalignment, 1..15 here
124
125	movslq (%r11,%r9,4),%r10	# r10 = sign-extended table[r9]
126	lea    (%r10,%r11,1),%r11	# r11 = table base + offset
127	jmpq   *%r11
128
129	.balign 16
	/*
	 * Jump table for forward copies of 0..128 bytes, indexed by the byte
	 * count.  Each entry is a 32-bit offset from the start of the table
	 * to L(PxQy), where y is the number of 8-byte words and x the number
	 * of leftover bytes, so entry 8*y + x is L(PxQy).  The Q digit runs
	 * 0-9 and then A..G for 10..16; the single QG entry is for exactly
	 * 128 bytes.
	 */
130L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)
131           .int        L(P1Q0)-L(fwdPxQx)
132           .int        L(P2Q0)-L(fwdPxQx)
133           .int        L(P3Q0)-L(fwdPxQx)
134           .int        L(P4Q0)-L(fwdPxQx)
135           .int        L(P5Q0)-L(fwdPxQx)
136           .int        L(P6Q0)-L(fwdPxQx)
137           .int        L(P7Q0)-L(fwdPxQx)
138
139           .int        L(P0Q1)-L(fwdPxQx)
140           .int        L(P1Q1)-L(fwdPxQx)
141           .int        L(P2Q1)-L(fwdPxQx)
142           .int        L(P3Q1)-L(fwdPxQx)
143           .int        L(P4Q1)-L(fwdPxQx)
144           .int        L(P5Q1)-L(fwdPxQx)
145           .int        L(P6Q1)-L(fwdPxQx)
146           .int        L(P7Q1)-L(fwdPxQx)
147
148           .int        L(P0Q2)-L(fwdPxQx)
149           .int        L(P1Q2)-L(fwdPxQx)
150           .int        L(P2Q2)-L(fwdPxQx)
151           .int        L(P3Q2)-L(fwdPxQx)
152           .int        L(P4Q2)-L(fwdPxQx)
153           .int        L(P5Q2)-L(fwdPxQx)
154           .int        L(P6Q2)-L(fwdPxQx)
155           .int        L(P7Q2)-L(fwdPxQx)
156
157           .int        L(P0Q3)-L(fwdPxQx)
158           .int        L(P1Q3)-L(fwdPxQx)
159           .int        L(P2Q3)-L(fwdPxQx)
160           .int        L(P3Q3)-L(fwdPxQx)
161           .int        L(P4Q3)-L(fwdPxQx)
162           .int        L(P5Q3)-L(fwdPxQx)
163           .int        L(P6Q3)-L(fwdPxQx)
164           .int        L(P7Q3)-L(fwdPxQx)
165
166           .int        L(P0Q4)-L(fwdPxQx)
167           .int        L(P1Q4)-L(fwdPxQx)
168           .int        L(P2Q4)-L(fwdPxQx)
169           .int        L(P3Q4)-L(fwdPxQx)
170           .int        L(P4Q4)-L(fwdPxQx)
171           .int        L(P5Q4)-L(fwdPxQx)
172           .int        L(P6Q4)-L(fwdPxQx)
173           .int        L(P7Q4)-L(fwdPxQx)
174
175           .int        L(P0Q5)-L(fwdPxQx)
176           .int        L(P1Q5)-L(fwdPxQx)
177           .int        L(P2Q5)-L(fwdPxQx)
178           .int        L(P3Q5)-L(fwdPxQx)
179           .int        L(P4Q5)-L(fwdPxQx)
180           .int        L(P5Q5)-L(fwdPxQx)
181           .int        L(P6Q5)-L(fwdPxQx)
182           .int        L(P7Q5)-L(fwdPxQx)
183
184           .int        L(P0Q6)-L(fwdPxQx)
185           .int        L(P1Q6)-L(fwdPxQx)
186           .int        L(P2Q6)-L(fwdPxQx)
187           .int        L(P3Q6)-L(fwdPxQx)
188           .int        L(P4Q6)-L(fwdPxQx)
189           .int        L(P5Q6)-L(fwdPxQx)
190           .int        L(P6Q6)-L(fwdPxQx)
191           .int        L(P7Q6)-L(fwdPxQx)
192
193           .int        L(P0Q7)-L(fwdPxQx)
194           .int        L(P1Q7)-L(fwdPxQx)
195           .int        L(P2Q7)-L(fwdPxQx)
196           .int        L(P3Q7)-L(fwdPxQx)
197           .int        L(P4Q7)-L(fwdPxQx)
198           .int        L(P5Q7)-L(fwdPxQx)
199           .int        L(P6Q7)-L(fwdPxQx)
200           .int        L(P7Q7)-L(fwdPxQx)
201
202           .int        L(P0Q8)-L(fwdPxQx)
203           .int        L(P1Q8)-L(fwdPxQx)
204           .int        L(P2Q8)-L(fwdPxQx)
205           .int        L(P3Q8)-L(fwdPxQx)
206           .int        L(P4Q8)-L(fwdPxQx)
207           .int        L(P5Q8)-L(fwdPxQx)
208           .int        L(P6Q8)-L(fwdPxQx)
209           .int        L(P7Q8)-L(fwdPxQx)
210
211           .int        L(P0Q9)-L(fwdPxQx)
212           .int        L(P1Q9)-L(fwdPxQx)
213           .int        L(P2Q9)-L(fwdPxQx)
214           .int        L(P3Q9)-L(fwdPxQx)
215           .int        L(P4Q9)-L(fwdPxQx)
216           .int        L(P5Q9)-L(fwdPxQx)
217           .int        L(P6Q9)-L(fwdPxQx)
218           .int        L(P7Q9)-L(fwdPxQx)
219
220           .int        L(P0QA)-L(fwdPxQx)
221           .int        L(P1QA)-L(fwdPxQx)
222           .int        L(P2QA)-L(fwdPxQx)
223           .int        L(P3QA)-L(fwdPxQx)
224           .int        L(P4QA)-L(fwdPxQx)
225           .int        L(P5QA)-L(fwdPxQx)
226           .int        L(P6QA)-L(fwdPxQx)
227           .int        L(P7QA)-L(fwdPxQx)
228
229           .int        L(P0QB)-L(fwdPxQx)
230           .int        L(P1QB)-L(fwdPxQx)
231           .int        L(P2QB)-L(fwdPxQx)
232           .int        L(P3QB)-L(fwdPxQx)
233           .int        L(P4QB)-L(fwdPxQx)
234           .int        L(P5QB)-L(fwdPxQx)
235           .int        L(P6QB)-L(fwdPxQx)
236           .int        L(P7QB)-L(fwdPxQx)
237
238           .int        L(P0QC)-L(fwdPxQx)
239           .int        L(P1QC)-L(fwdPxQx)
240           .int        L(P2QC)-L(fwdPxQx)
241           .int        L(P3QC)-L(fwdPxQx)
242           .int        L(P4QC)-L(fwdPxQx)
243           .int        L(P5QC)-L(fwdPxQx)
244           .int        L(P6QC)-L(fwdPxQx)
245           .int        L(P7QC)-L(fwdPxQx)
246
247           .int        L(P0QD)-L(fwdPxQx)
248           .int        L(P1QD)-L(fwdPxQx)
249           .int        L(P2QD)-L(fwdPxQx)
250           .int        L(P3QD)-L(fwdPxQx)
251           .int        L(P4QD)-L(fwdPxQx)
252           .int        L(P5QD)-L(fwdPxQx)
253           .int        L(P6QD)-L(fwdPxQx)
254           .int        L(P7QD)-L(fwdPxQx)
255
256           .int        L(P0QE)-L(fwdPxQx)
257           .int        L(P1QE)-L(fwdPxQx)
258           .int        L(P2QE)-L(fwdPxQx)
259           .int        L(P3QE)-L(fwdPxQx)
260           .int        L(P4QE)-L(fwdPxQx)
261           .int        L(P5QE)-L(fwdPxQx)
262           .int        L(P6QE)-L(fwdPxQx)
263           .int        L(P7QE)-L(fwdPxQx)
264
265           .int        L(P0QF)-L(fwdPxQx)
266           .int        L(P1QF)-L(fwdPxQx)
267           .int        L(P2QF)-L(fwdPxQx)
268           .int        L(P3QF)-L(fwdPxQx)
269           .int        L(P4QF)-L(fwdPxQx)
270           .int        L(P5QF)-L(fwdPxQx)
271           .int        L(P6QF)-L(fwdPxQx)
272           .int        L(P7QF)-L(fwdPxQx)
273
274           .int        L(P0QG)-L(fwdPxQx)	# 0x80
275
276	   .balign 16
	/*
	 * Jump table used by L(ShrtAlignNew), indexed by dst & 0xf.  Each
	 * entry is a 32-bit offset from the start of the table.  Entry 0
	 * (already aligned) goes straight to L(now_qw_aligned); entry k
	 * goes to the stub that copies the 16 - k leading bytes needed to
	 * bring the destination to a 16-byte boundary.
	 */
277L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)
278           .int        L(A1Q0)-L(AliPxQx)
279           .int        L(A2Q0)-L(AliPxQx)
280           .int        L(A3Q0)-L(AliPxQx)
281           .int        L(A4Q0)-L(AliPxQx)
282           .int        L(A5Q0)-L(AliPxQx)
283           .int        L(A6Q0)-L(AliPxQx)
284           .int        L(A7Q0)-L(AliPxQx)
285           .int        L(A0Q1)-L(AliPxQx)
286           .int        L(A1Q1)-L(AliPxQx)
287           .int        L(A2Q1)-L(AliPxQx)
288           .int        L(A3Q1)-L(AliPxQx)
289           .int        L(A4Q1)-L(AliPxQx)
290           .int        L(A5Q1)-L(AliPxQx)
291           .int        L(A6Q1)-L(AliPxQx)
292           .int        L(A7Q1)-L(AliPxQx)
293
294	.balign 16
295L(A1Q0):			# ; need to move 8+ 7=1+2+4 bytes
	/*
	 * Entry 1 of L(AliPxQx) (dst & 0xf == 1): copy the first 15 bytes
	 * as 1 + 2 + 4 + 8 so that %rcx lands on a 16-byte boundary, advance
	 * both pointers, take 15 off the remaining length in %r8, and resume
	 * the large-copy path.
	 */
296	movzbq (%rdx),%r11
297	sub    $0xf,%r8
298	mov    %r11b,(%rcx)
299
300	movzwq 0x1(%rdx),%r10
301	mov    %r10w,0x1(%rcx)
302
303	mov    0x3(%rdx),%r9d
304	mov    %r9d,0x3(%rcx)
305
306	mov    0x7(%rdx),%r11
307	add    $0xf,%rdx
308	mov    %r11,0x7(%rcx)
309
310	add    $0xf,%rcx
311	jmp    L(now_qw_aligned)
312
313	.balign 16
314L(A2Q0):			# ; need to move 8+ 6=2+4 bytes
	/*
	 * Entry 2 of L(AliPxQx) (dst & 0xf == 2): copy the first 14 bytes
	 * as 2 + 4 + 8 so that %rcx lands on a 16-byte boundary, advance
	 * both pointers, take 14 off %r8, and resume the large-copy path.
	 */
315	movzwq (%rdx),%r10
316	sub    $0xe,%r8
317	mov    %r10w,(%rcx)
318
319	mov    0x2(%rdx),%r9d
320	mov    %r9d,0x2(%rcx)
321
322	mov    0x6(%rdx),%r11
323	add    $0xe,%rdx
324	mov    %r11,0x6(%rcx)
325	add    $0xe,%rcx
326	jmp    L(now_qw_aligned)
327
328	.balign 16
329L(A3Q0):			# ; need to move 8+ 5=1+4 bytes
	/*
	 * Entry 3 of L(AliPxQx) (dst & 0xf == 3): copy the first 13 bytes
	 * as 1 + 4 + 8 so that %rcx lands on a 16-byte boundary, advance
	 * both pointers, take 13 off %r8, and resume the large-copy path.
	 */
330	movzbq (%rdx),%r11
331	sub    $0xd,%r8
332	mov    %r11b,(%rcx)
333
334	mov    0x1(%rdx),%r9d
335	mov    %r9d,0x1(%rcx)
336
337	mov    0x5(%rdx),%r10
338	add    $0xd,%rdx
339	mov    %r10,0x5(%rcx)
340
341	add    $0xd,%rcx
342	jmp    L(now_qw_aligned)
343
344	.balign 16
345L(A4Q0):			# ; need to move 8+4 bytes
	/*
	 * Entry 4 of L(AliPxQx) (dst & 0xf == 4): copy the first 12 bytes
	 * as 4 + 8 so that %rcx lands on a 16-byte boundary, advance both
	 * pointers, take 12 off %r8, and resume the large-copy path.
	 */
346	mov    (%rdx),%r9d
347	sub    $0xc,%r8
348	mov    %r9d,(%rcx)
349
350	mov    0x4(%rdx),%r10
351	add    $0xc,%rdx
352	mov    %r10,0x4(%rcx)
353
354	add    $0xc,%rcx
355	jmp    L(now_qw_aligned)
356
357	.balign 16
358L(A5Q0):			# ; need to move 8+ 3=1+2 bytes
	/*
	 * Entry 5 of L(AliPxQx) (dst & 0xf == 5): copy the first 11 bytes
	 * as 1 + 2 + 8 so that %rcx lands on a 16-byte boundary, advance
	 * both pointers, take 11 off %r8, and resume the large-copy path.
	 */
359	movzbq (%rdx),%r11
360	sub    $0xb,%r8
361	mov    %r11b,(%rcx)
362
363	movzwq 0x1(%rdx),%r10
364	mov    %r10w,0x1(%rcx)
365
366	mov    0x3(%rdx),%r9
367	add    $0xb,%rdx
368	mov    %r9,0x3(%rcx)
369
370	add    $0xb,%rcx
371	jmp    L(now_qw_aligned)
372
373	.balign 16
374L(A6Q0):			# ; need to move 8+2 bytes
	/*
	 * Entry 6 of L(AliPxQx) (dst & 0xf == 6): copy the first 10 bytes
	 * as 2 + 8 so that %rcx lands on a 16-byte boundary, advance both
	 * pointers, take 10 off %r8, and resume the large-copy path.
	 */
375	movzwq (%rdx),%r10
376	sub    $0xa,%r8
377	mov    %r10w,(%rcx)
378
379	mov    0x2(%rdx),%r9
380	add    $0xa,%rdx
381	mov    %r9,0x2(%rcx)
382
383	add    $0xa,%rcx
384	jmp    L(now_qw_aligned)
385
386	.balign 16
387L(A7Q0):			# ; need to move 8+1 byte
	/*
	 * Entry 7 of L(AliPxQx) (dst & 0xf == 7): copy the first 9 bytes
	 * as 1 + 8 so that %rcx lands on a 16-byte boundary, advance both
	 * pointers, take 9 off %r8, and resume the large-copy path.
	 */
388	movzbq (%rdx),%r11
389	sub    $0x9,%r8
390	mov    %r11b,(%rcx)
391
392	mov    0x1(%rdx),%r10
393	add    $0x9,%rdx
394	mov    %r10,0x1(%rcx)
395
396	add    $0x9,%rcx
397	jmp    L(now_qw_aligned)
398
399	.balign 16
400L(A0Q1):			# ; need to move 8 bytes
	/*
	 * Entry 8 of L(AliPxQx) (dst & 0xf == 8): copy one 8-byte word so
	 * that %rcx lands on a 16-byte boundary, advance both pointers,
	 * take 8 off %r8, and resume the large-copy path.
	 */
401
402	mov    (%rdx),%r10
403	add    $0x8,%rdx
404	sub    $0x8,%r8
405	mov    %r10,(%rcx)
406
407	add    $0x8,%rcx
408	jmp    L(now_qw_aligned)
409
410	.balign 16
411L(A1Q1):			# ; need to move 7=1+2+4 bytes
	/*
	 * Entry 9 of L(AliPxQx) (dst & 0xf == 9): copy the first 7 bytes
	 * as 1 + 2 + 4 so that %rcx lands on a 16-byte boundary, advance
	 * both pointers, take 7 off %r8, and resume the large-copy path.
	 */
412	movzbq (%rdx),%r11
413	sub    $0x7,%r8
414	mov    %r11b,(%rcx)
415
416	movzwq 0x1(%rdx),%r10
417	mov    %r10w,0x1(%rcx)
418
419	mov    0x3(%rdx),%r9d
420	add    $0x7,%rdx
421	mov    %r9d,0x3(%rcx)
422	add    $0x7,%rcx
423	jmp    L(now_qw_aligned)
424
425	.balign 16
426L(A2Q1):			# ; need to move 6=2+4 bytes
	/*
	 * Entry 10 of L(AliPxQx) (dst & 0xf == 10): copy the first 6 bytes
	 * as 2 + 4 so that %rcx lands on a 16-byte boundary, advance both
	 * pointers, take 6 off %r8, and resume the large-copy path.
	 */
427	movzwq (%rdx),%r10
428	sub    $0x6,%r8
429	mov    %r10w,(%rcx)
430	mov    0x2(%rdx),%r9d
431	add    $0x6,%rdx
432	mov    %r9d,0x2(%rcx)
433	add    $0x6,%rcx
434	jmp    L(now_qw_aligned)
435
436	.balign 16
437L(A3Q1):			# ; need to move 5=1+4 bytes
	/*
	 * Entry 11 of L(AliPxQx) (dst & 0xf == 11): copy the first 5 bytes
	 * as 1 + 4 so that %rcx lands on a 16-byte boundary, advance both
	 * pointers, take 5 off %r8, and resume the large-copy path.
	 */
438	movzbq (%rdx),%r11
439	sub    $0x5,%r8
440	mov    %r11b,(%rcx)
441	mov    0x1(%rdx),%r9d
442	add    $0x5,%rdx
443	mov    %r9d,0x1(%rcx)
444	add    $0x5,%rcx
445	jmp    L(now_qw_aligned)
446
447	.balign 16
448L(A4Q1):			# ; need to move 4 bytes
	/*
	 * Entry 12 of L(AliPxQx) (dst & 0xf == 12): copy one 4-byte word so
	 * that %rcx lands on a 16-byte boundary, advance both pointers,
	 * take 4 off %r8, and resume the large-copy path.
	 */
449	mov    (%rdx),%r9d
450	sub    $0x4,%r8
451	add    $0x4,%rdx
452	mov    %r9d,(%rcx)
453	add    $0x4,%rcx
454	jmp    L(now_qw_aligned)
455
456	.balign 16
457L(A5Q1):			# ; need to move 3=1+2 bytes
	/*
	 * Entry 13 of L(AliPxQx) (dst & 0xf == 13): copy the first 3 bytes
	 * as 1 + 2 so that %rcx lands on a 16-byte boundary, advance both
	 * pointers, take 3 off %r8, and resume the large-copy path.
	 */
458	movzbq (%rdx),%r11
459	sub    $0x3,%r8
460	mov    %r11b,(%rcx)
461
462	movzwq 0x1(%rdx),%r10
463	add    $0x3,%rdx
464	mov    %r10w,0x1(%rcx)
465
466	add    $0x3,%rcx
467	jmp    L(now_qw_aligned)
468
469	.balign 16
470L(A6Q1):			# ; need to move 2 bytes
	/*
	 * Entry 14 of L(AliPxQx) (dst & 0xf == 14): copy one 2-byte word so
	 * that %rcx lands on a 16-byte boundary, advance both pointers,
	 * take 2 off %r8, and resume the large-copy path.
	 */
471	movzwq (%rdx),%r10
472	sub    $0x2,%r8
473	add    $0x2,%rdx
474	mov    %r10w,(%rcx)
475	add    $0x2,%rcx
476	jmp    L(now_qw_aligned)
477
478	.balign 16
479L(A7Q1):			# ; need to move 1 byte
	/*
	 * Entry 15 of L(AliPxQx) (dst & 0xf == 15): copy a single byte so
	 * that %rcx lands on a 16-byte boundary, advance both pointers,
	 * take 1 off %r8, and resume the large-copy path.
	 */
480	movzbq (%rdx),%r11
481	dec    %r8
482	inc    %rdx
483	mov    %r11b,(%rcx)
484	inc    %rcx
485	jmp    L(now_qw_aligned)
486
487
488	.balign 16
	/*
	 * Tail copy for lengths that are an exact multiple of 8 (0..128).
	 * %rdx and %rcx point one past the end of the source and destination
	 * (see the dispatch in L(CopyForward)), so every offset is negative.
	 * Entering at L(P0Qn) copies the last n 8-byte words by falling
	 * through the chain; L(P0Q0) copies nothing and just returns.
	 */
489L(P0QG):
490	mov    -0x80(%rdx),%r9
491	mov    %r9,-0x80(%rcx)
492L(P0QF):
493	mov    -0x78(%rdx),%r10
494	mov    %r10,-0x78(%rcx)
495L(P0QE):
496	mov    -0x70(%rdx),%r9
497	mov    %r9,-0x70(%rcx)
498L(P0QD):
499	mov    -0x68(%rdx),%r10
500	mov    %r10,-0x68(%rcx)
501L(P0QC):
502	mov    -0x60(%rdx),%r9
503	mov    %r9,-0x60(%rcx)
504L(P0QB):
505	mov    -0x58(%rdx),%r10
506	mov    %r10,-0x58(%rcx)
507L(P0QA):
508	mov    -0x50(%rdx),%r9
509	mov    %r9,-0x50(%rcx)
510L(P0Q9):
511	mov    -0x48(%rdx),%r10
512	mov    %r10,-0x48(%rcx)
513L(P0Q8):
514	mov    -0x40(%rdx),%r9
515	mov    %r9,-0x40(%rcx)
516L(P0Q7):
517	mov    -0x38(%rdx),%r10
518	mov    %r10,-0x38(%rcx)
519L(P0Q6):
520	mov    -0x30(%rdx),%r9
521	mov    %r9,-0x30(%rcx)
522L(P0Q5):
523	mov    -0x28(%rdx),%r10
524	mov    %r10,-0x28(%rcx)
525L(P0Q4):
526	mov    -0x20(%rdx),%r9
527	mov    %r9,-0x20(%rcx)
528L(P0Q3):
529	mov    -0x18(%rdx),%r10
530	mov    %r10,-0x18(%rcx)
531L(P0Q2):
532	mov    -0x10(%rdx),%r9
533	mov    %r9,-0x10(%rcx)
534L(P0Q1):
535	mov    -0x8(%rdx),%r10
536	mov    %r10,-0x8(%rcx)
537L(P0Q0):
538	ret
539
540	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 1 (1..121).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P1Qn) copies n 8-byte words by
	 * falling through the chain, then the final byte, and returns.
	 */
541L(P1QF):
542	mov    -0x79(%rdx),%r9
543	mov    %r9,-0x79(%rcx)
544L(P1QE):
545	mov    -0x71(%rdx),%r11
546	mov    %r11,-0x71(%rcx)
547L(P1QD):
548	mov    -0x69(%rdx),%r10
549	mov    %r10,-0x69(%rcx)
550L(P1QC):
551	mov    -0x61(%rdx),%r9
552	mov    %r9,-0x61(%rcx)
553L(P1QB):
554	mov    -0x59(%rdx),%r11
555	mov    %r11,-0x59(%rcx)
556L(P1QA):
557	mov    -0x51(%rdx),%r10
558	mov    %r10,-0x51(%rcx)
559L(P1Q9):
560	mov    -0x49(%rdx),%r9
561	mov    %r9,-0x49(%rcx)
562L(P1Q8):
563	mov    -0x41(%rdx),%r11
564	mov    %r11,-0x41(%rcx)
565L(P1Q7):
566	mov    -0x39(%rdx),%r10
567	mov    %r10,-0x39(%rcx)
568L(P1Q6):
569	mov    -0x31(%rdx),%r9
570	mov    %r9,-0x31(%rcx)
571L(P1Q5):
572	mov    -0x29(%rdx),%r11
573	mov    %r11,-0x29(%rcx)
574L(P1Q4):
575	mov    -0x21(%rdx),%r10
576	mov    %r10,-0x21(%rcx)
577L(P1Q3):
578	mov    -0x19(%rdx),%r9
579	mov    %r9,-0x19(%rcx)
580L(P1Q2):
581	mov    -0x11(%rdx),%r11
582	mov    %r11,-0x11(%rcx)
583L(P1Q1):
584	mov    -0x9(%rdx),%r10
585	mov    %r10,-0x9(%rcx)
586L(P1Q0):
587	movzbq -0x1(%rdx),%r9		# last byte
588	mov    %r9b,-0x1(%rcx)
589	ret
590
591	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 2 (2..122).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P2Qn) copies n 8-byte words by
	 * falling through the chain, then the final 2 bytes, and returns.
	 */
592L(P2QF):
593	mov    -0x7a(%rdx),%r9
594	mov    %r9,-0x7a(%rcx)
595L(P2QE):
596	mov    -0x72(%rdx),%r11
597	mov    %r11,-0x72(%rcx)
598L(P2QD):
599	mov    -0x6a(%rdx),%r10
600	mov    %r10,-0x6a(%rcx)
601L(P2QC):
602	mov    -0x62(%rdx),%r9
603	mov    %r9,-0x62(%rcx)
604L(P2QB):
605	mov    -0x5a(%rdx),%r11
606	mov    %r11,-0x5a(%rcx)
607L(P2QA):
608	mov    -0x52(%rdx),%r10
609	mov    %r10,-0x52(%rcx)
610L(P2Q9):
611	mov    -0x4a(%rdx),%r9
612	mov    %r9,-0x4a(%rcx)
613L(P2Q8):
614	mov    -0x42(%rdx),%r11
615	mov    %r11,-0x42(%rcx)
616L(P2Q7):
617	mov    -0x3a(%rdx),%r10
618	mov    %r10,-0x3a(%rcx)
619L(P2Q6):
620	mov    -0x32(%rdx),%r9
621	mov    %r9,-0x32(%rcx)
622L(P2Q5):
623	mov    -0x2a(%rdx),%r11
624	mov    %r11,-0x2a(%rcx)
625L(P2Q4):
626	mov    -0x22(%rdx),%r10
627	mov    %r10,-0x22(%rcx)
628L(P2Q3):
629	mov    -0x1a(%rdx),%r9
630	mov    %r9,-0x1a(%rcx)
631L(P2Q2):
632	mov    -0x12(%rdx),%r11
633	mov    %r11,-0x12(%rcx)
634L(P2Q1):
635	mov    -0xa(%rdx),%r10
636	mov    %r10,-0xa(%rcx)
637L(P2Q0):
638	movzwq -0x2(%rdx),%r9		# last 2 bytes
639	mov    %r9w,-0x2(%rcx)
640	ret
641
642	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 3 (3..123).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P3Qn) copies n 8-byte words by
	 * falling through the chain, then the final 3 bytes as 2 + 1, and
	 * returns.
	 */
643L(P3QF):
644	mov    -0x7b(%rdx),%r9
645	mov    %r9,-0x7b(%rcx)
646L(P3QE):
647	mov    -0x73(%rdx),%r11
648	mov    %r11,-0x73(%rcx)
649L(P3QD):
650	mov    -0x6b(%rdx),%r10
651	mov    %r10,-0x6b(%rcx)
652L(P3QC):
653	mov    -0x63(%rdx),%r9
654	mov    %r9,-0x63(%rcx)
655L(P3QB):
656	mov    -0x5b(%rdx),%r11
657	mov    %r11,-0x5b(%rcx)
658L(P3QA):
659	mov    -0x53(%rdx),%r10
660	mov    %r10,-0x53(%rcx)
661L(P3Q9):
662	mov    -0x4b(%rdx),%r9
663	mov    %r9,-0x4b(%rcx)
664L(P3Q8):
665	mov    -0x43(%rdx),%r11
666	mov    %r11,-0x43(%rcx)
667L(P3Q7):
668	mov    -0x3b(%rdx),%r10
669	mov    %r10,-0x3b(%rcx)
670L(P3Q6):
671	mov    -0x33(%rdx),%r9
672	mov    %r9,-0x33(%rcx)
673L(P3Q5):
674	mov    -0x2b(%rdx),%r11
675	mov    %r11,-0x2b(%rcx)
676L(P3Q4):
677	mov    -0x23(%rdx),%r10
678	mov    %r10,-0x23(%rcx)
679L(P3Q3):
680	mov    -0x1b(%rdx),%r9
681	mov    %r9,-0x1b(%rcx)
682L(P3Q2):
683	mov    -0x13(%rdx),%r11
684	mov    %r11,-0x13(%rcx)
685L(P3Q1):
686	mov    -0xb(%rdx),%r10
687	mov    %r10,-0xb(%rcx)
688	/*
689	 * These trailing loads/stores have to do all their loads 1st,
690	 * then do the stores.
691	 */
	/*
	 * NOTE(review): presumably this ordering keeps the tail correct when
	 * the buffers overlap, since memmove reaches this code through
	 * L(CopyForward) - confirm before reordering.
	 */
692L(P3Q0):
693	movzwq -0x3(%rdx),%r9
694	movzbq -0x1(%rdx),%r10
695	mov    %r9w,-0x3(%rcx)
696	mov    %r10b,-0x1(%rcx)
697	ret
698
699	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 4 (4..124).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P4Qn) copies n 8-byte words by
	 * falling through the chain, then the final 4 bytes, and returns.
	 */
700L(P4QF):
701	mov    -0x7c(%rdx),%r9
702	mov    %r9,-0x7c(%rcx)
703L(P4QE):
704	mov    -0x74(%rdx),%r11
705	mov    %r11,-0x74(%rcx)
706L(P4QD):
707	mov    -0x6c(%rdx),%r10
708	mov    %r10,-0x6c(%rcx)
709L(P4QC):
710	mov    -0x64(%rdx),%r9
711	mov    %r9,-0x64(%rcx)
712L(P4QB):
713	mov    -0x5c(%rdx),%r11
714	mov    %r11,-0x5c(%rcx)
715L(P4QA):
716	mov    -0x54(%rdx),%r10
717	mov    %r10,-0x54(%rcx)
718L(P4Q9):
719	mov    -0x4c(%rdx),%r9
720	mov    %r9,-0x4c(%rcx)
721L(P4Q8):
722	mov    -0x44(%rdx),%r11
723	mov    %r11,-0x44(%rcx)
724L(P4Q7):
725	mov    -0x3c(%rdx),%r10
726	mov    %r10,-0x3c(%rcx)
727L(P4Q6):
728	mov    -0x34(%rdx),%r9
729	mov    %r9,-0x34(%rcx)
730L(P4Q5):
731	mov    -0x2c(%rdx),%r11
732	mov    %r11,-0x2c(%rcx)
733L(P4Q4):
734	mov    -0x24(%rdx),%r10
735	mov    %r10,-0x24(%rcx)
736L(P4Q3):
737	mov    -0x1c(%rdx),%r9
738	mov    %r9,-0x1c(%rcx)
739L(P4Q2):
740	mov    -0x14(%rdx),%r11
741	mov    %r11,-0x14(%rcx)
742L(P4Q1):
743	mov    -0xc(%rdx),%r10
744	mov    %r10,-0xc(%rcx)
745L(P4Q0):
746	mov    -0x4(%rdx),%r9d		# last 4 bytes
747	mov    %r9d,-0x4(%rcx)
748	ret
749
750	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 5 (5..125).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P5Qn) copies n 8-byte words by
	 * falling through the chain, then the final 5 bytes as 4 + 1, and
	 * returns.
	 */
751L(P5QF):
752	mov    -0x7d(%rdx),%r9
753	mov    %r9,-0x7d(%rcx)
754L(P5QE):
755	mov    -0x75(%rdx),%r11
756	mov    %r11,-0x75(%rcx)
757L(P5QD):
758	mov    -0x6d(%rdx),%r10
759	mov    %r10,-0x6d(%rcx)
760L(P5QC):
761	mov    -0x65(%rdx),%r9
762	mov    %r9,-0x65(%rcx)
763L(P5QB):
764	mov    -0x5d(%rdx),%r11
765	mov    %r11,-0x5d(%rcx)
766L(P5QA):
767	mov    -0x55(%rdx),%r10
768	mov    %r10,-0x55(%rcx)
769L(P5Q9):
770	mov    -0x4d(%rdx),%r9
771	mov    %r9,-0x4d(%rcx)
772L(P5Q8):
773	mov    -0x45(%rdx),%r11
774	mov    %r11,-0x45(%rcx)
775L(P5Q7):
776	mov    -0x3d(%rdx),%r10
777	mov    %r10,-0x3d(%rcx)
778L(P5Q6):
779	mov    -0x35(%rdx),%r9
780	mov    %r9,-0x35(%rcx)
781L(P5Q5):
782	mov    -0x2d(%rdx),%r11
783	mov    %r11,-0x2d(%rcx)
784L(P5Q4):
785	mov    -0x25(%rdx),%r10
786	mov    %r10,-0x25(%rcx)
787L(P5Q3):
788	mov    -0x1d(%rdx),%r9
789	mov    %r9,-0x1d(%rcx)
790L(P5Q2):
791	mov    -0x15(%rdx),%r11
792	mov    %r11,-0x15(%rcx)
793L(P5Q1):
794	mov    -0xd(%rdx),%r10
795	mov    %r10,-0xd(%rcx)
796	/*
797	 * These trailing loads/stores have to do all their loads 1st,
798	 * then do the stores.
799	 */
	/*
	 * NOTE(review): presumably this ordering keeps the tail correct when
	 * the buffers overlap, since memmove reaches this code through
	 * L(CopyForward) - confirm before reordering.
	 */
800L(P5Q0):
801	mov    -0x5(%rdx),%r9d
802	movzbq -0x1(%rdx),%r10
803	mov    %r9d,-0x5(%rcx)
804	mov    %r10b,-0x1(%rcx)
805	ret
806
807	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 6 (6..126).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P6Qn) copies n 8-byte words by
	 * falling through the chain, then the final 6 bytes as 4 + 2, and
	 * returns.
	 */
808L(P6QF):
809	mov    -0x7e(%rdx),%r9
810	mov    %r9,-0x7e(%rcx)
811L(P6QE):
812	mov    -0x76(%rdx),%r11
813	mov    %r11,-0x76(%rcx)
814L(P6QD):
815	mov    -0x6e(%rdx),%r10
816	mov    %r10,-0x6e(%rcx)
817L(P6QC):
818	mov    -0x66(%rdx),%r9
819	mov    %r9,-0x66(%rcx)
820L(P6QB):
821	mov    -0x5e(%rdx),%r11
822	mov    %r11,-0x5e(%rcx)
823L(P6QA):
824	mov    -0x56(%rdx),%r10
825	mov    %r10,-0x56(%rcx)
826L(P6Q9):
827	mov    -0x4e(%rdx),%r9
828	mov    %r9,-0x4e(%rcx)
829L(P6Q8):
830	mov    -0x46(%rdx),%r11
831	mov    %r11,-0x46(%rcx)
832L(P6Q7):
833	mov    -0x3e(%rdx),%r10
834	mov    %r10,-0x3e(%rcx)
835L(P6Q6):
836	mov    -0x36(%rdx),%r9
837	mov    %r9,-0x36(%rcx)
838L(P6Q5):
839	mov    -0x2e(%rdx),%r11
840	mov    %r11,-0x2e(%rcx)
841L(P6Q4):
842	mov    -0x26(%rdx),%r10
843	mov    %r10,-0x26(%rcx)
844L(P6Q3):
845	mov    -0x1e(%rdx),%r9
846	mov    %r9,-0x1e(%rcx)
847L(P6Q2):
848	mov    -0x16(%rdx),%r11
849	mov    %r11,-0x16(%rcx)
850L(P6Q1):
851	mov    -0xe(%rdx),%r10
852	mov    %r10,-0xe(%rcx)
853	/*
854	 * These trailing loads/stores have to do all their loads 1st,
855	 * then do the stores.
856	 */
	/*
	 * NOTE(review): presumably this ordering keeps the tail correct when
	 * the buffers overlap, since memmove reaches this code through
	 * L(CopyForward) - confirm before reordering.
	 */
857L(P6Q0):
858	mov    -0x6(%rdx),%r9d
859	movzwq -0x2(%rdx),%r10
860	mov    %r9d,-0x6(%rcx)
861	mov    %r10w,-0x2(%rcx)
862	ret
863
864	.balign 16
	/*
	 * Tail copy for lengths of the form 8*n + 7 (7..127).  %rdx and %rcx
	 * point one past the end of the source and destination, so every
	 * offset is negative.  Entering at L(P7Qn) copies n 8-byte words by
	 * falling through the chain, then the final 7 bytes as 4 + 2 + 1,
	 * and returns.
	 */
865L(P7QF):
866	mov    -0x7f(%rdx),%r9
867	mov    %r9,-0x7f(%rcx)
868L(P7QE):
869	mov    -0x77(%rdx),%r11
870	mov    %r11,-0x77(%rcx)
871L(P7QD):
872	mov    -0x6f(%rdx),%r10
873	mov    %r10,-0x6f(%rcx)
874L(P7QC):
875	mov    -0x67(%rdx),%r9
876	mov    %r9,-0x67(%rcx)
877L(P7QB):
878	mov    -0x5f(%rdx),%r11
879	mov    %r11,-0x5f(%rcx)
880L(P7QA):
881	mov    -0x57(%rdx),%r10
882	mov    %r10,-0x57(%rcx)
883L(P7Q9):
884	mov    -0x4f(%rdx),%r9
885	mov    %r9,-0x4f(%rcx)
886L(P7Q8):
887	mov    -0x47(%rdx),%r11
888	mov    %r11,-0x47(%rcx)
889L(P7Q7):
890	mov    -0x3f(%rdx),%r10
891	mov    %r10,-0x3f(%rcx)
892L(P7Q6):
893	mov    -0x37(%rdx),%r9
894	mov    %r9,-0x37(%rcx)
895L(P7Q5):
896	mov    -0x2f(%rdx),%r11
897	mov    %r11,-0x2f(%rcx)
898L(P7Q4):
899	mov    -0x27(%rdx),%r10
900	mov    %r10,-0x27(%rcx)
901L(P7Q3):
902	mov    -0x1f(%rdx),%r9
903	mov    %r9,-0x1f(%rcx)
904L(P7Q2):
905	mov    -0x17(%rdx),%r11
906	mov    %r11,-0x17(%rcx)
907L(P7Q1):
908	mov    -0xf(%rdx),%r10
909	mov    %r10,-0xf(%rcx)
910	/*
911	 * These trailing loads/stores have to do all their loads 1st,
912	 * then do the stores.
913	 */
	/*
	 * NOTE(review): presumably this ordering keeps the tail correct when
	 * the buffers overlap, since memmove reaches this code through
	 * L(CopyForward) - confirm before reordering.
	 */
914L(P7Q0):
915	mov    -0x7(%rdx),%r9d
916	movzwq -0x3(%rdx),%r10
917	movzbq -0x1(%rdx),%r11
918	mov    %r9d,-0x7(%rcx)
919	mov    %r10w,-0x3(%rcx)
920	mov    %r11b,-0x1(%rcx)
921	ret
922
923	.balign 16
924L(ck_use_sse2):
	/*
	 * Large-copy path, entered from L(CopyForward) when the length test
	 * against 128 succeeds.  %rcx = dst, %rdx = src, %r8 = length.  If
	 * the destination is already 16-byte aligned, control falls through
	 * to L(now_qw_aligned).
	 */
925	/*
926	 * Align dest to 16 byte boundary.
927	 */
928	test   $0xf,%rcx
929	jnz    L(ShrtAlignNew)		# misaligned: copy 1..15 head bytes first
930
931L(now_qw_aligned):
	/*
	 * The destination (%rcx) is 16-byte aligned here; %rdx = src and
	 * %r8 = bytes still to copy.  Picks the copy strategy:
	 *   - .memops_method == NO_SSE       -> L(Loop8byte_pre)
	 *   - %r8 > half the largest cache   -> L(sse2_nt_move)
	 *   - source also 16-byte aligned    -> L(pre_both_aligned)
	 *   - otherwise dispatch on src & 0xf through L(SSE_src), or
	 *     L(SSSE3_src) when the USE_SSSE3 bit is set in .memops_method.
	 * The first three targets are defined outside this part of the file.
	 */
932	cmpl   $NO_SSE,.memops_method(%rip)
933	je     L(Loop8byte_pre)
934
935	/*
936	 * The fall-through path is to do SSE2 16-byte load/stores
937	 */
938
939	/*
940	 * If current move size is larger than half of the highest level cache
941	 * size, then do non-temporal moves.
942	 */
943	mov    .largest_level_cache_size(%rip),%r9d	# 32-bit load clears the upper half of r9
944	shr    %r9		# take half of it
945	cmp    %r9,%r8
946	jg     L(sse2_nt_move)		# signed compare of remaining length
947
948	/*
949	 * If both the source and dest are aligned, then use the both aligned
950	 * logic. Well aligned data should reap the rewards.
951	 */
952	test   $0xf,%rdx
953	jz     L(pre_both_aligned)
954
955	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
956	testl  $USE_SSSE3,.memops_method(%rip)
957	jz     1f
958	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
959
9601:
961	/*
962	 * if the src is not 16 byte aligned...
963	 */
	/*
	 * Copy the first 16 bytes with an unaligned load, then round the
	 * source pointer up to the next 16-byte boundary and preload that
	 * aligned chunk into %xmm1.  The next byte to copy then sits at
	 * offset r11 = src & 0xf inside %xmm1, which is the state the
	 * loops reached through the table expect on entry.
	 */
964	mov    %rdx,%r11
965	and    $0xf,%r11		# r11 = source misalignment, 1..15
966	movdqu (%rdx),%xmm0
967	movdqa %xmm0,(%rcx)
968	add    $0x10,%rdx
969	sub    %r11,%rdx		# rdx = (src & ~0xf) + 16
970	add    $0x10,%rcx
971	sub    $0x10,%r8
972	movdqa (%rdx),%xmm1
973
974	movslq (%r10,%r11,4),%r9	# r9 = sign-extended table[r11]
975	lea    (%r9,%r10,1),%r10	# r10 = table base + offset
976	jmpq   *%r10
977
978	    .balign 16
	/*
	 * Dispatch table used when the USE_SSSE3 bit is set in
	 * .memops_method, indexed by src & 0xf.  Entries are 32-bit offsets
	 * from the start of this table.  Index 8 shares L(movdqa8) with the
	 * SSE2 table.  Index 0 is not reached from the dispatch in
	 * L(now_qw_aligned), which branches to L(pre_both_aligned) for an
	 * aligned source before getting there.  The L(mov3dqaN) targets are
	 * defined outside this part of the file.
	 */
979L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)
980	    .int        L(mov3dqa1) -L(SSSE3_src)
981	    .int        L(mov3dqa2) -L(SSSE3_src)
982	    .int        L(mov3dqa3) -L(SSSE3_src)
983	    .int        L(mov3dqa4) -L(SSSE3_src)
984	    .int        L(mov3dqa5) -L(SSSE3_src)
985	    .int        L(mov3dqa6) -L(SSSE3_src)
986	    .int        L(mov3dqa7) -L(SSSE3_src)
987	    .int        L(movdqa8)  -L(SSSE3_src)
988	    .int        L(mov3dqa9) -L(SSSE3_src)
989	    .int        L(mov3dqa10)-L(SSSE3_src)
990	    .int        L(mov3dqa11)-L(SSSE3_src)
991	    .int        L(mov3dqa12)-L(SSSE3_src)
992	    .int        L(mov3dqa13)-L(SSSE3_src)
993	    .int        L(mov3dqa14)-L(SSSE3_src)
994	    .int        L(mov3dqa15)-L(SSSE3_src)
995L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)	# default (SSE2) table, indexed by src & 0xf
996	    .int        L(movdqa1) -L(SSE_src)	# entries are 32-bit offsets from L(SSE_src)
997	    .int        L(movdqa2) -L(SSE_src)
998	    .int        L(movdqa3) -L(SSE_src)
999	    .int        L(movdqa4) -L(SSE_src)
1000	    .int        L(movdqa5) -L(SSE_src)
1001	    .int        L(movdqa6) -L(SSE_src)
1002	    .int        L(movdqa7) -L(SSE_src)
1003	    .int        L(movdqa8) -L(SSE_src)	# also used by L(SSSE3_src)
1004	    .int        L(movdqa9) -L(SSE_src)
1005	    .int        L(movdqa10)-L(SSE_src)
1006	    .int        L(movdqa11)-L(SSE_src)
1007	    .int        L(movdqa12)-L(SSE_src)
1008	    .int        L(movdqa13)-L(SSE_src)
1009	    .int        L(movdqa14)-L(SSE_src)
1010	    .int        L(movdqa15)-L(SSE_src)
1011
1012	.balign 16
1013L(movdqa1):
	/*
	 * SSE2 copy loop for a source misaligned by 1 byte (entry 1 of
	 * L(SSE_src)).  On entry %xmm1 holds the aligned 16 bytes at (%rdx)
	 * and the next byte to copy is at offset 1 within it.  Each pass
	 * loads two more aligned chunks, splices each with its predecessor
	 * (previous chunk shifted right 1 byte, new chunk shifted left 15)
	 * and stores 32 bytes to the aligned destination.  The cmp result is
	 * consumed by the jge at the bottom; the SSE instructions and lea in
	 * between do not modify the flags.  Loops while at least 0x20 bytes
	 * remain in %r8, then jumps to L(movdqa_epi) (defined outside this
	 * part of the file).
	 */
1014	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
1015	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
1016	lea    0x20(%rdx),%rdx
1017	lea    -0x20(%r8),%r8
1018
1019	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
1020	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
1021	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
1022	por    %xmm1,%xmm3 # OR them together
1023	cmp    $0x20,%r8
1024
1025	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
1026	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
1027	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
1028	por    %xmm2,%xmm0 # OR them together
1029	movdqa %xmm3,(%rcx)     # store it
1030	movdqa %xmm0,0x10(%rcx) # store it
1031	lea    0x20(%rcx),%rcx
1032
1033	jge    L(movdqa1)
1034	jmp    L(movdqa_epi)
1035
1036	.balign 16
1037L(movdqa2):
	/*
	 * SSE2 copy loop for a source misaligned by 2 bytes (entry 2 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 2, left shift by 14) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1038	sub    $0x20,%r8
1039	movdqa 0x10(%rdx),%xmm3
1040	movdqa 0x20(%rdx),%xmm0
1041	add    $0x20,%rdx
1042
1043	psrldq $0x2,%xmm1
1044	movdqa %xmm3,%xmm2
1045	pslldq $0xe,%xmm3
1046	por    %xmm1,%xmm3
1047
1048	psrldq $0x2,%xmm2
1049	movdqa %xmm0,%xmm1
1050	pslldq $0xe,%xmm0
1051	por    %xmm2,%xmm0
1052	movdqa %xmm3,(%rcx)
1053	movdqa %xmm0,0x10(%rcx)
1054
1055	add    $0x20,%rcx
1056	cmp    $0x20,%r8
1057	jge    L(movdqa2)
1058	jmp    L(movdqa_epi)
1059
1060	.balign 16
1061L(movdqa3):
	/*
	 * SSE2 copy loop for a source misaligned by 3 bytes (entry 3 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 3, left shift by 13) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1062	sub    $0x20,%r8
1063	movdqa 0x10(%rdx),%xmm3
1064	movdqa 0x20(%rdx),%xmm0
1065	add    $0x20,%rdx
1066
1067	psrldq $0x3,%xmm1
1068	movdqa %xmm3,%xmm2
1069	pslldq $0xd,%xmm3
1070	por    %xmm1,%xmm3
1071
1072	psrldq $0x3,%xmm2
1073	movdqa %xmm0,%xmm1
1074	pslldq $0xd,%xmm0
1075	por    %xmm2,%xmm0
1076	movdqa %xmm3,(%rcx)
1077	movdqa %xmm0,0x10(%rcx)
1078
1079	add    $0x20,%rcx
1080	cmp    $0x20,%r8
1081	jge    L(movdqa3)
1082	jmp    L(movdqa_epi)
1083
1084	.balign 16
1085L(movdqa4):
	/*
	 * SSE2 copy loop for a source misaligned by 4 bytes (entry 4 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 4, left shift by 12) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1086	sub    $0x20,%r8
1087	movdqa 0x10(%rdx),%xmm3
1088	movdqa 0x20(%rdx),%xmm0
1089	add    $0x20,%rdx
1090
1091	psrldq $0x4,%xmm1
1092	movdqa %xmm3,%xmm2
1093	pslldq $0xc,%xmm3
1094	por    %xmm1,%xmm3
1095
1096	psrldq $0x4,%xmm2
1097	movdqa %xmm0,%xmm1
1098	pslldq $0xc,%xmm0
1099	por    %xmm2,%xmm0
1100
1101	movdqa %xmm3,(%rcx)
1102	movdqa %xmm0,0x10(%rcx)
1103
1104	add    $0x20,%rcx
1105	cmp    $0x20,%r8
1106	jge    L(movdqa4)
1107	jmp    L(movdqa_epi)
1108
1109	.balign 16
1110L(movdqa5):
	/*
	 * SSE2 copy loop for a source misaligned by 5 bytes (entry 5 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 5, left shift by 11) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1111	sub    $0x20,%r8
1112	movdqa 0x10(%rdx),%xmm3
1113	movdqa 0x20(%rdx),%xmm0
1114	add    $0x20,%rdx
1115
1116	psrldq $0x5,%xmm1
1117	movdqa %xmm3,%xmm2
1118	pslldq $0xb,%xmm3
1119	por    %xmm1,%xmm3
1120
1121	psrldq $0x5,%xmm2
1122	movdqa %xmm0,%xmm1
1123	pslldq $0xb,%xmm0
1124	por    %xmm2,%xmm0
1125
1126	movdqa %xmm3,(%rcx)
1127	movdqa %xmm0,0x10(%rcx)
1128
1129	add    $0x20,%rcx
1130	cmp    $0x20,%r8
1131	jge    L(movdqa5)
1132	jmp    L(movdqa_epi)
1133
1134	.balign 16
1135L(movdqa6):
	/*
	 * SSE2 copy loop for a source misaligned by 6 bytes (entry 6 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 6, left shift by 10) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1136	sub    $0x20,%r8
1137	movdqa 0x10(%rdx),%xmm3
1138	movdqa 0x20(%rdx),%xmm0
1139	add    $0x20,%rdx
1140
1141	psrldq $0x6,%xmm1
1142	movdqa %xmm3,%xmm2
1143	pslldq $0xa,%xmm3
1144	por    %xmm1,%xmm3
1145
1146	psrldq $0x6,%xmm2
1147	movdqa %xmm0,%xmm1
1148	pslldq $0xa,%xmm0
1149	por    %xmm2,%xmm0
1150	movdqa %xmm3,(%rcx)
1151	movdqa %xmm0,0x10(%rcx)
1152
1153	add    $0x20,%rcx
1154	cmp    $0x20,%r8
1155	jge    L(movdqa6)
1156	jmp    L(movdqa_epi)
1157
1158	.balign 16
1159L(movdqa7):
	/*
	 * SSE2 copy loop for a source misaligned by 7 bytes (entry 7 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 7, left shift by 9) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1160	sub    $0x20,%r8
1161	movdqa 0x10(%rdx),%xmm3
1162	movdqa 0x20(%rdx),%xmm0
1163	add    $0x20,%rdx
1164
1165	psrldq $0x7,%xmm1
1166	movdqa %xmm3,%xmm2
1167	pslldq $0x9,%xmm3
1168	por    %xmm1,%xmm3
1169
1170	psrldq $0x7,%xmm2
1171	movdqa %xmm0,%xmm1
1172	pslldq $0x9,%xmm0
1173	por    %xmm2,%xmm0
1174	movdqa %xmm3,(%rcx)
1175	movdqa %xmm0,0x10(%rcx)
1176
1177	add    $0x20,%rcx
1178	cmp    $0x20,%r8
1179	jge    L(movdqa7)
1180	jmp    L(movdqa_epi)
1181
1182	.balign 16
1183L(movdqa8):
	/*
	 * Copy loop for a source misaligned by exactly 8 bytes (entry 8 of
	 * both L(SSE_src) and L(SSSE3_src)).  With an 8-byte offset each
	 * output chunk is the high quadword of one aligned source chunk
	 * followed by the low quadword of the next, which shufpd $0x1 builds
	 * directly.  Each pass handles three chunks (0x30 bytes); %xmm1
	 * carries the last loaded chunk into the next pass.  The cmp result
	 * is consumed by the jge at the bottom; the SSE instructions and lea
	 * in between do not modify the flags.  Loops while at least 0x30
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1184	movdqa 0x10(%rdx),%xmm3
1185	sub    $0x30,%r8
1186	movdqa 0x20(%rdx),%xmm0
1187	movdqa 0x30(%rdx),%xmm5
1188	lea    0x30(%rdx),%rdx
1189
1190	shufpd $0x1,%xmm3,%xmm1		# xmm1 = { xmm1.high, xmm3.low }
1191	movdqa %xmm1,(%rcx)
1192
1193	cmp    $0x30,%r8
1194
1195	shufpd $0x1,%xmm0,%xmm3		# xmm3 = { xmm3.high, xmm0.low }
1196	movdqa %xmm3,0x10(%rcx)
1197
1198	movdqa %xmm5,%xmm1		# keep the newest chunk for the next pass
1199	shufpd $0x1,%xmm5,%xmm0		# xmm0 = { xmm0.high, xmm5.low }
1200	movdqa %xmm0,0x20(%rcx)
1201
1202	lea    0x30(%rcx),%rcx
1203
1204	jge    L(movdqa8)
1205	jmp    L(movdqa_epi)
1206
1207	.balign 16
1208L(movdqa9):
	/*
	 * SSE2 copy loop for a source misaligned by 9 bytes (entry 9 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 9, left shift by 7) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1209	sub    $0x20,%r8
1210	movdqa 0x10(%rdx),%xmm3
1211	movdqa 0x20(%rdx),%xmm0
1212	add    $0x20,%rdx
1213
1214	psrldq $0x9,%xmm1
1215	movdqa %xmm3,%xmm2
1216	pslldq $0x7,%xmm3
1217	por    %xmm1,%xmm3
1218
1219	psrldq $0x9,%xmm2
1220	movdqa %xmm0,%xmm1
1221	pslldq $0x7,%xmm0
1222	por    %xmm2,%xmm0
1223	movdqa %xmm3,(%rcx)
1224	movdqa %xmm0,0x10(%rcx)
1225
1226	add    $0x20,%rcx
1227	cmp    $0x20,%r8
1228	jge    L(movdqa9)
1229	jmp    L(movdqa_epi)
1230
1231	.balign 16
1232L(movdqa10):
	/*
	 * SSE2 copy loop for a source misaligned by 10 bytes (entry 10 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 10, left shift by 6) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1233	sub    $0x20,%r8
1234	movdqa 0x10(%rdx),%xmm3
1235	movdqa 0x20(%rdx),%xmm0
1236	add    $0x20,%rdx
1237
1238	psrldq $0xa,%xmm1
1239	movdqa %xmm3,%xmm2
1240	pslldq $0x6,%xmm3
1241	por    %xmm1,%xmm3
1242
1243	psrldq $0xa,%xmm2
1244	movdqa %xmm0,%xmm1
1245	pslldq $0x6,%xmm0
1246	por    %xmm2,%xmm0
1247	movdqa %xmm3,(%rcx)
1248	movdqa %xmm0,0x10(%rcx)
1249
1250	add    $0x20,%rcx
1251	cmp    $0x20,%r8
1252	jge    L(movdqa10)
1253	jmp    L(movdqa_epi)
1254
1255	.balign 16
1256L(movdqa11):
	/*
	 * SSE2 copy loop for a source misaligned by 11 bytes (entry 11 of
	 * L(SSE_src)).  %xmm1 carries the previous aligned source chunk;
	 * each pass loads two more aligned chunks, splices each with its
	 * predecessor (right shift by 11, left shift by 5) and stores 32
	 * bytes to the aligned destination.  Loops while at least 0x20
	 * bytes remain in %r8, then jumps to L(movdqa_epi).
	 */
1257	sub    $0x20,%r8
1258	movdqa 0x10(%rdx),%xmm3
1259	movdqa 0x20(%rdx),%xmm0
1260	add    $0x20,%rdx
1261
1262	psrldq $0xb,%xmm1
1263	movdqa %xmm3,%xmm2
1264	pslldq $0x5,%xmm3
1265	por    %xmm1,%xmm3
1266
1267	psrldq $0xb,%xmm2
1268	movdqa %xmm0,%xmm1
1269	pslldq $0x5,%xmm0
1270	por    %xmm2,%xmm0
1271	movdqa %xmm3,(%rcx)
1272	movdqa %xmm0,0x10(%rcx)
1273
1274	add    $0x20,%rcx
1275	cmp    $0x20,%r8
1276	jge    L(movdqa11)
1277	jmp    L(movdqa_epi)
1278
1279	.balign 16
/*
 * L(movdqa12): forward copy loop, 32 bytes per pass.  Every 16-byte output
 * block is bytes 12..15 of the previous aligned source block followed by
 * bytes 0..11 of the next one (psrldq $0xc / pslldq $4 / por).
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; carried across passes and expected to
 *		be preloaded by the dispatching code (not visible here)
 */
1280L(movdqa12):
1281	sub    $0x20,%r8		# account for the 32 bytes copied below
1282	movdqa 0x10(%rdx),%xmm3
1283	movdqa 0x20(%rdx),%xmm0
1284	add    $0x20,%rdx
1285
1286	psrldq $0xc,%xmm1		# previous block: bytes 12..15 move to the bottom
1287	movdqa %xmm3,%xmm2		# unshifted copy for the second merge
1288	pslldq $0x4,%xmm3		# new block: bytes 0..11 move to the top
1289	por    %xmm1,%xmm3		# first 16 output bytes
1290
1291	psrldq $0xc,%xmm2
1292	movdqa %xmm0,%xmm1		# newest block becomes "previous" for the next pass
1293	pslldq $0x4,%xmm0
1294	por    %xmm2,%xmm0		# second 16 output bytes
1295	movdqa %xmm3,(%rcx)
1296	movdqa %xmm0,0x10(%rcx)
1297
1298	add    $0x20,%rcx
1299	cmp    $0x20,%r8
1300	jge    L(movdqa12)		# signed test: loop while 32 or more bytes remain
1301	jmp    L(movdqa_epi)		# finish the remaining %r8 bytes
1302
1303	.balign 16
/*
 * L(movdqa13): forward copy loop, 32 bytes per pass.  Every 16-byte output
 * block is bytes 13..15 of the previous aligned source block followed by
 * bytes 0..12 of the next one (psrldq $0xd / pslldq $3 / por).
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; carried across passes and expected to
 *		be preloaded by the dispatching code (not visible here)
 */
1304L(movdqa13):
1305	sub    $0x20,%r8		# account for the 32 bytes copied below
1306	movdqa 0x10(%rdx),%xmm3
1307	movdqa 0x20(%rdx),%xmm0
1308	add    $0x20,%rdx
1309
1310	psrldq $0xd,%xmm1		# previous block: bytes 13..15 move to the bottom
1311	movdqa %xmm3,%xmm2		# unshifted copy for the second merge
1312	pslldq $0x3,%xmm3		# new block: bytes 0..12 move to the top
1313	por    %xmm1,%xmm3		# first 16 output bytes
1314
1315	psrldq $0xd,%xmm2
1316	movdqa %xmm0,%xmm1		# newest block becomes "previous" for the next pass
1317	pslldq $0x3,%xmm0
1318	por    %xmm2,%xmm0		# second 16 output bytes
1319	movdqa %xmm3,(%rcx)
1320	movdqa %xmm0,0x10(%rcx)
1321
1322	add    $0x20,%rcx
1323	cmp    $0x20,%r8
1324	jge    L(movdqa13)		# signed test: loop while 32 or more bytes remain
1325	jmp    L(movdqa_epi)		# finish the remaining %r8 bytes
1326
1327	.balign 16
/*
 * L(movdqa14): forward copy loop, 32 bytes per pass.  Every 16-byte output
 * block is bytes 14..15 of the previous aligned source block followed by
 * bytes 0..13 of the next one (psrldq $0xe / pslldq $2 / por).
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; carried across passes and expected to
 *		be preloaded by the dispatching code (not visible here)
 */
1328L(movdqa14):
1329	sub    $0x20,%r8		# account for the 32 bytes copied below
1330	movdqa 0x10(%rdx),%xmm3
1331	movdqa 0x20(%rdx),%xmm0
1332	add    $0x20,%rdx
1333
1334	psrldq $0xe,%xmm1		# previous block: bytes 14..15 move to the bottom
1335	movdqa %xmm3,%xmm2		# unshifted copy for the second merge
1336	pslldq $0x2,%xmm3		# new block: bytes 0..13 move to the top
1337	por    %xmm1,%xmm3		# first 16 output bytes
1338
1339	psrldq $0xe,%xmm2
1340	movdqa %xmm0,%xmm1		# newest block becomes "previous" for the next pass
1341	pslldq $0x2,%xmm0
1342	por    %xmm2,%xmm0		# second 16 output bytes
1343	movdqa %xmm3,(%rcx)
1344	movdqa %xmm0,0x10(%rcx)
1345
1346	add    $0x20,%rcx
1347	cmp    $0x20,%r8
1348	jge    L(movdqa14)		# signed test: loop while 32 or more bytes remain
1349	jmp    L(movdqa_epi)		# finish the remaining %r8 bytes
1350
1351	.balign 16
/*
 * L(movdqa15): forward copy loop, 32 bytes per pass.  Every 16-byte output
 * block is byte 15 of the previous aligned source block followed by
 * bytes 0..14 of the next one (psrldq $0xf / pslldq $1 / por).
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; carried across passes and expected to
 *		be preloaded by the dispatching code (not visible here)
 *
 * Unlike its siblings this loop has no closing jmp: the code that follows
 * in the file is L(movdqa_epi), so execution simply falls through.
 */
1352L(movdqa15):
1353	sub    $0x20,%r8		# account for the 32 bytes copied below
1354	movdqa 0x10(%rdx),%xmm3
1355	movdqa 0x20(%rdx),%xmm0
1356	add    $0x20,%rdx
1357
1358	psrldq $0xf,%xmm1		# previous block: byte 15 moves to the bottom
1359	movdqa %xmm3,%xmm2		# unshifted copy for the second merge
1360	pslldq $0x1,%xmm3		# new block: bytes 0..14 move to the top
1361	por    %xmm1,%xmm3		# first 16 output bytes
1362
1363	psrldq $0xf,%xmm2
1364	movdqa %xmm0,%xmm1		# newest block becomes "previous" for the next pass
1365	pslldq $0x1,%xmm0
1366	por    %xmm2,%xmm0		# second 16 output bytes
1367	movdqa %xmm3,(%rcx)
1368	movdqa %xmm0,0x10(%rcx)
1369
1370	add    $0x20,%rcx
1371	cmp    $0x20,%r8
1372	jge    L(movdqa15)		# signed test: loop while 32 or more bytes remain
1373	#jmp   L(movdqa_epi)		(not needed: falls through into L(movdqa_epi))
1374
1375	.balign 16
/*
 * L(movdqa_epi): common exit of the L(movdqaN) / L(mov3dqaN) loops.
 * Re-adds %r11 to the source pointer, moves both pointers forward by the
 * %r8 bytes that are still outstanding and jumps through the L(fwdPxQx)
 * table, indexed by %r8, to the code that copies those bytes.
 *
 * NOTE(review): %r11 is set up outside this part of the file; presumably
 * it is the amount by which %rdx was rounded down to a 16-byte boundary.
 * The table itself is also defined elsewhere; the load below treats each
 * entry as a signed 32-bit offset from the start of the table.
 */
1376L(movdqa_epi):
1377	lea    L(fwdPxQx)(%rip),%r10	# table base (RIP-relative)
1378	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
1379	add    %r8,%rcx			# both pointers now sit past the
1380	add    %r8,%rdx			# bytes that remain to be copied
1381
1382	movslq (%r10,%r8,4),%r9		# sign-extended 32-bit entry number %r8
1383	lea    (%r9,%r10,1),%r10	# entry + table base = target address
1384	jmpq   *%r10
1385
1386	.balign 16
/*
 * L(mov3dqa1): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $1: bytes 1..15 of the previous aligned
 * source block followed by byte 0 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values; the intended instruction is kept
 * in the comment line above each pair.  66 0f 3a 0f is the opcode, the
 * next byte is the ModRM naming the two xmm registers and the last byte is
 * the shift count.  After the loop up to two more 16-byte blocks are
 * copied, then L(movdqa_epi) finishes whatever %r8 still holds.
 */
1387L(mov3dqa1):
1388	movdqa	0x10(%rdx),%xmm3 # load the next aligned source block
1389	sub	$0x30,%r8	 # account for the 48 bytes copied below
1390	movdqa	0x20(%rdx),%xmm0 # load the block after that
1391	movdqa	0x30(%rdx),%xmm5 # and a third one
1392	lea	0x30(%rdx),%rdx
1393	cmp	$0x30,%r8	 # consumed by the jge at the bottom of the loop
1394
1395	movdqa	%xmm3,%xmm2       # keep the unshifted block for the next merge
1396	#palignr	$0x1,%xmm1,%xmm3
1397	.byte	0x66,0x0f,0x3a,0x0f
1398	.byte	0xd9,0x01	  # ModRM: xmm3 <- xmm1, shift 1
1399	movdqa	%xmm3,(%rcx)      # store it
1400
1401	movdqa	%xmm0,%xmm4       # keep the unshifted block for the next merge
1402	#palignr	$0x1,%xmm2,%xmm0
1403	.byte	0x66,0x0f,0x3a,0x0f
1404	.byte	0xc2,0x01	  # ModRM: xmm0 <- xmm2, shift 1
1405	movdqa	%xmm0,0x10(%rcx)  # store it
1406
1407	movdqa	%xmm5,%xmm1       # becomes the previous block of the next pass
1408	#palignr	$0x1,%xmm4,%xmm5
1409	.byte	0x66,0x0f,0x3a,0x0f
1410	.byte	0xec,0x01	  # ModRM: xmm5 <- xmm4, shift 1
1411	movdqa	%xmm5,0x20(%rcx)  # store it
1412
1413	lea	0x30(%rcx),%rcx
1414	jge	L(mov3dqa1)	  # signed test: loop while 48 or more bytes remain
1415
1416	cmp	$0x10,%r8	  # fewer than 16 bytes left: go to the epilogue
1417	jl	L(movdqa_epi)
1418	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1419	sub	$0x10,%r8
1420	lea	0x10(%rdx),%rdx
1421	movdqa	%xmm3,%xmm2		# save for use next concat
1422	#palignr	$0x1,%xmm1,%xmm3
1423	.byte	0x66,0x0f,0x3a,0x0f
1424	.byte	0xd9,0x01
1425
1426	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1427	movdqa	%xmm3,(%rcx)      	# store it
1428	lea	0x10(%rcx),%rcx
1429	jl	L(movdqa_epi)
1430
1431	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1432	sub	$0x10,%r8
1433	lea	0x10(%rdx),%rdx
1434	#palignr	$0x1,%xmm2,%xmm0
1435	.byte	0x66,0x0f,0x3a,0x0f
1436	.byte	0xc2,0x01
1437	movdqa	%xmm0,(%rcx)      	# store it
1438	lea	0x10(%rcx),%rcx
1439	jmp	L(movdqa_epi)
1440
1441	.balign 16
/*
 * L(mov3dqa2): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $2: bytes 2..15 of the previous aligned
 * source block followed by bytes 0..1 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1442L(mov3dqa2):
1443	movdqa	0x10(%rdx),%xmm3
1444	sub	$0x30,%r8		# account for the 48 bytes copied below
1445	movdqa	0x20(%rdx),%xmm0
1446	movdqa	0x30(%rdx),%xmm5
1447	lea	0x30(%rdx),%rdx
1448	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1449
1450	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1451	#palignr	$0x2,%xmm1,%xmm3
1452	.byte	0x66,0x0f,0x3a,0x0f
1453	.byte	0xd9,0x02
1454	movdqa	%xmm3,(%rcx)
1455
1456	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1457	#palignr	$0x2,%xmm2,%xmm0
1458	.byte	0x66,0x0f,0x3a,0x0f
1459	.byte	0xc2,0x02
1460	movdqa	%xmm0,0x10(%rcx)
1461
1462	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1463	#palignr	$0x2,%xmm4,%xmm5
1464	.byte	0x66,0x0f,0x3a,0x0f
1465	.byte	0xec,0x02
1466	movdqa	%xmm5,0x20(%rcx)
1467
1468	lea	0x30(%rcx),%rcx
1469	jge	L(mov3dqa2)		# signed test: loop while 48 or more bytes remain
1470
1471	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1472	jl	L(movdqa_epi)
1473	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1474	sub	$0x10,%r8
1475	lea	0x10(%rdx),%rdx
1476	movdqa	%xmm3,%xmm2		# save for use next concat
1477	#palignr	$0x2,%xmm1,%xmm3
1478	.byte	0x66,0x0f,0x3a,0x0f
1479	.byte	0xd9,0x02
1480
1481	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1482	movdqa	%xmm3,(%rcx)      	# store it
1483	lea	0x10(%rcx),%rcx
1484	jl	L(movdqa_epi)
1485
1486	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1487	sub	$0x10,%r8
1488	lea	0x10(%rdx),%rdx
1489	#palignr	$0x2,%xmm2,%xmm0
1490	.byte	0x66,0x0f,0x3a,0x0f
1491	.byte	0xc2,0x02
1492	movdqa	%xmm0,(%rcx)      	# store it
1493	lea	0x10(%rcx),%rcx
1494	jmp	L(movdqa_epi)
1495
1496	.balign 16
/*
 * L(mov3dqa3): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $3: bytes 3..15 of the previous aligned
 * source block followed by bytes 0..2 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1497L(mov3dqa3):
1498	movdqa	0x10(%rdx),%xmm3
1499	sub	$0x30,%r8		# account for the 48 bytes copied below
1500	movdqa	0x20(%rdx),%xmm0
1501	movdqa	0x30(%rdx),%xmm5
1502	lea	0x30(%rdx),%rdx
1503	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1504
1505	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1506	#palignr	$0x3,%xmm1,%xmm3
1507	.byte	0x66,0x0f,0x3a,0x0f
1508	.byte	0xd9,0x03
1509	movdqa	%xmm3,(%rcx)
1510
1511	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1512	#palignr	$0x3,%xmm2,%xmm0
1513	.byte	0x66,0x0f,0x3a,0x0f
1514	.byte	0xc2,0x03
1515	movdqa	%xmm0,0x10(%rcx)
1516
1517	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1518	#palignr	$0x3,%xmm4,%xmm5
1519	.byte	0x66,0x0f,0x3a,0x0f
1520	.byte	0xec,0x03
1521	movdqa	%xmm5,0x20(%rcx)
1522
1523	lea	0x30(%rcx),%rcx
1524	jge	L(mov3dqa3)		# signed test: loop while 48 or more bytes remain
1525
1526	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1527	jl	L(movdqa_epi)
1528	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1529	sub	$0x10,%r8
1530	lea	0x10(%rdx),%rdx
1531	movdqa	%xmm3,%xmm2		# save for use next concat
1532	#palignr	$0x3,%xmm1,%xmm3
1533	.byte	0x66,0x0f,0x3a,0x0f
1534	.byte	0xd9,0x03
1535
1536	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1537	movdqa	%xmm3,(%rcx)      	# store it
1538	lea	0x10(%rcx),%rcx
1539	jl	L(movdqa_epi)
1540
1541	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1542	sub	$0x10,%r8
1543	lea	0x10(%rdx),%rdx
1544	#palignr	$0x3,%xmm2,%xmm0
1545	.byte	0x66,0x0f,0x3a,0x0f
1546	.byte	0xc2,0x03
1547	movdqa	%xmm0,(%rcx)      	# store it
1548	lea	0x10(%rcx),%rcx
1549	jmp	L(movdqa_epi)
1550
1551	.balign 16
/*
 * L(mov3dqa4): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $4: bytes 4..15 of the previous aligned
 * source block followed by bytes 0..3 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1552L(mov3dqa4):
1553	movdqa	0x10(%rdx),%xmm3
1554	sub	$0x30,%r8		# account for the 48 bytes copied below
1555	movdqa	0x20(%rdx),%xmm0
1556	movdqa	0x30(%rdx),%xmm5
1557	lea	0x30(%rdx),%rdx
1558	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1559
1560	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1561	#palignr	$0x4,%xmm1,%xmm3
1562	.byte	0x66,0x0f,0x3a,0x0f
1563	.byte	0xd9,0x04
1564	movdqa	%xmm3,(%rcx)
1565
1566	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1567	#palignr	$0x4,%xmm2,%xmm0
1568	.byte	0x66,0x0f,0x3a,0x0f
1569	.byte	0xc2,0x04
1570	movdqa	%xmm0,0x10(%rcx)
1571
1572	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1573	#palignr	$0x4,%xmm4,%xmm5
1574	.byte	0x66,0x0f,0x3a,0x0f
1575	.byte	0xec,0x04
1576	movdqa	%xmm5,0x20(%rcx)
1577
1578	lea	0x30(%rcx),%rcx
1579	jge	L(mov3dqa4)		# signed test: loop while 48 or more bytes remain
1580
1581	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1582	jl	L(movdqa_epi)
1583	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1584	sub	$0x10,%r8
1585	lea	0x10(%rdx),%rdx
1586	movdqa	%xmm3,%xmm2		# save for use next concat
1587	#palignr	$0x4,%xmm1,%xmm3
1588	.byte	0x66,0x0f,0x3a,0x0f
1589	.byte	0xd9,0x04
1590
1591	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1592	movdqa	%xmm3,(%rcx)      	# store it
1593	lea	0x10(%rcx),%rcx
1594	jl	L(movdqa_epi)
1595
1596	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1597	sub	$0x10,%r8
1598	lea	0x10(%rdx),%rdx
1599	#palignr	$0x4,%xmm2,%xmm0
1600	.byte	0x66,0x0f,0x3a,0x0f
1601	.byte	0xc2,0x04
1602	movdqa	%xmm0,(%rcx)      	# store it
1603	lea	0x10(%rcx),%rcx
1604	jmp	L(movdqa_epi)
1605
1606	.balign 16
/*
 * L(mov3dqa5): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $5: bytes 5..15 of the previous aligned
 * source block followed by bytes 0..4 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1607L(mov3dqa5):
1608	movdqa	0x10(%rdx),%xmm3
1609	sub	$0x30,%r8		# account for the 48 bytes copied below
1610	movdqa	0x20(%rdx),%xmm0
1611	movdqa	0x30(%rdx),%xmm5
1612	lea	0x30(%rdx),%rdx
1613	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1614
1615	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1616	#palignr	$0x5,%xmm1,%xmm3
1617	.byte	0x66,0x0f,0x3a,0x0f
1618	.byte	0xd9,0x05
1619	movdqa	%xmm3,(%rcx)
1620
1621	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1622	#palignr	$0x5,%xmm2,%xmm0
1623	.byte	0x66,0x0f,0x3a,0x0f
1624	.byte	0xc2,0x05
1625	movdqa	%xmm0,0x10(%rcx)
1626
1627	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1628	#palignr	$0x5,%xmm4,%xmm5
1629	.byte	0x66,0x0f,0x3a,0x0f
1630	.byte	0xec,0x05
1631	movdqa	%xmm5,0x20(%rcx)
1632
1633	lea	0x30(%rcx),%rcx
1634	jge	L(mov3dqa5)		# signed test: loop while 48 or more bytes remain
1635
1636	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1637	jl	L(movdqa_epi)
1638	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1639	sub	$0x10,%r8
1640	lea	0x10(%rdx),%rdx
1641	movdqa	%xmm3,%xmm2		# save for use next concat
1642	#palignr	$0x5,%xmm1,%xmm3
1643	.byte	0x66,0x0f,0x3a,0x0f
1644	.byte	0xd9,0x05
1645
1646	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1647	movdqa	%xmm3,(%rcx)      	# store it
1648	lea	0x10(%rcx),%rcx
1649	jl	L(movdqa_epi)
1650
1651	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1652	sub	$0x10,%r8
1653	lea	0x10(%rdx),%rdx
1654	#palignr	$0x5,%xmm2,%xmm0
1655	.byte	0x66,0x0f,0x3a,0x0f
1656	.byte	0xc2,0x05
1657	movdqa	%xmm0,(%rcx)      	# store it
1658	lea	0x10(%rcx),%rcx
1659	jmp	L(movdqa_epi)
1660
1661	.balign 16
/*
 * L(mov3dqa6): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $6: bytes 6..15 of the previous aligned
 * source block followed by bytes 0..5 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1662L(mov3dqa6):
1663	movdqa	0x10(%rdx),%xmm3
1664	sub	$0x30,%r8		# account for the 48 bytes copied below
1665	movdqa	0x20(%rdx),%xmm0
1666	movdqa	0x30(%rdx),%xmm5
1667	lea	0x30(%rdx),%rdx
1668	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1669
1670	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1671	#palignr	$0x6,%xmm1,%xmm3
1672	.byte	0x66,0x0f,0x3a,0x0f
1673	.byte	0xd9,0x06
1674	movdqa	%xmm3,(%rcx)
1675
1676	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1677	#palignr	$0x6,%xmm2,%xmm0
1678	.byte	0x66,0x0f,0x3a,0x0f
1679	.byte	0xc2,0x06
1680	movdqa	%xmm0,0x10(%rcx)
1681
1682	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1683	#palignr	$0x6,%xmm4,%xmm5
1684	.byte	0x66,0x0f,0x3a,0x0f
1685	.byte	0xec,0x06
1686	movdqa	%xmm5,0x20(%rcx)
1687
1688	lea	0x30(%rcx),%rcx
1689	jge	L(mov3dqa6)		# signed test: loop while 48 or more bytes remain
1690
1691	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1692	jl	L(movdqa_epi)
1693	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1694	sub	$0x10,%r8
1695	lea	0x10(%rdx),%rdx
1696	movdqa	%xmm3,%xmm2		# save for use next concat
1697	#palignr	$0x6,%xmm1,%xmm3
1698	.byte	0x66,0x0f,0x3a,0x0f
1699	.byte	0xd9,0x06
1700
1701	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1702	movdqa	%xmm3,(%rcx)      	# store it
1703	lea	0x10(%rcx),%rcx
1704	jl	L(movdqa_epi)
1705
1706	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1707	sub	$0x10,%r8
1708	lea	0x10(%rdx),%rdx
1709	#palignr	$0x6,%xmm2,%xmm0
1710	.byte	0x66,0x0f,0x3a,0x0f
1711	.byte	0xc2,0x06
1712	movdqa	%xmm0,(%rcx)      	# store it
1713	lea	0x10(%rcx),%rcx
1714	jmp	L(movdqa_epi)
1715
1716	.balign 16
/*
 * L(mov3dqa7): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $7: bytes 7..15 of the previous aligned
 * source block followed by bytes 0..6 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1717L(mov3dqa7):
1718	movdqa	0x10(%rdx),%xmm3
1719	sub	$0x30,%r8		# account for the 48 bytes copied below
1720	movdqa	0x20(%rdx),%xmm0
1721	movdqa	0x30(%rdx),%xmm5
1722	lea	0x30(%rdx),%rdx
1723	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1724
1725	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1726	#palignr	$0x7,%xmm1,%xmm3
1727	.byte	0x66,0x0f,0x3a,0x0f
1728	.byte	0xd9,0x07
1729	movdqa	%xmm3,(%rcx)
1730
1731	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1732	#palignr	$0x7,%xmm2,%xmm0
1733	.byte	0x66,0x0f,0x3a,0x0f
1734	.byte	0xc2,0x07
1735	movdqa	%xmm0,0x10(%rcx)
1736
1737	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1738	#palignr	$0x7,%xmm4,%xmm5
1739	.byte	0x66,0x0f,0x3a,0x0f
1740	.byte	0xec,0x07
1741	movdqa	%xmm5,0x20(%rcx)
1742
1743	lea	0x30(%rcx),%rcx
1744	jge	L(mov3dqa7)		# signed test: loop while 48 or more bytes remain
1745
1746	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1747	jl	L(movdqa_epi)
1748	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1749	sub	$0x10,%r8
1750	lea	0x10(%rdx),%rdx
1751	movdqa	%xmm3,%xmm2		# save for use next concat
1752	#palignr	$0x7,%xmm1,%xmm3
1753	.byte	0x66,0x0f,0x3a,0x0f
1754	.byte	0xd9,0x07
1755
1756	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1757	movdqa	%xmm3,(%rcx)      	# store it
1758	lea	0x10(%rcx),%rcx
1759	jl	L(movdqa_epi)
1760
1761	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1762	sub	$0x10,%r8
1763	lea	0x10(%rdx),%rdx
1764	#palignr	$0x7,%xmm2,%xmm0
1765	.byte	0x66,0x0f,0x3a,0x0f
1766	.byte	0xc2,0x07
1767	movdqa	%xmm0,(%rcx)      	# store it
1768	lea	0x10(%rcx),%rcx
1769	jmp	L(movdqa_epi)
1770
1771	.balign 16
/*
 * L(mov3dqa9): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $9: bytes 9..15 of the previous aligned
 * source block followed by bytes 0..8 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1772L(mov3dqa9):
1773	movdqa	0x10(%rdx),%xmm3
1774	sub	$0x30,%r8		# account for the 48 bytes copied below
1775	movdqa	0x20(%rdx),%xmm0
1776	movdqa	0x30(%rdx),%xmm5
1777	lea	0x30(%rdx),%rdx
1778	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1779
1780	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1781	#palignr	$0x9,%xmm1,%xmm3
1782	.byte	0x66,0x0f,0x3a,0x0f
1783	.byte	0xd9,0x09
1784	movdqa	%xmm3,(%rcx)
1785
1786	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1787	#palignr	$0x9,%xmm2,%xmm0
1788	.byte	0x66,0x0f,0x3a,0x0f
1789	.byte	0xc2,0x09
1790	movdqa	%xmm0,0x10(%rcx)
1791
1792	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1793	#palignr	$0x9,%xmm4,%xmm5
1794	.byte	0x66,0x0f,0x3a,0x0f
1795	.byte	0xec,0x09
1796	movdqa	%xmm5,0x20(%rcx)
1797
1798	lea	0x30(%rcx),%rcx
1799	jge	L(mov3dqa9)		# signed test: loop while 48 or more bytes remain
1800
1801	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1802	jl	L(movdqa_epi)
1803	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1804	sub	$0x10,%r8
1805	lea	0x10(%rdx),%rdx
1806	movdqa	%xmm3,%xmm2		# save for use next concat
1807	#palignr	$0x9,%xmm1,%xmm3
1808	.byte	0x66,0x0f,0x3a,0x0f
1809	.byte	0xd9,0x09
1810
1811	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1812	movdqa	%xmm3,(%rcx)      	# store it
1813	lea	0x10(%rcx),%rcx
1814	jl	L(movdqa_epi)
1815
1816	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1817	sub	$0x10,%r8
1818	lea	0x10(%rdx),%rdx
1819	#palignr	$0x9,%xmm2,%xmm0
1820	.byte	0x66,0x0f,0x3a,0x0f
1821	.byte	0xc2,0x09
1822	movdqa	%xmm0,(%rcx)      	# store it
1823	lea	0x10(%rcx),%rcx
1824	jmp	L(movdqa_epi)
1825
1826	.balign 16
/*
 * L(mov3dqa10): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $0xa: bytes 10..15 of the previous
 * aligned source block followed by bytes 0..9 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1827L(mov3dqa10):
1828	movdqa	0x10(%rdx),%xmm3
1829	sub	$0x30,%r8		# account for the 48 bytes copied below
1830	movdqa	0x20(%rdx),%xmm0
1831	movdqa	0x30(%rdx),%xmm5
1832	lea	0x30(%rdx),%rdx
1833	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1834
1835	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1836	#palignr	$0xa,%xmm1,%xmm3
1837	.byte	0x66,0x0f,0x3a,0x0f
1838	.byte	0xd9,0x0a
1839	movdqa	%xmm3,(%rcx)
1840
1841	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1842	#palignr	$0xa,%xmm2,%xmm0
1843	.byte	0x66,0x0f,0x3a,0x0f
1844	.byte	0xc2,0x0a
1845	movdqa	%xmm0,0x10(%rcx)
1846
1847	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1848	#palignr	$0xa,%xmm4,%xmm5
1849	.byte	0x66,0x0f,0x3a,0x0f
1850	.byte	0xec,0x0a
1851	movdqa	%xmm5,0x20(%rcx)
1852
1853	lea	0x30(%rcx),%rcx
1854	jge	L(mov3dqa10)		# signed test: loop while 48 or more bytes remain
1855
1856	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1857	jl	L(movdqa_epi)
1858	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1859	sub	$0x10,%r8
1860	lea	0x10(%rdx),%rdx
1861	movdqa	%xmm3,%xmm2		# save for use next concat
1862	#palignr	$0xa,%xmm1,%xmm3
1863	.byte	0x66,0x0f,0x3a,0x0f
1864	.byte	0xd9,0x0a
1865
1866	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1867	movdqa	%xmm3,(%rcx)      	# store it
1868	lea	0x10(%rcx),%rcx
1869	jl	L(movdqa_epi)
1870
1871	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1872	sub	$0x10,%r8
1873	lea	0x10(%rdx),%rdx
1874	#palignr	$0xa,%xmm2,%xmm0
1875	.byte	0x66,0x0f,0x3a,0x0f
1876	.byte	0xc2,0x0a
1877	movdqa	%xmm0,(%rcx)      	# store it
1878	lea	0x10(%rcx),%rcx
1879	jmp	L(movdqa_epi)
1880
1881	.balign 16
/*
 * L(mov3dqa11): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $0xb: bytes 11..15 of the previous
 * aligned source block followed by bytes 0..10 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1882L(mov3dqa11):
1883	movdqa	0x10(%rdx),%xmm3
1884	sub	$0x30,%r8		# account for the 48 bytes copied below
1885	movdqa	0x20(%rdx),%xmm0
1886	movdqa	0x30(%rdx),%xmm5
1887	lea	0x30(%rdx),%rdx
1888	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1889
1890	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1891	#palignr	$0xb,%xmm1,%xmm3
1892	.byte	0x66,0x0f,0x3a,0x0f
1893	.byte	0xd9,0x0b
1894	movdqa	%xmm3,(%rcx)
1895
1896	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1897	#palignr	$0xb,%xmm2,%xmm0
1898	.byte	0x66,0x0f,0x3a,0x0f
1899	.byte	0xc2,0x0b
1900	movdqa	%xmm0,0x10(%rcx)
1901
1902	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1903	#palignr	$0xb,%xmm4,%xmm5
1904	.byte	0x66,0x0f,0x3a,0x0f
1905	.byte	0xec,0x0b
1906	movdqa	%xmm5,0x20(%rcx)
1907
1908	lea	0x30(%rcx),%rcx
1909	jge	L(mov3dqa11)		# signed test: loop while 48 or more bytes remain
1910
1911	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1912	jl	L(movdqa_epi)
1913	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1914	sub	$0x10,%r8
1915	lea	0x10(%rdx),%rdx
1916	movdqa	%xmm3,%xmm2		# save for use next concat
1917	#palignr	$0xb,%xmm1,%xmm3
1918	.byte	0x66,0x0f,0x3a,0x0f
1919	.byte	0xd9,0x0b
1920
1921	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1922	movdqa	%xmm3,(%rcx)      	# store it
1923	lea	0x10(%rcx),%rcx
1924	jl	L(movdqa_epi)
1925
1926	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1927	sub	$0x10,%r8
1928	lea	0x10(%rdx),%rdx
1929	#palignr	$0xb,%xmm2,%xmm0
1930	.byte	0x66,0x0f,0x3a,0x0f
1931	.byte	0xc2,0x0b
1932	movdqa	%xmm0,(%rcx)      	# store it
1933	lea	0x10(%rcx),%rcx
1934	jmp	L(movdqa_epi)
1935
1936	.balign 16
/*
 * L(mov3dqa12): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $0xc: bytes 12..15 of the previous
 * aligned source block followed by bytes 0..11 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1937L(mov3dqa12):
1938	movdqa	0x10(%rdx),%xmm3
1939	sub	$0x30,%r8		# account for the 48 bytes copied below
1940	movdqa	0x20(%rdx),%xmm0
1941	movdqa	0x30(%rdx),%xmm5
1942	lea	0x30(%rdx),%rdx
1943	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1944
1945	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
1946	#palignr	$0xc,%xmm1,%xmm3
1947	.byte	0x66,0x0f,0x3a,0x0f
1948	.byte	0xd9,0x0c
1949	movdqa	%xmm3,(%rcx)
1950
1951	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
1952	#palignr	$0xc,%xmm2,%xmm0
1953	.byte	0x66,0x0f,0x3a,0x0f
1954	.byte	0xc2,0x0c
1955	movdqa	%xmm0,0x10(%rcx)
1956
1957	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
1958	#palignr	$0xc,%xmm4,%xmm5
1959	.byte	0x66,0x0f,0x3a,0x0f
1960	.byte	0xec,0x0c
1961	movdqa	%xmm5,0x20(%rcx)
1962
1963	lea	0x30(%rcx),%rcx
1964	jge	L(mov3dqa12)		# signed test: loop while 48 or more bytes remain
1965
1966	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
1967	jl	L(movdqa_epi)
1968	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
1969	sub	$0x10,%r8
1970	lea	0x10(%rdx),%rdx
1971	movdqa	%xmm3,%xmm2		# save for use next concat
1972	#palignr	$0xc,%xmm1,%xmm3
1973	.byte	0x66,0x0f,0x3a,0x0f
1974	.byte	0xd9,0x0c
1975
1976	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
1977	movdqa	%xmm3,(%rcx)      	# store it
1978	lea	0x10(%rcx),%rcx
1979	jl	L(movdqa_epi)
1980
1981	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
1982	sub	$0x10,%r8
1983	lea	0x10(%rdx),%rdx
1984	#palignr	$0xc,%xmm2,%xmm0
1985	.byte	0x66,0x0f,0x3a,0x0f
1986	.byte	0xc2,0x0c
1987	movdqa	%xmm0,(%rcx)      	# store it
1988	lea	0x10(%rcx),%rcx
1989	jmp	L(movdqa_epi)
1990
1991	.balign 16
/*
 * L(mov3dqa13): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $0xd: bytes 13..15 of the previous
 * aligned source block followed by bytes 0..12 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
1992L(mov3dqa13):
1993	movdqa	0x10(%rdx),%xmm3
1994	sub	$0x30,%r8		# account for the 48 bytes copied below
1995	movdqa	0x20(%rdx),%xmm0
1996	movdqa	0x30(%rdx),%xmm5
1997	lea	0x30(%rdx),%rdx
1998	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
1999
2000	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
2001	#palignr	$0xd,%xmm1,%xmm3
2002	.byte	0x66,0x0f,0x3a,0x0f
2003	.byte	0xd9,0x0d
2004	movdqa	%xmm3,(%rcx)
2005
2006	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
2007	#palignr	$0xd,%xmm2,%xmm0
2008	.byte	0x66,0x0f,0x3a,0x0f
2009	.byte	0xc2,0x0d
2010	movdqa	%xmm0,0x10(%rcx)
2011
2012	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
2013	#palignr	$0xd,%xmm4,%xmm5
2014	.byte	0x66,0x0f,0x3a,0x0f
2015	.byte	0xec,0x0d
2016	movdqa	%xmm5,0x20(%rcx)
2017
2018	lea	0x30(%rcx),%rcx
2019	jge	L(mov3dqa13)		# signed test: loop while 48 or more bytes remain
2020
2021	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
2022	jl	L(movdqa_epi)
2023	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
2024	sub	$0x10,%r8
2025	lea	0x10(%rdx),%rdx
2026	movdqa	%xmm3,%xmm2		# save for use next concat
2027	#palignr	$0xd,%xmm1,%xmm3
2028	.byte	0x66,0x0f,0x3a,0x0f
2029	.byte	0xd9,0x0d
2030
2031	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
2032	movdqa	%xmm3,(%rcx)      	# store it
2033	lea	0x10(%rcx),%rcx
2034	jl	L(movdqa_epi)
2035
2036	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
2037	sub	$0x10,%r8
2038	lea	0x10(%rdx),%rdx
2039	#palignr	$0xd,%xmm2,%xmm0
2040	.byte	0x66,0x0f,0x3a,0x0f
2041	.byte	0xc2,0x0d
2042	movdqa	%xmm0,(%rcx)      	# store it
2043	lea	0x10(%rcx),%rcx
2044	jmp	L(movdqa_epi)
2045
2046	.balign 16
/*
 * L(mov3dqa14): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $0xe: bytes 14..15 of the previous
 * aligned source block followed by bytes 0..13 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
2047L(mov3dqa14):
2048	movdqa	0x10(%rdx),%xmm3
2049	sub	$0x30,%r8		# account for the 48 bytes copied below
2050	movdqa	0x20(%rdx),%xmm0
2051	movdqa	0x30(%rdx),%xmm5
2052	lea	0x30(%rdx),%rdx
2053	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
2054
2055	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
2056	#palignr	$0xe,%xmm1,%xmm3
2057	.byte	0x66,0x0f,0x3a,0x0f
2058	.byte	0xd9,0x0e
2059	movdqa	%xmm3,(%rcx)
2060
2061	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
2062	#palignr	$0xe,%xmm2,%xmm0
2063	.byte	0x66,0x0f,0x3a,0x0f
2064	.byte	0xc2,0x0e
2065	movdqa	%xmm0,0x10(%rcx)
2066
2067	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
2068	#palignr	$0xe,%xmm4,%xmm5
2069	.byte	0x66,0x0f,0x3a,0x0f
2070	.byte	0xec,0x0e
2071	movdqa	%xmm5,0x20(%rcx)
2072
2073	lea	0x30(%rcx),%rcx
2074	jge	L(mov3dqa14)		# signed test: loop while 48 or more bytes remain
2075
2076	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
2077	jl	L(movdqa_epi)
2078	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
2079	sub	$0x10,%r8
2080	lea	0x10(%rdx),%rdx
2081	movdqa	%xmm3,%xmm2		# save for use next concat
2082	#palignr	$0xe,%xmm1,%xmm3
2083	.byte	0x66,0x0f,0x3a,0x0f
2084	.byte	0xd9,0x0e
2085
2086	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
2087	movdqa	%xmm3,(%rcx)      	# store it
2088	lea	0x10(%rcx),%rcx
2089	jl	L(movdqa_epi)
2090
2091	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
2092	sub	$0x10,%r8
2093	lea	0x10(%rdx),%rdx
2094	#palignr	$0xe,%xmm2,%xmm0
2095	.byte	0x66,0x0f,0x3a,0x0f
2096	.byte	0xc2,0x0e
2097	movdqa	%xmm0,(%rcx)      	# store it
2098	lea	0x10(%rcx),%rcx
2099	jmp	L(movdqa_epi)
2100
2101	.balign 16
/*
 * L(mov3dqa15): forward copy, 48 bytes per pass, building every 16-byte
 * destination block with palignr $0xf: byte 15 of the previous aligned
 * source block followed by bytes 0..14 of the next one.
 *
 *	%rdx	source pointer, used only with aligned movdqa loads
 *	%rcx	destination pointer, used only with aligned movdqa stores
 *	%r8	bytes still to copy
 *	%xmm1	previous source block; must be preloaded by the code that
 *		dispatches here (not visible in this part of the file)
 *
 * palignr is written as raw .byte values (opcode 66 0f 3a 0f, ModRM,
 * shift count); the intended instruction is in the comment above each
 * pair.  After the loop up to two more 16-byte blocks are copied, then
 * L(movdqa_epi) finishes whatever %r8 still holds.
 */
2102L(mov3dqa15):
2103	movdqa	0x10(%rdx),%xmm3
2104	sub	$0x30,%r8		# account for the 48 bytes copied below
2105	movdqa	0x20(%rdx),%xmm0
2106	movdqa	0x30(%rdx),%xmm5
2107	lea	0x30(%rdx),%rdx
2108	cmp	$0x30,%r8		# consumed by the jge at the bottom of the loop
2109
2110	movdqa	%xmm3,%xmm2		# keep the unshifted block for the next merge
2111	#palignr	$0xf,%xmm1,%xmm3
2112	.byte	0x66,0x0f,0x3a,0x0f
2113	.byte	0xd9,0x0f
2114	movdqa	%xmm3,(%rcx)
2115
2116	movdqa	%xmm0,%xmm4		# keep the unshifted block for the next merge
2117	#palignr	$0xf,%xmm2,%xmm0
2118	.byte	0x66,0x0f,0x3a,0x0f
2119	.byte	0xc2,0x0f
2120	movdqa	%xmm0,0x10(%rcx)
2121
2122	movdqa	%xmm5,%xmm1		# becomes the previous block of the next pass
2123	#palignr	$0xf,%xmm4,%xmm5
2124	.byte	0x66,0x0f,0x3a,0x0f
2125	.byte	0xec,0x0f
2126	movdqa	%xmm5,0x20(%rcx)
2127
2128	lea	0x30(%rcx),%rcx
2129	jge	L(mov3dqa15)		# signed test: loop while 48 or more bytes remain
2130
2131	cmp	$0x10,%r8		# fewer than 16 bytes left: go to the epilogue
2132	jl	L(movdqa_epi)
2133	movdqa	0x10(%rdx),%xmm3	# load the next aligned source block
2134	sub	$0x10,%r8
2135	lea	0x10(%rdx),%rdx
2136	movdqa	%xmm3,%xmm2		# save for use next concat
2137	#palignr	$0xf,%xmm1,%xmm3
2138	.byte	0x66,0x0f,0x3a,0x0f
2139	.byte	0xd9,0x0f
2140
2141	cmp	$0x10,%r8		# the movdqa/lea below keep these flags
2142	movdqa	%xmm3,(%rcx)      	# store it
2143	lea	0x10(%rcx),%rcx
2144	jl	L(movdqa_epi)
2145
2146	movdqa	0x10(%rdx),%xmm0	# load the next aligned source block
2147	sub	$0x10,%r8
2148	lea	0x10(%rdx),%rdx
2149	#palignr	$0xf,%xmm2,%xmm0
2150	.byte	0x66,0x0f,0x3a,0x0f
2151	.byte	0xc2,0x0f
2152	movdqa	%xmm0,(%rcx)      	# store it
2153	lea	0x10(%rcx),%rcx
2154	jmp	L(movdqa_epi)
2155
2156	.balign 16
/*
 * L(sse2_nt_move): forward copy, 64 bytes per pass, using unaligned
 * 16-byte loads (movdqu) and non-temporal 16-byte stores (movntdq).
 *
 *	%rdx	source pointer
 *	%rcx	destination pointer; movntdq needs it 16-byte aligned
 *	%r8	bytes still to copy
 *
 * The pointers and the count are adjusted first, so the body addresses
 * the current 64 bytes with negative offsets.  When fewer than 64 bytes
 * are left, the whole 16-byte blocks among them (0..3) are copied by
 * L(fix16_N), selected through L(Fix16EndTable), and the final 0..15 bytes
 * are left in %r8 for that code to dispatch.
 *
 * NOTE(review): nothing here checks %r8 on entry; the code reaching this
 * label is presumably expected to arrive with at least 64 bytes to copy.
 */
2157L(sse2_nt_move):
2158	lea	0x40(%rcx),%rcx
2159	lea	0x40(%rdx),%rdx
2160	lea	-0x40(%r8),%r8
2161
2162	/*
2163	 * doesn't matter if source is aligned for stuff out of cache.
2164	 * the mis-aligned penalty is masked by the slowness of main memory.
2165	 */
2166	prefetchnta 0x180(%rdx)		# non-temporal prefetch well ahead of the loads
2167	movdqu	-0x40(%rdx),%xmm0
2168	movdqu	-0x30(%rdx),%xmm1
2169
2170	cmp	$0x40,%r8		# consumed by the jge below; only SSE
2171	movntdq	%xmm0,-0x40(%rcx)	# moves sit in between
2172	movntdq	%xmm1,-0x30(%rcx)
2173
2174	movdqu	-0x20(%rdx),%xmm2
2175	movdqu	-0x10(%rdx),%xmm3
2176
2177	movntdq	%xmm2,-0x20(%rcx)
2178	movntdq	%xmm3,-0x10(%rcx)
2179
2180	jge	L(sse2_nt_move)		# signed test: loop while 64 or more bytes remain
2181
2182	lea	L(Fix16EndTable)(%rip),%r10
2183	mov	%r8,%r9
2184	and	$0xFFFFFFFFFFFFFFF0,%r9	# %r9 = bytes held in whole 16-byte blocks
2185	add	%r9,%rcx		# step both pointers past those blocks
2186	add	%r9,%rdx
2187	sub	%r9,%r8			# %r8 = leftover after the 16-byte blocks
2188	shr	$0x4,%r9		# %r9 = number of 16-byte blocks (table index)
2189	sfence				# order the non-temporal stores issued above
2190
2191	movslq	(%r10,%r9,4),%r11	# signed 32-bit offset from the table base
2192	lea	(%r11,%r10,1),%r10
2193	jmpq	*%r10
2194
2195	.balign 16
/*
 * L(Fix16EndTable): jump table used at the end of L(sse2_nt_move).
 * Entry k is the 32-bit distance from the start of this table to
 * L(fix16_k), the code that copies k trailing 16-byte blocks.
 */
2196L(Fix16EndTable):
2197	.int    L(fix16_0)-L(Fix16EndTable)
2198	.int    L(fix16_1)-L(Fix16EndTable)
2199	.int    L(fix16_2)-L(Fix16EndTable)
2200	.int    L(fix16_3)-L(Fix16EndTable)
2201
2201
2202	.balign 16
/*
 * L(fix16_3) .. L(fix16_0): fall-through ladder reached through
 * L(Fix16EndTable).  Entering at L(fix16_k) copies the k 16-byte blocks
 * that end at %rdx / %rcx (unaligned load, aligned store), then
 * L(fix16_0) steps both pointers forward by the %r8 leftover bytes and
 * jumps through the L(fwdPxQx) table, indexed by %r8, to copy them.
 * L(fwdPxQx) is defined outside this part of the file.
 */
2203L(fix16_3):
2204	movdqu -0x30(%rdx),%xmm1
2205	movdqa %xmm1,-0x30(%rcx)
2206L(fix16_2):
2207	movdqu -0x20(%rdx),%xmm2
2208	movdqa %xmm2,-0x20(%rcx)
2209L(fix16_1):
2210	movdqu -0x10(%rdx),%xmm3
2211	movdqa %xmm3,-0x10(%rcx)
2212L(fix16_0):
2213	lea    L(fwdPxQx)(%rip),%r10	# table base (RIP-relative)
2214	add    %r8,%rdx			# pointers now sit past the leftover bytes
2215	add    %r8,%rcx
2216
2217	movslq (%r10,%r8,4),%r9		# signed 32-bit offset from the table base
2218	lea    (%r9,%r10,1),%r10
2219	jmpq   *%r10
2220
2221	.balign 16
/*
 * L(pre_both_aligned): guard in front of L(both_aligned).  With fewer than
 * 128 bytes in %r8 (signed compare) the 128-byte loop is skipped and
 * L(fix_16b) handles everything; otherwise execution falls through into
 * L(both_aligned), which follows in the file.
 */
2222L(pre_both_aligned):
2223	cmp    $0x80,%r8
2224	jl     L(fix_16b)
2225
2225
2226	.balign 16
/*
 * L(both_aligned): forward copy, 128 bytes per pass, with aligned 16-byte
 * loads and stores on both sides; movdqa requires %rdx and %rcx to be
 * 16-byte aligned.  %r8 is the byte count still to copy.  On exit the
 * code falls through into L(fix_16b) with fewer than 128 bytes left.
 */
2227L(both_aligned):
2228
2229	/*
2230	 * this 'paired' load/load/store/store seems to do best.
2231	 */
2232	movdqa (%rdx),%xmm0
2233	movdqa 0x10(%rdx),%xmm1
2234
2235	movdqa %xmm0,(%rcx)
2236	movdqa %xmm1,0x10(%rcx)
2237	lea    -0x80(%r8),%r8		# account for the 128 bytes of this pass
2238
2239	movdqa 0x20(%rdx),%xmm2
2240	movdqa 0x30(%rdx),%xmm3
2241
2242	movdqa %xmm2,0x20(%rcx)
2243	movdqa %xmm3,0x30(%rcx)
2244
2245	movdqa 0x40(%rdx),%xmm0
2246	movdqa 0x50(%rdx),%xmm1
2247	cmp    $0x80,%r8		# consumed by the jge below; the movdqa
2248					# and lea in between leave flags alone
2249	movdqa %xmm0,0x40(%rcx)
2250	movdqa %xmm1,0x50(%rcx)
2251
2252	movdqa 0x60(%rdx),%xmm2
2253	movdqa 0x70(%rdx),%xmm3
2254	lea    0x80(%rdx),%rdx
2255	movdqa %xmm2,0x60(%rcx)
2256	movdqa %xmm3,0x70(%rcx)
2257	lea    0x80(%rcx),%rcx
2258	jge    L(both_aligned)		# signed test: loop while 128 or more bytes remain
2259
2259
/*
 * L(fix_16b): tail of the aligned SSE path.  Steps both pointers forward
 * by the %r8 bytes still outstanding and jumps through the L(fwdPxQx)
 * table, indexed by %r8, to the code that copies them.  The table is
 * defined outside this part of the file; each entry is read as a signed
 * 32-bit offset from the table base.
 */
2260L(fix_16b):
2261	add    %r8,%rcx
2262	lea    L(fwdPxQx)(%rip),%r10	# table base (RIP-relative)
2263	add    %r8,%rdx
2264
2265	movslq (%r10,%r8,4),%r9
2266	lea    (%r9,%r10,1),%r10	# offset + table base = target address
2267	jmpq   *%r10
2268
2269	.balign 16
/*
 * L(Loop8byte_pre): picks one of the three 8-byte copy strategies from the
 * byte count in %r8 (all comparisons are signed):
 *
 *	%r8 > .largest_level_cache_size / 2	-> L(byte8_nt_top)
 *	%r8 <= 4096				-> L(byte8_top)
 *	%r8 <= .amd64cache1half			-> L(use_rep)
 *	otherwise				-> falls into L(byte8_top)
 *
 * Both cache variables are 32-bit values defined outside this part of the
 * file; loading them into %r9d zero-extends into %r9.
 */
2270L(Loop8byte_pre):
2271	# Use 8-byte moves
2272	mov    .largest_level_cache_size(%rip),%r9d
2273	shr    %r9		# take half of it
2274	cmp    %r9,%r8
2275	jg     L(byte8_nt_top)
2276	# Find out whether to use rep movsq
2277	cmp    $4096,%r8
2278	jle    L(byte8_top)
2279	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2280	cmp    %r9,%r8
2281	jle    L(use_rep)
2282
2282
2283	.balign     16
/*
 * L(byte8_top): forward copy, 64 bytes per pass, using eight 8-byte
 * integer loads and stores through the scratch registers %r9/%r10/%r11.
 *
 *	%rdx	source pointer
 *	%rcx	destination pointer
 *	%r8	bytes still to copy
 *
 * The loop repeats only while MORE than 64 bytes remain (jg), so it can
 * fall into L(byte8_end) with up to 64 bytes left for the table code.
 *
 * NOTE(review): one pass is always executed, so the code reaching this
 * label is presumably expected to arrive with at least 64 bytes to copy.
 */
2284L(byte8_top):
2285	mov    (%rdx),%r9
2286	mov    0x8(%rdx),%r10
2287	lea    -0x40(%r8),%r8		# account for the 64 bytes of this pass
2288	mov    %r9,(%rcx)
2289	mov    %r10,0x8(%rcx)
2290	mov    0x10(%rdx),%r11
2291	mov    0x18(%rdx),%r9
2292	mov    %r11,0x10(%rcx)
2293	mov    %r9,0x18(%rcx)
2294
2295	cmp    $0x40,%r8		# consumed by the jg below; mov and lea
2296	mov    0x20(%rdx),%r10		# leave the flags alone
2297	mov    0x28(%rdx),%r11
2298	mov    %r10,0x20(%rcx)
2299	mov    %r11,0x28(%rcx)
2300	mov    0x30(%rdx),%r9
2301	mov    0x38(%rdx),%r10
2302	lea    0x40(%rdx),%rdx
2303	mov    %r9,0x30(%rcx)
2304	mov    %r10,0x38(%rcx)
2305	lea    0x40(%rcx),%rcx
2306	jg     L(byte8_top)		# signed test: loop while more than 64 bytes remain
2307
2307
/*
 * L(byte8_end): tail of the 8-byte copy paths (L(byte8_top), L(use_rep),
 * L(byte8_nt_top)).  Steps both pointers forward by the %r8 bytes still
 * outstanding and jumps through the L(fwdPxQx) table, indexed by %r8, to
 * the code that copies them.  The table is defined outside this part of
 * the file; each entry is read as a signed 32-bit offset from its base.
 */
2308L(byte8_end):
2309	lea    L(fwdPxQx)(%rip),%r10	# table base (RIP-relative)
2310	lea    (%rdx,%r8,1),%rdx	# source      += remaining bytes
2311	lea    (%rcx,%r8,1),%rcx	# destination += remaining bytes
2312
2313	movslq (%r10,%r8,4),%r9
2314	lea    (%r9,%r10,1),%r10	# offset + table base = target address
2315	jmpq   *%r10
2316
2317	.balign	16
/*
 * L(use_rep): forward copy with "rep movsq".
 *
 * The pointers are moved into the registers the string instruction uses
 * (%rsi, %rdi, count in %rcx), %r8 / 8 quadwords are copied, and the
 * advanced pointers are moved back into %rdx / %rcx.  If %r8 was not a
 * multiple of 8, the 1..7 leftover bytes are handled by L(byte8_end);
 * otherwise the routine returns right here.
 */
2318L(use_rep):
2319	mov    %rdx,%rsi		# %rsi = source
2320	mov    %rcx,%rdi		# %rdi = destination
2321	mov    %r8,%rcx			# %rcx = count
2322	shrq   $3,%rcx			# 8-byte word count
2323	rep
2324	  movsq
2325	mov    %rsi,%rdx		# source
2326	mov    %rdi,%rcx		# destination
2327	andq   $7,%r8			# remainder
2328	jnz    L(byte8_end)
2329	ret
2330
2331	.balign 16
/*
 * L(byte8_nt_top): forward copy, 64 bytes per pass, using 8-byte loads
 * and non-temporal 8-byte stores (movnti).  Chosen by L(Loop8byte_pre)
 * when the count exceeds half of .largest_level_cache_size.
 *
 *	%rdx	source pointer
 *	%rcx	destination pointer
 *	%r8	bytes still to copy
 *
 * After the loop an sfence orders the non-temporal stores, then
 * L(byte8_end) copies the remaining bytes (fewer than 64).
 */
2332L(byte8_nt_top):
2333	sub    $0x40,%r8		# account for the 64 bytes of this pass
2334	prefetchnta 0x180(%rdx)		# non-temporal prefetch well ahead of the loads
2335	mov    (%rdx),%r9
2336	movnti %r9,(%rcx)
2337	mov    0x8(%rdx),%r10
2338	movnti %r10,0x8(%rcx)
2339	mov    0x10(%rdx),%r11
2340	movnti %r11,0x10(%rcx)
2341	mov    0x18(%rdx),%r9
2342	movnti %r9,0x18(%rcx)
2343	mov    0x20(%rdx),%r10
2344	movnti %r10,0x20(%rcx)
2345	mov    0x28(%rdx),%r11
2346	movnti %r11,0x28(%rcx)
2347	mov    0x30(%rdx),%r9
2348	movnti %r9,0x30(%rcx)
2349	mov    0x38(%rdx),%r10
2350	movnti %r10,0x38(%rcx)
2351
2352	lea    0x40(%rdx),%rdx
2353	lea    0x40(%rcx),%rcx
2354	cmp    $0x40,%r8
2355	jge    L(byte8_nt_top)		# signed test: loop while 64 or more bytes remain
2356	sfence				# order the non-temporal stores issued above
2357	jmp    L(byte8_end)
2358
2359	SET_SIZE(memcpy)
2360
2361	.balign 16
/*
 * L(CopyBackwards): entry of the high-to-low copy.
 *
 * Expects the three argument registers untouched (%rdi = destination,
 * %rsi = source, %rdx = length) and rearranges them into the convention
 * of the copy code: %rcx = destination, %rdx = source, %r8 = length,
 * %rax = destination (the return value).  Both pointers are then moved
 * to one past the END of their buffers.  If the destination end is not
 * 8-byte aligned, L(bk_align) is taken; otherwise execution falls through
 * into L(bk_qw_aligned).
 *
 * NOTE(review): the branch that reaches this label is outside this part
 * of the file; presumably memmove uses it for overlapping buffers where a
 * forward copy would overwrite source bytes not yet read -- confirm.
 */
2362L(CopyBackwards):
2363	mov    %rdx,%r8
2364	mov    %rdi,%rcx
2365	mov    %rsi,%rdx
2366	mov    %rdi,%rax		# return value
2367
2368	# ck alignment of last byte
2369	lea    (%rcx,%r8,1),%rcx	# %rcx = one past the end of the destination
2370	test   $0x7,%rcx
2371	lea    (%rdx,%r8,1),%rdx	# %rdx = one past the end of the source
2372					# (lea keeps the flags set by test)
2373	jne    L(bk_align)
2373
/*
 * L(bk_qw_aligned): size dispatch of the backward copy.  %rcx and %rdx
 * point one past the end of the bytes still to copy, %r8 is their count.
 *
 * More than 0x90 (144) bytes go to L(bk_ck_sse2_alignment).  Otherwise
 * both pointers are moved back to the START of the remaining region and
 * the L(bkPxQx) table, indexed by %r8, selects the unrolled ladder that
 * copies exactly %r8 bytes.  The table is defined outside this part of
 * the file; each entry is read as a signed 32-bit offset from its base.
 */
2374L(bk_qw_aligned):
2375	lea    L(bkPxQx)(%rip),%r10	# table base (RIP-relative)
2376
2377	cmp    $0x90,%r8		# 144
2378	jg     L(bk_ck_sse2_alignment)
2379
2380	sub    %r8,%rcx			# back to the first remaining byte
2381	sub    %r8,%rdx
2382
2383	movslq (%r10,%r8,4),%r9
2384	lea    (%r9,%r10,1),%r10	# offset + table base = target address
2385	jmpq   *%r10
2386
2387	.balign 16
/*
 * L(bk_align): brings the END of the destination (%rcx) down to an 8-byte
 * boundary before the bulk backward copy.  Depending on the low three
 * bits of %rcx it copies one byte, then two bytes, then four bytes from
 * the end, moving %rcx / %rdx down and reducing %r8 each time.
 * With 8 bytes or fewer to copy no alignment is attempted.
 * Every path ends in L(bk_qw_aligned).
 */
2388L(bk_align):
2389	# only align if len > 8
2390	cmp    $8,%r8
2391	jle    L(bk_qw_aligned)
2392	test   $0x1,%rcx		# odd end address: peel one byte
2393	je     L(bk_tst2)
2394	dec    %rcx
2395	dec    %rdx
2396	dec    %r8
2397	mov    (%rdx),%r9b
2398	mov    %r9b,(%rcx)
2399
2400L(bk_tst2):
2401	test   $0x2,%rcx		# bit 1 set: peel two bytes
2402	je     L(bk_tst3)
2403
2404L(bk_got2):
2405	sub    $0x2,%rcx
2406	sub    $0x2,%rdx
2407	sub    $0x2,%r8
2408	movzwq (%rdx),%r9		# 16-bit load, zero-extended
2409	mov    %r9w,(%rcx)		# 16-bit store
2410
2411L(bk_tst3):
2412	test   $0x4,%rcx		# bit 2 set: peel four bytes
2413	je     L(bk_qw_aligned)
2414
2415L(bk_got3):
2416	sub    $0x4,%rcx
2417	sub    $0x4,%rdx
2418	sub    $0x4,%r8
2419	mov    (%rdx),%r9d
2420	mov    %r9d,(%rcx)
2421	jmp    L(bk_qw_aligned)
2422
2423	.balign 16
/*
 * L(bk_ck_sse2_alignment): strategy choice for backward copies of more
 * than 144 bytes.  If .memops_method equals NO_SSE the "rep movsq" path
 * L(bk_use_rep) is used.  Otherwise the SSE loop is used: directly when
 * the destination end (%rcx) is already 16-byte aligned, or after falling
 * through into L(bk_sse2_align) when it is not.
 * .memops_method and NO_SSE are defined outside this part of the file.
 */
2424L(bk_ck_sse2_alignment):
2425	cmpl   $NO_SSE,.memops_method(%rip)
2426	je     L(bk_use_rep)
2427	# check alignment of last byte
2428	test   $0xf,%rcx
2429	jz     L(bk_sse2_cpy)
2430
2430
/*
 * L(bk_sse2_align): copies a single 8-byte word from the end so that the
 * destination end (%rcx), which is 8-byte but not 16-byte aligned when
 * this code is reached, becomes 16-byte aligned.  There is no jump at the
 * end: execution falls through into L(bk_sse2_cpy), next in the file.
 */
2431L(bk_sse2_align):
2432	# only here if already aligned on at least a qword bndry
2433	sub    $0x8,%rcx
2434	sub    $0x8,%rdx
2435	sub    $0x8,%r8
2436	mov    (%rdx),%r9
2437	mov    %r9,(%rcx)
2438	#jmp   L(bk_sse2_cpy)		(not needed: falls through)
2439
2439
2440	.balign 16
/*
 * L(bk_sse2_cpy): backward copy, 128 bytes per pass, highest addresses
 * first.  Loads are unaligned (movdqu); stores are aligned (movdqa), so
 * %rcx must be 16-byte aligned here.
 *
 *	%rdx	one past the end of the source bytes still to copy
 *	%rcx	one past the end of the destination bytes still to copy
 *	%r8	bytes still to copy
 *
 * Each pair of blocks is loaded before it is stored, and blocks are taken
 * in descending address order.  On exit the code falls through into
 * L(bk_sse2_cpy_end) with fewer than 128 bytes left.
 */
2441L(bk_sse2_cpy):
2442	sub    $0x80,%rcx		# 128
2443	sub    $0x80,%rdx
2444	movdqu 0x70(%rdx),%xmm3
2445	movdqu 0x60(%rdx),%xmm2
2446	movdqa %xmm3,0x70(%rcx)
2447	movdqa %xmm2,0x60(%rcx)
2448	sub    $0x80,%r8		# account for the 128 bytes of this pass
2449	movdqu 0x50(%rdx),%xmm1
2450	movdqu 0x40(%rdx),%xmm0
2451	movdqa %xmm1,0x50(%rcx)
2452	movdqa %xmm0,0x40(%rcx)
2453
2454	cmp    $0x80,%r8		# consumed by the jge below; only SSE
2455	movdqu 0x30(%rdx),%xmm3		# moves sit in between
2456	movdqu 0x20(%rdx),%xmm2
2457	movdqa %xmm3,0x30(%rcx)
2458	movdqa %xmm2,0x20(%rcx)
2459	movdqu 0x10(%rdx),%xmm1
2460	movdqu (%rdx),%xmm0
2461	movdqa %xmm1,0x10(%rcx)
2462	movdqa %xmm0,(%rcx)
2463	jge    L(bk_sse2_cpy)		# signed test: loop while 128 or more bytes remain
2464
2464
/*
 * L(bk_sse2_cpy_end): tail of the backward SSE loop.  Moves both pointers
 * back to the START of the %r8 bytes still outstanding and jumps through
 * the L(bkPxQx) table, indexed by %r8, to the ladder that copies them.
 * The table is defined outside this part of the file; each entry is read
 * as a signed 32-bit offset from its base.
 */
2465L(bk_sse2_cpy_end):
2466	lea    L(bkPxQx)(%rip),%r10	# table base (RIP-relative)
2467	sub    %r8,%rdx
2468	sub    %r8,%rcx
2469	movslq (%r10,%r8,4),%r9
2470	lea    (%r9,%r10,1),%r10	# offset + table base = target address
2471	jmpq   *%r10
2472
2473	.balign 16
/*
 * L(bk_use_rep): backward copy with "rep movsq" and the direction flag
 * set.  Taken from L(bk_ck_sse2_alignment) when .memops_method is NO_SSE.
 *
 * On entry %rcx / %rdx point one past the end of destination / source and
 * %r8 is the byte count.  %rcx is needed as the repeat counter, so the
 * destination end is parked in %r9 for the duration.  %r8 / 8 quadwords
 * are copied starting with the last one; the direction flag is cleared
 * again right after the copy.  The pointers are then moved back to the
 * start of the buffers, where the %r8 & 7 bytes not covered by the
 * quadword copy are located, and those bytes are copied by the L(bkPxQx)
 * ladder.  With no such bytes the routine returns directly.
 */
2474L(bk_use_rep):
2475	xchg   %rcx,%r9			# %r9 = destination end (old %r9 is dead)
2476	mov    %rdx,%rsi		# source
2477	mov    %r9,%rdi			# destination
2478	mov    %r8,%rcx			# count
2479	sub    $8,%rsi			# address of the last quadword
2480	sub    $8,%rdi
2481	shr    $3,%rcx			# quadword count
2482	std				# reverse direction
2483	rep
2484	  movsq
2485	cld				# reset direction flag
2486
2487	xchg   %rcx,%r9			# %rcx = destination end again
2488	lea    L(bkPxQx)(%rip),%r10
2489	sub    %r8,%rdx			# back to the start of the source
2490	sub    %r8,%rcx			# back to the start of the destination
2491	andq   $7,%r8			# remainder
2492	jz     2f
2493	movslq (%r10,%r8,4),%r9		# signed 32-bit offset from the table base
2494	lea    (%r9,%r10,1),%r10
2495	jmpq   *%r10
24962:
2497	ret
2498
2499	.balign 16
/*
 * L(bkP0QI) .. L(bkP0Q0): unrolled fall-through ladder of the backward
 * copy for lengths that are a multiple of 8.  %rdx / %rcx point at the
 * START of the region.  Entering at L(bkP0Qk) copies k quadwords
 * (k = 1..9, A..I for 10..18), highest offset first, each one loaded and
 * then stored before the next is touched, and returns.  L(bkP0Q0) copies
 * nothing.
 */
2500L(bkP0QI):
2501	mov    0x88(%rdx),%r10
2502	mov    %r10,0x88(%rcx)
2503L(bkP0QH):
2504	mov    0x80(%rdx),%r10
2505	mov    %r10,0x80(%rcx)
2506L(bkP0QG):
2507	mov    0x78(%rdx),%r9
2508	mov    %r9,0x78(%rcx)
2509L(bkP0QF):
2510	mov    0x70(%rdx),%r11
2511	mov    %r11,0x70(%rcx)
2512L(bkP0QE):
2513	mov    0x68(%rdx),%r10
2514	mov    %r10,0x68(%rcx)
2515L(bkP0QD):
2516	mov    0x60(%rdx),%r9
2517	mov    %r9,0x60(%rcx)
2518L(bkP0QC):
2519	mov    0x58(%rdx),%r11
2520	mov    %r11,0x58(%rcx)
2521L(bkP0QB):
2522	mov    0x50(%rdx),%r10
2523	mov    %r10,0x50(%rcx)
2524L(bkP0QA):
2525	mov    0x48(%rdx),%r9
2526	mov    %r9,0x48(%rcx)
2527L(bkP0Q9):
2528	mov    0x40(%rdx),%r11
2529	mov    %r11,0x40(%rcx)
2530L(bkP0Q8):
2531	mov    0x38(%rdx),%r10
2532	mov    %r10,0x38(%rcx)
2533L(bkP0Q7):
2534	mov    0x30(%rdx),%r9
2535	mov    %r9,0x30(%rcx)
2536L(bkP0Q6):
2537	mov    0x28(%rdx),%r11
2538	mov    %r11,0x28(%rcx)
2539L(bkP0Q5):
2540	mov    0x20(%rdx),%r10
2541	mov    %r10,0x20(%rcx)
2542L(bkP0Q4):
2543	mov    0x18(%rdx),%r9
2544	mov    %r9,0x18(%rcx)
2545L(bkP0Q3):
2546	mov    0x10(%rdx),%r11
2547	mov    %r11,0x10(%rcx)
2548L(bkP0Q2):
2549	mov    0x8(%rdx),%r10
2550	mov    %r10,0x8(%rcx)
2551L(bkP0Q1):
2552	mov    (%rdx),%r9
2553	mov    %r9,(%rcx)
2554L(bkP0Q0):
2555	ret
2556
2557	.balign 16
/*
 * L(bkP1QI) .. L(bkP1Q0): unrolled fall-through ladder of the backward
 * copy for lengths of the form 8*k + 1.  %rdx / %rcx point at the START
 * of the region.  Entering at L(bkP1Qk) copies k quadwords (k = 1..9,
 * A..I for 10..18) at offsets 1 and up, highest offset first, then
 * L(bkP1Q0) copies the single byte at offset 0 and returns.
 */
2558L(bkP1QI):
2559	mov    0x89(%rdx),%r10
2560	mov    %r10,0x89(%rcx)
2561L(bkP1QH):
2562	mov    0x81(%rdx),%r11
2563	mov    %r11,0x81(%rcx)
2564L(bkP1QG):
2565	mov    0x79(%rdx),%r10
2566	mov    %r10,0x79(%rcx)
2567L(bkP1QF):
2568	mov    0x71(%rdx),%r9
2569	mov    %r9,0x71(%rcx)
2570L(bkP1QE):
2571	mov    0x69(%rdx),%r11
2572	mov    %r11,0x69(%rcx)
2573L(bkP1QD):
2574	mov    0x61(%rdx),%r10
2575	mov    %r10,0x61(%rcx)
2576L(bkP1QC):
2577	mov    0x59(%rdx),%r9
2578	mov    %r9,0x59(%rcx)
2579L(bkP1QB):
2580	mov    0x51(%rdx),%r11
2581	mov    %r11,0x51(%rcx)
2582L(bkP1QA):
2583	mov    0x49(%rdx),%r10
2584	mov    %r10,0x49(%rcx)
2585L(bkP1Q9):
2586	mov    0x41(%rdx),%r9
2587	mov    %r9,0x41(%rcx)
2588L(bkP1Q8):
2589	mov    0x39(%rdx),%r11
2590	mov    %r11,0x39(%rcx)
2591L(bkP1Q7):
2592	mov    0x31(%rdx),%r10
2593	mov    %r10,0x31(%rcx)
2594L(bkP1Q6):
2595	mov    0x29(%rdx),%r9
2596	mov    %r9,0x29(%rcx)
2597L(bkP1Q5):
2598	mov    0x21(%rdx),%r11
2599	mov    %r11,0x21(%rcx)
2600L(bkP1Q4):
2601	mov    0x19(%rdx),%r10
2602	mov    %r10,0x19(%rcx)
2603L(bkP1Q3):
2604	mov    0x11(%rdx),%r9
2605	mov    %r9,0x11(%rcx)
2606L(bkP1Q2):
2607	mov    0x9(%rdx),%r11
2608	mov    %r11,0x9(%rcx)
2609L(bkP1Q1):
2610	mov    0x1(%rdx),%r10
2611	mov    %r10,0x1(%rcx)
2612L(bkP1Q0):
2613	mov    (%rdx),%r9b		# the one leading byte
2614	mov    %r9b,(%rcx)
2615	ret
2616
2617	.balign 16
/*
 * bkP2Q<n> ladder.
 *
 * Entering at L(bkP2Q<n>) moves n quadwords followed by one trailing
 * 16-bit word, loading through %rdx and storing through %rcx.  Quadword k
 * (k = n .. 1) lives at displacement 8*(k-1)+2; control falls through the
 * labels from the highest displacement down to the word at displacement 0.
 * Scratch registers used: %r9, %r10, %r11.
 */
L(bkP2QI):
	movq	0x8a(%rdx), %r10
	movq	%r10, 0x8a(%rcx)
L(bkP2QH):
	movq	0x82(%rdx), %r11
	movq	%r11, 0x82(%rcx)
L(bkP2QG):
	movq	0x7a(%rdx), %r10
	movq	%r10, 0x7a(%rcx)
L(bkP2QF):
	movq	0x72(%rdx), %r9
	movq	%r9, 0x72(%rcx)
L(bkP2QE):
	movq	0x6a(%rdx), %r11
	movq	%r11, 0x6a(%rcx)
L(bkP2QD):
	movq	0x62(%rdx), %r10
	movq	%r10, 0x62(%rcx)
L(bkP2QC):
	movq	0x5a(%rdx), %r9
	movq	%r9, 0x5a(%rcx)
L(bkP2QB):
	movq	0x52(%rdx), %r11
	movq	%r11, 0x52(%rcx)
L(bkP2QA):
	movq	0x4a(%rdx), %r10
	movq	%r10, 0x4a(%rcx)
L(bkP2Q9):
	movq	0x42(%rdx), %r9
	movq	%r9, 0x42(%rcx)
L(bkP2Q8):
	movq	0x3a(%rdx), %r11
	movq	%r11, 0x3a(%rcx)
L(bkP2Q7):
	movq	0x32(%rdx), %r10
	movq	%r10, 0x32(%rcx)
L(bkP2Q6):
	movq	0x2a(%rdx), %r9
	movq	%r9, 0x2a(%rcx)
L(bkP2Q5):
	movq	0x22(%rdx), %r11
	movq	%r11, 0x22(%rcx)
L(bkP2Q4):
	movq	0x1a(%rdx), %r10
	movq	%r10, 0x1a(%rcx)
L(bkP2Q3):
	movq	0x12(%rdx), %r9
	movq	%r9, 0x12(%rcx)
L(bkP2Q2):
	movq	0xa(%rdx), %r11
	movq	%r11, 0xa(%rcx)
L(bkP2Q1):
	movq	0x2(%rdx), %r10
	movq	%r10, 0x2(%rcx)
L(bkP2Q0):
	movw	(%rdx), %r9w		/* the two leftover bytes */
	movw	%r9w, (%rcx)
	ret
2676
2677	.balign 16
/*
 * bkP3Q<n> ladder.
 *
 * Entering at L(bkP3Q<n>) moves n quadwords followed by three trailing
 * bytes, loading through %rdx and storing through %rcx.  Quadword k
 * (k = n .. 1) lives at displacement 8*(k-1)+3; control falls through the
 * labels from the highest displacement down to L(bkP3Q0).
 *
 * The three trailing bytes are moved as a word at displacement 1 plus a
 * byte at displacement 0.  Both pieces are loaded before either is
 * stored, as the comment on L(bkP3Q0) promises, so a tail store can never
 * alter a byte that a later tail load still has to read if the two
 * regions overlap.
 * Scratch registers used: %r9, %r10, %r11.
 */
L(bkP3QI):
	mov    0x8b(%rdx),%r10
	mov    %r10,0x8b(%rcx)
L(bkP3QH):
	mov    0x83(%rdx),%r11
	mov    %r11,0x83(%rcx)
L(bkP3QG):
	mov    0x7b(%rdx),%r10
	mov    %r10,0x7b(%rcx)
L(bkP3QF):
	mov    0x73(%rdx),%r9
	mov    %r9,0x73(%rcx)
L(bkP3QE):
	mov    0x6b(%rdx),%r11
	mov    %r11,0x6b(%rcx)
L(bkP3QD):
	mov    0x63(%rdx),%r10
	mov    %r10,0x63(%rcx)
L(bkP3QC):
	mov    0x5b(%rdx),%r9
	mov    %r9,0x5b(%rcx)
L(bkP3QB):
	mov    0x53(%rdx),%r11
	mov    %r11,0x53(%rcx)
L(bkP3QA):
	mov    0x4b(%rdx),%r10
	mov    %r10,0x4b(%rcx)
L(bkP3Q9):
	mov    0x43(%rdx),%r9
	mov    %r9,0x43(%rcx)
L(bkP3Q8):
	mov    0x3b(%rdx),%r11
	mov    %r11,0x3b(%rcx)
L(bkP3Q7):
	mov    0x33(%rdx),%r10
	mov    %r10,0x33(%rcx)
L(bkP3Q6):
	mov    0x2b(%rdx),%r9
	mov    %r9,0x2b(%rcx)
L(bkP3Q5):
	mov    0x23(%rdx),%r11
	mov    %r11,0x23(%rcx)
L(bkP3Q4):
	mov    0x1b(%rdx),%r10
	mov    %r10,0x1b(%rcx)
L(bkP3Q3):
	mov    0x13(%rdx),%r9
	mov    %r9,0x13(%rcx)
L(bkP3Q2):
	mov    0xb(%rdx),%r11
	mov    %r11,0xb(%rcx)
L(bkP3Q1):
	mov    0x3(%rdx),%r10
	mov    %r10,0x3(%rcx)
L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x1(%rdx),%r9w
	mov    (%rdx),%r10b
	mov    %r9w,0x1(%rcx)
	mov    %r10b,(%rcx)
	ret
2738
2739	.balign 16
/*
 * bkP4Q<n> ladder.
 *
 * Entering at L(bkP4Q<n>) moves n quadwords followed by one trailing
 * 32-bit doubleword, loading through %rdx and storing through %rcx.
 * Quadword k (k = n .. 1) lives at displacement 8*(k-1)+4; control falls
 * through the labels from the highest displacement down to the doubleword
 * at displacement 0.
 * Scratch registers used: %r9, %r10, %r11.
 */
L(bkP4QI):
	movq	0x8c(%rdx), %r10
	movq	%r10, 0x8c(%rcx)
L(bkP4QH):
	movq	0x84(%rdx), %r11
	movq	%r11, 0x84(%rcx)
L(bkP4QG):
	movq	0x7c(%rdx), %r10
	movq	%r10, 0x7c(%rcx)
L(bkP4QF):
	movq	0x74(%rdx), %r9
	movq	%r9, 0x74(%rcx)
L(bkP4QE):
	movq	0x6c(%rdx), %r11
	movq	%r11, 0x6c(%rcx)
L(bkP4QD):
	movq	0x64(%rdx), %r10
	movq	%r10, 0x64(%rcx)
L(bkP4QC):
	movq	0x5c(%rdx), %r9
	movq	%r9, 0x5c(%rcx)
L(bkP4QB):
	movq	0x54(%rdx), %r11
	movq	%r11, 0x54(%rcx)
L(bkP4QA):
	movq	0x4c(%rdx), %r10
	movq	%r10, 0x4c(%rcx)
L(bkP4Q9):
	movq	0x44(%rdx), %r9
	movq	%r9, 0x44(%rcx)
L(bkP4Q8):
	movq	0x3c(%rdx), %r11
	movq	%r11, 0x3c(%rcx)
L(bkP4Q7):
	movq	0x34(%rdx), %r10
	movq	%r10, 0x34(%rcx)
L(bkP4Q6):
	movq	0x2c(%rdx), %r9
	movq	%r9, 0x2c(%rcx)
L(bkP4Q5):
	movq	0x24(%rdx), %r11
	movq	%r11, 0x24(%rcx)
L(bkP4Q4):
	movq	0x1c(%rdx), %r10
	movq	%r10, 0x1c(%rcx)
L(bkP4Q3):
	movq	0x14(%rdx), %r9
	movq	%r9, 0x14(%rcx)
L(bkP4Q2):
	movq	0xc(%rdx), %r11
	movq	%r11, 0xc(%rcx)
L(bkP4Q1):
	movq	0x4(%rdx), %r10
	movq	%r10, 0x4(%rcx)
L(bkP4Q0):
	movl	(%rdx), %r9d		/* the four leftover bytes */
	movl	%r9d, (%rcx)
	ret
2798
2799	.balign 16
/*
 * bkP5Q<n> ladder.
 *
 * Entering at L(bkP5Q<n>) moves n quadwords followed by five trailing
 * bytes, loading through %rdx and storing through %rcx.  Quadword k
 * (k = n .. 1) lives at displacement 8*(k-1)+5; control falls through the
 * labels from the highest displacement down to L(bkP5Q0).
 *
 * The five trailing bytes are moved as a doubleword at displacement 1
 * plus a byte at displacement 0.  Both pieces are loaded before either is
 * stored, as the comment on L(bkP5Q0) promises, so a tail store can never
 * alter a byte that a later tail load still has to read if the two
 * regions overlap.
 * Scratch registers used: %r9, %r10, %r11.
 */
L(bkP5QI):
	mov    0x8d(%rdx),%r10
	mov    %r10,0x8d(%rcx)
L(bkP5QH):
	mov    0x85(%rdx),%r9
	mov    %r9,0x85(%rcx)
L(bkP5QG):
	mov    0x7d(%rdx),%r11
	mov    %r11,0x7d(%rcx)
L(bkP5QF):
	mov    0x75(%rdx),%r10
	mov    %r10,0x75(%rcx)
L(bkP5QE):
	mov    0x6d(%rdx),%r9
	mov    %r9,0x6d(%rcx)
L(bkP5QD):
	mov    0x65(%rdx),%r11
	mov    %r11,0x65(%rcx)
L(bkP5QC):
	mov    0x5d(%rdx),%r10
	mov    %r10,0x5d(%rcx)
L(bkP5QB):
	mov    0x55(%rdx),%r9
	mov    %r9,0x55(%rcx)
L(bkP5QA):
	mov    0x4d(%rdx),%r11
	mov    %r11,0x4d(%rcx)
L(bkP5Q9):
	mov    0x45(%rdx),%r10
	mov    %r10,0x45(%rcx)
L(bkP5Q8):
	mov    0x3d(%rdx),%r9
	mov    %r9,0x3d(%rcx)
L(bkP5Q7):
	mov    0x35(%rdx),%r11
	mov    %r11,0x35(%rcx)
L(bkP5Q6):
	mov    0x2d(%rdx),%r10
	mov    %r10,0x2d(%rcx)
L(bkP5Q5):
	mov    0x25(%rdx),%r9
	mov    %r9,0x25(%rcx)
L(bkP5Q4):
	mov    0x1d(%rdx),%r11
	mov    %r11,0x1d(%rcx)
L(bkP5Q3):
	mov    0x15(%rdx),%r10
	mov    %r10,0x15(%rcx)
L(bkP5Q2):
	mov    0xd(%rdx),%r9
	mov    %r9,0xd(%rcx)
L(bkP5Q1):
	mov    0x5(%rdx),%r11
	mov    %r11,0x5(%rcx)
L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x1(%rdx),%r9d
	mov    (%rdx),%r10b
	mov    %r9d,0x1(%rcx)
	mov    %r10b,(%rcx)
	ret
2860
2861	.balign 16
/*
 * bkP6Q<n> ladder.
 *
 * Entering at L(bkP6Q<n>) moves n quadwords followed by six trailing
 * bytes, loading through %rdx and storing through %rcx.  Quadword k
 * (k = n .. 1) lives at displacement 8*(k-1)+6; control falls through the
 * labels from the highest displacement down to L(bkP6Q0).
 *
 * The six trailing bytes are moved as a doubleword at displacement 2
 * plus a word at displacement 0.  Both pieces are loaded before either is
 * stored, as the comment on L(bkP6Q0) promises, so a tail store can never
 * alter a byte that a later tail load still has to read if the two
 * regions overlap.
 * Scratch registers used: %r9, %r10, %r11.
 */
L(bkP6QI):
	mov    0x8e(%rdx),%r10
	mov    %r10,0x8e(%rcx)
L(bkP6QH):
	mov    0x86(%rdx),%r11
	mov    %r11,0x86(%rcx)
L(bkP6QG):
	mov    0x7e(%rdx),%r10
	mov    %r10,0x7e(%rcx)
L(bkP6QF):
	mov    0x76(%rdx),%r9
	mov    %r9,0x76(%rcx)
L(bkP6QE):
	mov    0x6e(%rdx),%r11
	mov    %r11,0x6e(%rcx)
L(bkP6QD):
	mov    0x66(%rdx),%r10
	mov    %r10,0x66(%rcx)
L(bkP6QC):
	mov    0x5e(%rdx),%r9
	mov    %r9,0x5e(%rcx)
L(bkP6QB):
	mov    0x56(%rdx),%r11
	mov    %r11,0x56(%rcx)
L(bkP6QA):
	mov    0x4e(%rdx),%r10
	mov    %r10,0x4e(%rcx)
L(bkP6Q9):
	mov    0x46(%rdx),%r9
	mov    %r9,0x46(%rcx)
L(bkP6Q8):
	mov    0x3e(%rdx),%r11
	mov    %r11,0x3e(%rcx)
L(bkP6Q7):
	mov    0x36(%rdx),%r10
	mov    %r10,0x36(%rcx)
L(bkP6Q6):
	mov    0x2e(%rdx),%r9
	mov    %r9,0x2e(%rcx)
L(bkP6Q5):
	mov    0x26(%rdx),%r11
	mov    %r11,0x26(%rcx)
L(bkP6Q4):
	mov    0x1e(%rdx),%r10
	mov    %r10,0x1e(%rcx)
L(bkP6Q3):
	mov    0x16(%rdx),%r9
	mov    %r9,0x16(%rcx)
L(bkP6Q2):
	mov    0xe(%rdx),%r11
	mov    %r11,0xe(%rcx)
L(bkP6Q1):
	mov    0x6(%rdx),%r10
	mov    %r10,0x6(%rcx)
L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x2(%rdx),%r9d
	mov    (%rdx),%r10w
	mov    %r9d,0x2(%rcx)
	mov    %r10w,(%rcx)
	ret
2922
2923	.balign 16
/*
 * bkP7Q<n> ladder.
 *
 * Entering at L(bkP7Q<n>) moves n quadwords followed by seven trailing
 * bytes, loading through %rdx and storing through %rcx.  Quadword k
 * (k = n .. 1) lives at displacement 8*(k-1)+7; control falls through the
 * labels from the highest displacement down to L(bkP7Q0).
 *
 * The seven trailing bytes are moved as a doubleword at displacement 3,
 * a word at displacement 1 and a byte at displacement 0.  All three
 * pieces are loaded before any of them is stored, as the comment on
 * L(bkP7Q0) promises, so a tail store can never alter a byte that a later
 * tail load still has to read if the two regions overlap.
 * Scratch registers used: %r9, %r10, %r11.
 */
L(bkP7QI):
	mov    0x8f(%rdx),%r10
	mov    %r10,0x8f(%rcx)
L(bkP7QH):
	mov    0x87(%rdx),%r11
	mov    %r11,0x87(%rcx)
L(bkP7QG):
	mov    0x7f(%rdx),%r10
	mov    %r10,0x7f(%rcx)
L(bkP7QF):
	mov    0x77(%rdx),%r9
	mov    %r9,0x77(%rcx)
L(bkP7QE):
	mov    0x6f(%rdx),%r11
	mov    %r11,0x6f(%rcx)
L(bkP7QD):
	mov    0x67(%rdx),%r10
	mov    %r10,0x67(%rcx)
L(bkP7QC):
	mov    0x5f(%rdx),%r9
	mov    %r9,0x5f(%rcx)
L(bkP7QB):
	mov    0x57(%rdx),%r11
	mov    %r11,0x57(%rcx)
L(bkP7QA):
	mov    0x4f(%rdx),%r10
	mov    %r10,0x4f(%rcx)
L(bkP7Q9):
	mov    0x47(%rdx),%r9
	mov    %r9,0x47(%rcx)
L(bkP7Q8):
	mov    0x3f(%rdx),%r11
	mov    %r11,0x3f(%rcx)
L(bkP7Q7):
	mov    0x37(%rdx),%r10
	mov    %r10,0x37(%rcx)
L(bkP7Q6):
	mov    0x2f(%rdx),%r9
	mov    %r9,0x2f(%rcx)
L(bkP7Q5):
	mov    0x27(%rdx),%r11
	mov    %r11,0x27(%rcx)
L(bkP7Q4):
	mov    0x1f(%rdx),%r10
	mov    %r10,0x1f(%rcx)
L(bkP7Q3):
	mov    0x17(%rdx),%r9
	mov    %r9,0x17(%rcx)
L(bkP7Q2):
	mov    0xf(%rdx),%r11
	mov    %r11,0xf(%rcx)
L(bkP7Q1):
	mov    0x7(%rdx),%r10
	mov    %r10,0x7(%rcx)
L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
	mov    0x3(%rdx),%r9d
	mov    0x1(%rdx),%r10w
	mov    (%rdx),%r11b
	mov    %r9d,0x3(%rcx)
	mov    %r10w,0x1(%rcx)
	mov    %r11b,(%rcx)
	ret
2986
2987		.balign 16
/*
 * bkPxQx: table of 32-bit offsets.  Each entry is the distance from
 * L(bkPxQx) itself to one of the L(bkP<p>Q<q>) entry points.
 *
 * Layout: one row of eight entries per q (0-9, then A-I), and within a
 * row one entry per p (0-7), so entry number 8*q + p refers to
 * L(bkP<p>Q<q>).  The code that indexes this table is outside this part
 * of the file.
 */
L(bkPxQx):
	/* Q0 */
	.int	L(bkP0Q0)-L(bkPxQx), L(bkP1Q0)-L(bkPxQx), L(bkP2Q0)-L(bkPxQx), L(bkP3Q0)-L(bkPxQx)
	.int	L(bkP4Q0)-L(bkPxQx), L(bkP5Q0)-L(bkPxQx), L(bkP6Q0)-L(bkPxQx), L(bkP7Q0)-L(bkPxQx)

	/* Q1 */
	.int	L(bkP0Q1)-L(bkPxQx), L(bkP1Q1)-L(bkPxQx), L(bkP2Q1)-L(bkPxQx), L(bkP3Q1)-L(bkPxQx)
	.int	L(bkP4Q1)-L(bkPxQx), L(bkP5Q1)-L(bkPxQx), L(bkP6Q1)-L(bkPxQx), L(bkP7Q1)-L(bkPxQx)

	/* Q2 */
	.int	L(bkP0Q2)-L(bkPxQx), L(bkP1Q2)-L(bkPxQx), L(bkP2Q2)-L(bkPxQx), L(bkP3Q2)-L(bkPxQx)
	.int	L(bkP4Q2)-L(bkPxQx), L(bkP5Q2)-L(bkPxQx), L(bkP6Q2)-L(bkPxQx), L(bkP7Q2)-L(bkPxQx)

	/* Q3 */
	.int	L(bkP0Q3)-L(bkPxQx), L(bkP1Q3)-L(bkPxQx), L(bkP2Q3)-L(bkPxQx), L(bkP3Q3)-L(bkPxQx)
	.int	L(bkP4Q3)-L(bkPxQx), L(bkP5Q3)-L(bkPxQx), L(bkP6Q3)-L(bkPxQx), L(bkP7Q3)-L(bkPxQx)

	/* Q4 */
	.int	L(bkP0Q4)-L(bkPxQx), L(bkP1Q4)-L(bkPxQx), L(bkP2Q4)-L(bkPxQx), L(bkP3Q4)-L(bkPxQx)
	.int	L(bkP4Q4)-L(bkPxQx), L(bkP5Q4)-L(bkPxQx), L(bkP6Q4)-L(bkPxQx), L(bkP7Q4)-L(bkPxQx)

	/* Q5 */
	.int	L(bkP0Q5)-L(bkPxQx), L(bkP1Q5)-L(bkPxQx), L(bkP2Q5)-L(bkPxQx), L(bkP3Q5)-L(bkPxQx)
	.int	L(bkP4Q5)-L(bkPxQx), L(bkP5Q5)-L(bkPxQx), L(bkP6Q5)-L(bkPxQx), L(bkP7Q5)-L(bkPxQx)

	/* Q6 */
	.int	L(bkP0Q6)-L(bkPxQx), L(bkP1Q6)-L(bkPxQx), L(bkP2Q6)-L(bkPxQx), L(bkP3Q6)-L(bkPxQx)
	.int	L(bkP4Q6)-L(bkPxQx), L(bkP5Q6)-L(bkPxQx), L(bkP6Q6)-L(bkPxQx), L(bkP7Q6)-L(bkPxQx)

	/* Q7 */
	.int	L(bkP0Q7)-L(bkPxQx), L(bkP1Q7)-L(bkPxQx), L(bkP2Q7)-L(bkPxQx), L(bkP3Q7)-L(bkPxQx)
	.int	L(bkP4Q7)-L(bkPxQx), L(bkP5Q7)-L(bkPxQx), L(bkP6Q7)-L(bkPxQx), L(bkP7Q7)-L(bkPxQx)

	/* Q8 */
	.int	L(bkP0Q8)-L(bkPxQx), L(bkP1Q8)-L(bkPxQx), L(bkP2Q8)-L(bkPxQx), L(bkP3Q8)-L(bkPxQx)
	.int	L(bkP4Q8)-L(bkPxQx), L(bkP5Q8)-L(bkPxQx), L(bkP6Q8)-L(bkPxQx), L(bkP7Q8)-L(bkPxQx)

	/* Q9 */
	.int	L(bkP0Q9)-L(bkPxQx), L(bkP1Q9)-L(bkPxQx), L(bkP2Q9)-L(bkPxQx), L(bkP3Q9)-L(bkPxQx)
	.int	L(bkP4Q9)-L(bkPxQx), L(bkP5Q9)-L(bkPxQx), L(bkP6Q9)-L(bkPxQx), L(bkP7Q9)-L(bkPxQx)

	/* QA */
	.int	L(bkP0QA)-L(bkPxQx), L(bkP1QA)-L(bkPxQx), L(bkP2QA)-L(bkPxQx), L(bkP3QA)-L(bkPxQx)
	.int	L(bkP4QA)-L(bkPxQx), L(bkP5QA)-L(bkPxQx), L(bkP6QA)-L(bkPxQx), L(bkP7QA)-L(bkPxQx)

	/* QB */
	.int	L(bkP0QB)-L(bkPxQx), L(bkP1QB)-L(bkPxQx), L(bkP2QB)-L(bkPxQx), L(bkP3QB)-L(bkPxQx)
	.int	L(bkP4QB)-L(bkPxQx), L(bkP5QB)-L(bkPxQx), L(bkP6QB)-L(bkPxQx), L(bkP7QB)-L(bkPxQx)

	/* QC */
	.int	L(bkP0QC)-L(bkPxQx), L(bkP1QC)-L(bkPxQx), L(bkP2QC)-L(bkPxQx), L(bkP3QC)-L(bkPxQx)
	.int	L(bkP4QC)-L(bkPxQx), L(bkP5QC)-L(bkPxQx), L(bkP6QC)-L(bkPxQx), L(bkP7QC)-L(bkPxQx)

	/* QD */
	.int	L(bkP0QD)-L(bkPxQx), L(bkP1QD)-L(bkPxQx), L(bkP2QD)-L(bkPxQx), L(bkP3QD)-L(bkPxQx)
	.int	L(bkP4QD)-L(bkPxQx), L(bkP5QD)-L(bkPxQx), L(bkP6QD)-L(bkPxQx), L(bkP7QD)-L(bkPxQx)

	/* QE */
	.int	L(bkP0QE)-L(bkPxQx), L(bkP1QE)-L(bkPxQx), L(bkP2QE)-L(bkPxQx), L(bkP3QE)-L(bkPxQx)
	.int	L(bkP4QE)-L(bkPxQx), L(bkP5QE)-L(bkPxQx), L(bkP6QE)-L(bkPxQx), L(bkP7QE)-L(bkPxQx)

	/* QF */
	.int	L(bkP0QF)-L(bkPxQx), L(bkP1QF)-L(bkPxQx), L(bkP2QF)-L(bkPxQx), L(bkP3QF)-L(bkPxQx)
	.int	L(bkP4QF)-L(bkPxQx), L(bkP5QF)-L(bkPxQx), L(bkP6QF)-L(bkPxQx), L(bkP7QF)-L(bkPxQx)

	/* QG */
	.int	L(bkP0QG)-L(bkPxQx), L(bkP1QG)-L(bkPxQx), L(bkP2QG)-L(bkPxQx), L(bkP3QG)-L(bkPxQx)
	.int	L(bkP4QG)-L(bkPxQx), L(bkP5QG)-L(bkPxQx), L(bkP6QG)-L(bkPxQx), L(bkP7QG)-L(bkPxQx)

	/* QH */
	.int	L(bkP0QH)-L(bkPxQx), L(bkP1QH)-L(bkPxQx), L(bkP2QH)-L(bkPxQx), L(bkP3QH)-L(bkPxQx)
	.int	L(bkP4QH)-L(bkPxQx), L(bkP5QH)-L(bkPxQx), L(bkP6QH)-L(bkPxQx), L(bkP7QH)-L(bkPxQx)

	/* QI */
	.int	L(bkP0QI)-L(bkPxQx), L(bkP1QI)-L(bkPxQx), L(bkP2QI)-L(bkPxQx), L(bkP3QI)-L(bkPxQx)
	.int	L(bkP4QI)-L(bkPxQx), L(bkP5QI)-L(bkPxQx), L(bkP6QI)-L(bkPxQx), L(bkP7QI)-L(bkPxQx)
3158
3159	SET_SIZE(memmove)
3160