xref: /titanic_50/usr/src/lib/libc/amd64/gen/memcpy.s (revision a9da3307db733eb1739ba859952610bba3d894ab)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2008, Intel Corporation
24 * All rights reserved.
25 */
26
27/*
28 * memcpy.s - copies one block of memory to another
29 *	Implements memcpy() and memmove() libc primitives.
30 */
31	.ident	"%Z%%M%	%I%	%E% SMI"
32
33	.file	"%M%"
34
35#include <sys/asm_linkage.h>
36	ANSI_PRAGMA_WEAK(memmove,function)
37	ANSI_PRAGMA_WEAK(memcpy,function)
38
39#include "synonyms.h"
40#include "cache.h"
41#include "proc64_id.h"
42
43#define L(s) .memcpy/**/s
44
45/*
46 * memcpy algorithm overview:
47 *
48 * Thresholds used below were determined experimentally.
49 *
50 * Pseudo code:
51 *
52 * If (size <= 128 bytes) {
53 *	do unrolled code (primarily 8-byte loads/stores) regardless of
54 *	alignment.
55 * } else {
56 *	Align destination to 16-byte boundary
57 *
58 *      if (NO_SSE) {
59 *		If (size > half of the largest level cache) {
60 *			Use 8-byte non-temporal stores (64-bytes/loop)
61 *		} else {
62 *			if (size > 4K && size <= half l1 cache size) {
63 *				Use rep movsq
64 *			} else {
65 *				Use 8-byte loads/stores (64 bytes per loop)
66 *			}
67 *		}
68 *
69 *	} else { **USE SSE**
70 *		If (size > half of the largest level cache) {
71 *			Use 16-byte non-temporal stores (128-bytes per loop)
72 *		} else {
73 *			If (both source and destination are aligned) {
74 *			    Use 16-byte aligned loads and stores (128 bytes/loop)
75 *			} else {
76 *			    use pairs of xmm registers with SSE2 or SSSE3
77 *			    instructions to concatenate and shift appropriately
78 *			    to account for source unalignment. This enables
79 *			    16-byte aligned loads to be done.
80 *			}
81 *		}
82 *	}
83 *
84 *	Finish any remaining bytes via unrolled code above.
85 * }
86 *
87 * memmove overview:
88 *	memmove is the same as memcpy except for the one case (src < dst < src + len)
89 *	where the copy must be done backwards. That code is structured similarly.
90 */
91
92	ENTRY(memmove)
93	cmp	%rsi,%rdi		# if dst (%rdi) <= src (%rsi)
94	jbe	L(CopyForward)		# then do copy forward
95	mov	%rsi,%r9		# move src to r9
96	add	%rdx,%r9		# add len (%rdx) to get addr of end of src
97	cmp	%r9,%rdi		# if dst < end of src (dst lies inside the source range)
98	jb	L(CopyBackwards)	# then do copy backwards
99	jmp	L(CopyForward)		# dst >= end of src: ranges do not overlap, copy forward
100
101	ENTRY (memcpy)
102L(CopyForward):
103	mov    %rdx,%r8			# r8 = len (remaining byte count)
104	mov    %rdi,%rcx		# rcx = dst working pointer
105	mov    %rsi,%rdx		# rdx = src working pointer
106	mov    %rdi,%rax		# rax = original dst (function return value)
107	lea    L(fwdPxQx)(%rip),%r11	# r11 = base of the short-copy jump table
108	cmp    $0x80,%r8		# 128
109	ja     L(ck_use_sse2)		# len > 128; unsigned test because len is a size_t, so a len with the top bit set can never index the table
110	add    %r8,%rcx			# rcx = dst + len (short copies address backwards from the end)
111	add    %r8,%rdx			# rdx = src + len
112
113	movslq (%r11,%r8,4),%r10	# r10 = table[len], a signed 32-bit offset from the table base
114	lea    (%r10,%r11,1),%r11	# r11 = address of the unrolled copy entry for this len
115	jmpq   *%r11
116
117	.balign 16
118L(ShrtAlignNew):			# entered when dst (rcx) is not 16-byte aligned
119	lea    L(AliPxQx)(%rip),%r11	# r11 = base of the alignment jump table
120	mov    %rcx,%r9
121	and    $0xf,%r9			# r9 = dst & 0xf, used as the table index
122
123	movslq (%r11,%r9,4),%r10	# r10 = signed 32-bit offset of the handler from the table base
124	lea    (%r10,%r11,1),%r11	# r11 = handler address
125	jmpq   *%r11
126
127	.balign 16
128L(fwdPxQx): .int       L(P0Q0)-L(fwdPxQx)	# indexed by len (0..0x80); each entry is the offset of L(P<len mod 8>Q<len div 8>) from the table base
129           .int        L(P1Q0)-L(fwdPxQx)
130           .int        L(P2Q0)-L(fwdPxQx)
131           .int        L(P3Q0)-L(fwdPxQx)
132           .int        L(P4Q0)-L(fwdPxQx)
133           .int        L(P5Q0)-L(fwdPxQx)
134           .int        L(P6Q0)-L(fwdPxQx)
135           .int        L(P7Q0)-L(fwdPxQx)
136
137           .int        L(P0Q1)-L(fwdPxQx)	# len 8..15
138           .int        L(P1Q1)-L(fwdPxQx)
139           .int        L(P2Q1)-L(fwdPxQx)
140           .int        L(P3Q1)-L(fwdPxQx)
141           .int        L(P4Q1)-L(fwdPxQx)
142           .int        L(P5Q1)-L(fwdPxQx)
143           .int        L(P6Q1)-L(fwdPxQx)
144           .int        L(P7Q1)-L(fwdPxQx)
145
146           .int        L(P0Q2)-L(fwdPxQx)	# len 16..23
147           .int        L(P1Q2)-L(fwdPxQx)
148           .int        L(P2Q2)-L(fwdPxQx)
149           .int        L(P3Q2)-L(fwdPxQx)
150           .int        L(P4Q2)-L(fwdPxQx)
151           .int        L(P5Q2)-L(fwdPxQx)
152           .int        L(P6Q2)-L(fwdPxQx)
153           .int        L(P7Q2)-L(fwdPxQx)
154
155           .int        L(P0Q3)-L(fwdPxQx)	# len 24..31
156           .int        L(P1Q3)-L(fwdPxQx)
157           .int        L(P2Q3)-L(fwdPxQx)
158           .int        L(P3Q3)-L(fwdPxQx)
159           .int        L(P4Q3)-L(fwdPxQx)
160           .int        L(P5Q3)-L(fwdPxQx)
161           .int        L(P6Q3)-L(fwdPxQx)
162           .int        L(P7Q3)-L(fwdPxQx)
163
164           .int        L(P0Q4)-L(fwdPxQx)	# len 32..39
165           .int        L(P1Q4)-L(fwdPxQx)
166           .int        L(P2Q4)-L(fwdPxQx)
167           .int        L(P3Q4)-L(fwdPxQx)
168           .int        L(P4Q4)-L(fwdPxQx)
169           .int        L(P5Q4)-L(fwdPxQx)
170           .int        L(P6Q4)-L(fwdPxQx)
171           .int        L(P7Q4)-L(fwdPxQx)
172
173           .int        L(P0Q5)-L(fwdPxQx)	# len 40..47
174           .int        L(P1Q5)-L(fwdPxQx)
175           .int        L(P2Q5)-L(fwdPxQx)
176           .int        L(P3Q5)-L(fwdPxQx)
177           .int        L(P4Q5)-L(fwdPxQx)
178           .int        L(P5Q5)-L(fwdPxQx)
179           .int        L(P6Q5)-L(fwdPxQx)
180           .int        L(P7Q5)-L(fwdPxQx)
181
182           .int        L(P0Q6)-L(fwdPxQx)	# len 48..55
183           .int        L(P1Q6)-L(fwdPxQx)
184           .int        L(P2Q6)-L(fwdPxQx)
185           .int        L(P3Q6)-L(fwdPxQx)
186           .int        L(P4Q6)-L(fwdPxQx)
187           .int        L(P5Q6)-L(fwdPxQx)
188           .int        L(P6Q6)-L(fwdPxQx)
189           .int        L(P7Q6)-L(fwdPxQx)
190
191           .int        L(P0Q7)-L(fwdPxQx)	# len 56..63
192           .int        L(P1Q7)-L(fwdPxQx)
193           .int        L(P2Q7)-L(fwdPxQx)
194           .int        L(P3Q7)-L(fwdPxQx)
195           .int        L(P4Q7)-L(fwdPxQx)
196           .int        L(P5Q7)-L(fwdPxQx)
197           .int        L(P6Q7)-L(fwdPxQx)
198           .int        L(P7Q7)-L(fwdPxQx)
199
200           .int        L(P0Q8)-L(fwdPxQx)	# len 64..71
201           .int        L(P1Q8)-L(fwdPxQx)
202           .int        L(P2Q8)-L(fwdPxQx)
203           .int        L(P3Q8)-L(fwdPxQx)
204           .int        L(P4Q8)-L(fwdPxQx)
205           .int        L(P5Q8)-L(fwdPxQx)
206           .int        L(P6Q8)-L(fwdPxQx)
207           .int        L(P7Q8)-L(fwdPxQx)
208
209           .int        L(P0Q9)-L(fwdPxQx)	# len 72..79
210           .int        L(P1Q9)-L(fwdPxQx)
211           .int        L(P2Q9)-L(fwdPxQx)
212           .int        L(P3Q9)-L(fwdPxQx)
213           .int        L(P4Q9)-L(fwdPxQx)
214           .int        L(P5Q9)-L(fwdPxQx)
215           .int        L(P6Q9)-L(fwdPxQx)
216           .int        L(P7Q9)-L(fwdPxQx)
217
218           .int        L(P0QA)-L(fwdPxQx)	# len 80..87
219           .int        L(P1QA)-L(fwdPxQx)
220           .int        L(P2QA)-L(fwdPxQx)
221           .int        L(P3QA)-L(fwdPxQx)
222           .int        L(P4QA)-L(fwdPxQx)
223           .int        L(P5QA)-L(fwdPxQx)
224           .int        L(P6QA)-L(fwdPxQx)
225           .int        L(P7QA)-L(fwdPxQx)
226
227           .int        L(P0QB)-L(fwdPxQx)	# len 88..95
228           .int        L(P1QB)-L(fwdPxQx)
229           .int        L(P2QB)-L(fwdPxQx)
230           .int        L(P3QB)-L(fwdPxQx)
231           .int        L(P4QB)-L(fwdPxQx)
232           .int        L(P5QB)-L(fwdPxQx)
233           .int        L(P6QB)-L(fwdPxQx)
234           .int        L(P7QB)-L(fwdPxQx)
235
236           .int        L(P0QC)-L(fwdPxQx)	# len 96..103
237           .int        L(P1QC)-L(fwdPxQx)
238           .int        L(P2QC)-L(fwdPxQx)
239           .int        L(P3QC)-L(fwdPxQx)
240           .int        L(P4QC)-L(fwdPxQx)
241           .int        L(P5QC)-L(fwdPxQx)
242           .int        L(P6QC)-L(fwdPxQx)
243           .int        L(P7QC)-L(fwdPxQx)
244
245           .int        L(P0QD)-L(fwdPxQx)	# len 104..111
246           .int        L(P1QD)-L(fwdPxQx)
247           .int        L(P2QD)-L(fwdPxQx)
248           .int        L(P3QD)-L(fwdPxQx)
249           .int        L(P4QD)-L(fwdPxQx)
250           .int        L(P5QD)-L(fwdPxQx)
251           .int        L(P6QD)-L(fwdPxQx)
252           .int        L(P7QD)-L(fwdPxQx)
253
254           .int        L(P0QE)-L(fwdPxQx)	# len 112..119
255           .int        L(P1QE)-L(fwdPxQx)
256           .int        L(P2QE)-L(fwdPxQx)
257           .int        L(P3QE)-L(fwdPxQx)
258           .int        L(P4QE)-L(fwdPxQx)
259           .int        L(P5QE)-L(fwdPxQx)
260           .int        L(P6QE)-L(fwdPxQx)
261           .int        L(P7QE)-L(fwdPxQx)
262
263           .int        L(P0QF)-L(fwdPxQx)	# len 120..127
264           .int        L(P1QF)-L(fwdPxQx)
265           .int        L(P2QF)-L(fwdPxQx)
266           .int        L(P3QF)-L(fwdPxQx)
267           .int        L(P4QF)-L(fwdPxQx)
268           .int        L(P5QF)-L(fwdPxQx)
269           .int        L(P6QF)-L(fwdPxQx)
270           .int        L(P7QF)-L(fwdPxQx)
271
272           .int        L(P0QG)-L(fwdPxQx)	# 0x80
273
274	   .balign 16
275L(AliPxQx): .int   L(now_qw_aligned)-L(AliPxQx)	# indexed by dst & 0xf; index 0 means dst is already 16-byte aligned
276           .int        L(A1Q0)-L(AliPxQx)	# index k (1..15) selects the stub that copies 16-k bytes
277           .int        L(A2Q0)-L(AliPxQx)
278           .int        L(A3Q0)-L(AliPxQx)
279           .int        L(A4Q0)-L(AliPxQx)
280           .int        L(A5Q0)-L(AliPxQx)
281           .int        L(A6Q0)-L(AliPxQx)
282           .int        L(A7Q0)-L(AliPxQx)
283           .int        L(A0Q1)-L(AliPxQx)
284           .int        L(A1Q1)-L(AliPxQx)
285           .int        L(A2Q1)-L(AliPxQx)
286           .int        L(A3Q1)-L(AliPxQx)
287           .int        L(A4Q1)-L(AliPxQx)
288           .int        L(A5Q1)-L(AliPxQx)
289           .int        L(A6Q1)-L(AliPxQx)
290           .int        L(A7Q1)-L(AliPxQx)
291
292	.balign 16
293L(A1Q0):			# dst & 0xf == 1: move 15 = 1+2+4+8 bytes so dst becomes 16-byte aligned
294	movzbq (%rdx),%r11
295	sub    $0xf,%r8			# len -= 15
296	mov    %r11b,(%rcx)		# 1 byte
297
298	movzwq 0x1(%rdx),%r10
299	mov    %r10w,0x1(%rcx)		# 2 bytes
300
301	mov    0x3(%rdx),%r9d
302	mov    %r9d,0x3(%rcx)		# 4 bytes
303
304	mov    0x7(%rdx),%r11
305	add    $0xf,%rdx		# src += 15
306	mov    %r11,0x7(%rcx)		# 8 bytes
307
308	add    $0xf,%rcx		# dst += 15
309	jmp    L(now_qw_aligned)
310
311	.balign 16
312L(A2Q0):			# dst & 0xf == 2: move 14 = 2+4+8 bytes so dst becomes 16-byte aligned
313	movzwq (%rdx),%r10
314	sub    $0xe,%r8			# len -= 14
315	mov    %r10w,(%rcx)		# 2 bytes
316
317	mov    0x2(%rdx),%r9d
318	mov    %r9d,0x2(%rcx)		# 4 bytes
319
320	mov    0x6(%rdx),%r11
321	add    $0xe,%rdx		# src += 14
322	mov    %r11,0x6(%rcx)		# 8 bytes
323	add    $0xe,%rcx		# dst += 14
324	jmp    L(now_qw_aligned)
325
326	.balign 16
327L(A3Q0):			# dst & 0xf == 3: move 13 = 1+4+8 bytes so dst becomes 16-byte aligned
328	movzbq (%rdx),%r11
329	sub    $0xd,%r8			# len -= 13
330	mov    %r11b,(%rcx)		# 1 byte
331
332	mov    0x1(%rdx),%r9d
333	mov    %r9d,0x1(%rcx)		# 4 bytes
334
335	mov    0x5(%rdx),%r10
336	add    $0xd,%rdx		# src += 13
337	mov    %r10,0x5(%rcx)		# 8 bytes
338
339	add    $0xd,%rcx		# dst += 13
340	jmp    L(now_qw_aligned)
341
342	.balign 16
343L(A4Q0):			# dst & 0xf == 4: move 12 = 4+8 bytes so dst becomes 16-byte aligned
344	mov    (%rdx),%r9d
345	sub    $0xc,%r8			# len -= 12
346	mov    %r9d,(%rcx)		# 4 bytes
347
348	mov    0x4(%rdx),%r10
349	add    $0xc,%rdx		# src += 12
350	mov    %r10,0x4(%rcx)		# 8 bytes
351
352	add    $0xc,%rcx		# dst += 12
353	jmp    L(now_qw_aligned)
354
355	.balign 16
356L(A5Q0):			# dst & 0xf == 5: move 11 = 1+2+8 bytes so dst becomes 16-byte aligned
357	movzbq (%rdx),%r11
358	sub    $0xb,%r8			# len -= 11
359	mov    %r11b,(%rcx)		# 1 byte
360
361	movzwq 0x1(%rdx),%r10
362	mov    %r10w,0x1(%rcx)		# 2 bytes
363
364	mov    0x3(%rdx),%r9
365	add    $0xb,%rdx		# src += 11
366	mov    %r9,0x3(%rcx)		# 8 bytes
367
368	add    $0xb,%rcx		# dst += 11
369	jmp    L(now_qw_aligned)
370
371	.balign 16
372L(A6Q0):			# dst & 0xf == 6: move 10 = 2+8 bytes so dst becomes 16-byte aligned
373	movzwq (%rdx),%r10
374	sub    $0xa,%r8			# len -= 10
375	mov    %r10w,(%rcx)		# 2 bytes
376
377	mov    0x2(%rdx),%r9
378	add    $0xa,%rdx		# src += 10
379	mov    %r9,0x2(%rcx)		# 8 bytes
380
381	add    $0xa,%rcx		# dst += 10
382	jmp    L(now_qw_aligned)
383
384	.balign 16
385L(A7Q0):			# dst & 0xf == 7: move 9 = 1+8 bytes so dst becomes 16-byte aligned
386	movzbq (%rdx),%r11
387	sub    $0x9,%r8			# len -= 9
388	mov    %r11b,(%rcx)		# 1 byte
389
390	mov    0x1(%rdx),%r10
391	add    $0x9,%rdx		# src += 9
392	mov    %r10,0x1(%rcx)		# 8 bytes
393
394	add    $0x9,%rcx		# dst += 9
395	jmp    L(now_qw_aligned)
396
397	.balign 16
398L(A0Q1):			# dst & 0xf == 8: move 8 bytes so dst becomes 16-byte aligned
399
400	mov    (%rdx),%r10
401	add    $0x8,%rdx		# src += 8
402	sub    $0x8,%r8			# len -= 8
403	mov    %r10,(%rcx)		# 8 bytes
404
405	add    $0x8,%rcx		# dst += 8
406	jmp    L(now_qw_aligned)
407
408	.balign 16
409L(A1Q1):			# dst & 0xf == 9: move 7 = 1+2+4 bytes so dst becomes 16-byte aligned
410	movzbq (%rdx),%r11
411	sub    $0x7,%r8			# len -= 7
412	mov    %r11b,(%rcx)		# 1 byte
413
414	movzwq 0x1(%rdx),%r10
415	mov    %r10w,0x1(%rcx)		# 2 bytes
416
417	mov    0x3(%rdx),%r9d
418	add    $0x7,%rdx		# src += 7
419	mov    %r9d,0x3(%rcx)		# 4 bytes
420	add    $0x7,%rcx		# dst += 7
421	jmp    L(now_qw_aligned)
422
423	.balign 16
424L(A2Q1):			# dst & 0xf == 10: move 6 = 2+4 bytes so dst becomes 16-byte aligned
425	movzwq (%rdx),%r10
426	sub    $0x6,%r8			# len -= 6
427	mov    %r10w,(%rcx)		# 2 bytes
428	mov    0x2(%rdx),%r9d
429	add    $0x6,%rdx		# src += 6
430	mov    %r9d,0x2(%rcx)		# 4 bytes
431	add    $0x6,%rcx		# dst += 6
432	jmp    L(now_qw_aligned)
433
434	.balign 16
435L(A3Q1):			# dst & 0xf == 11: move 5 = 1+4 bytes so dst becomes 16-byte aligned
436	movzbq (%rdx),%r11
437	sub    $0x5,%r8			# len -= 5
438	mov    %r11b,(%rcx)		# 1 byte
439	mov    0x1(%rdx),%r9d
440	add    $0x5,%rdx		# src += 5
441	mov    %r9d,0x1(%rcx)		# 4 bytes
442	add    $0x5,%rcx		# dst += 5
443	jmp    L(now_qw_aligned)
444
445	.balign 16
446L(A4Q1):			# dst & 0xf == 12: move 4 bytes so dst becomes 16-byte aligned
447	mov    (%rdx),%r9d
448	sub    $0x4,%r8			# len -= 4
449	add    $0x4,%rdx		# src += 4
450	mov    %r9d,(%rcx)		# 4 bytes
451	add    $0x4,%rcx		# dst += 4
452	jmp    L(now_qw_aligned)
453
454	.balign 16
455L(A5Q1):			# dst & 0xf == 13: move 3 = 1+2 bytes so dst becomes 16-byte aligned
456	movzbq (%rdx),%r11
457	sub    $0x3,%r8			# len -= 3
458	mov    %r11b,(%rcx)		# 1 byte
459
460	movzwq 0x1(%rdx),%r10
461	add    $0x3,%rdx		# src += 3
462	mov    %r10w,0x1(%rcx)		# 2 bytes
463
464	add    $0x3,%rcx		# dst += 3
465	jmp    L(now_qw_aligned)
466
467	.balign 16
468L(A6Q1):			# dst & 0xf == 14: move 2 bytes so dst becomes 16-byte aligned
469	movzwq (%rdx),%r10
470	sub    $0x2,%r8			# len -= 2
471	add    $0x2,%rdx		# src += 2
472	mov    %r10w,(%rcx)		# 2 bytes
473	add    $0x2,%rcx		# dst += 2
474	jmp    L(now_qw_aligned)
475
476	.balign 16
477L(A7Q1):			# dst & 0xf == 15: move 1 byte so dst becomes 16-byte aligned
478	movzbq (%rdx),%r11
479	dec    %r8			# len -= 1
480	inc    %rdx			# src += 1
481	mov    %r11b,(%rcx)		# 1 byte
482	inc    %rcx			# dst += 1
483	jmp    L(now_qw_aligned)
484
485
486	.balign 16
487L(P0QG):			# P0Qn: copy the final n quadwords (G = 16, i.e. 0x80 bytes); rdx/rcx point one past the end of src/dst
488	mov    -0x80(%rdx),%r9
489	mov    %r9,-0x80(%rcx)
490L(P0QF):			# each label falls through into the next, copying 8 more bytes
491	mov    -0x78(%rdx),%r10
492	mov    %r10,-0x78(%rcx)
493L(P0QE):
494	mov    -0x70(%rdx),%r9
495	mov    %r9,-0x70(%rcx)
496L(P0QD):
497	mov    -0x68(%rdx),%r10
498	mov    %r10,-0x68(%rcx)
499L(P0QC):
500	mov    -0x60(%rdx),%r9
501	mov    %r9,-0x60(%rcx)
502L(P0QB):
503	mov    -0x58(%rdx),%r10
504	mov    %r10,-0x58(%rcx)
505L(P0QA):
506	mov    -0x50(%rdx),%r9
507	mov    %r9,-0x50(%rcx)
508L(P0Q9):
509	mov    -0x48(%rdx),%r10
510	mov    %r10,-0x48(%rcx)
511L(P0Q8):
512	mov    -0x40(%rdx),%r9
513	mov    %r9,-0x40(%rcx)
514L(P0Q7):
515	mov    -0x38(%rdx),%r10
516	mov    %r10,-0x38(%rcx)
517L(P0Q6):
518	mov    -0x30(%rdx),%r9
519	mov    %r9,-0x30(%rcx)
520L(P0Q5):
521	mov    -0x28(%rdx),%r10
522	mov    %r10,-0x28(%rcx)
523L(P0Q4):
524	mov    -0x20(%rdx),%r9
525	mov    %r9,-0x20(%rcx)
526L(P0Q3):
527	mov    -0x18(%rdx),%r10
528	mov    %r10,-0x18(%rcx)
529L(P0Q2):
530	mov    -0x10(%rdx),%r9
531	mov    %r9,-0x10(%rcx)
532L(P0Q1):
533	mov    -0x8(%rdx),%r10
534	mov    %r10,-0x8(%rcx)
535L(P0Q0):			# nothing left to copy; rax still holds the original dst
536	ret
537
538	.balign 16
539L(P1QF):			# P1Qn: copy n quadwords then 1 trailing byte; rdx/rcx point one past the end of src/dst
540	mov    -0x79(%rdx),%r9
541	mov    %r9,-0x79(%rcx)
542L(P1QE):
543	mov    -0x71(%rdx),%r11
544	mov    %r11,-0x71(%rcx)
545L(P1QD):
546	mov    -0x69(%rdx),%r10
547	mov    %r10,-0x69(%rcx)
548L(P1QC):
549	mov    -0x61(%rdx),%r9
550	mov    %r9,-0x61(%rcx)
551L(P1QB):
552	mov    -0x59(%rdx),%r11
553	mov    %r11,-0x59(%rcx)
554L(P1QA):
555	mov    -0x51(%rdx),%r10
556	mov    %r10,-0x51(%rcx)
557L(P1Q9):
558	mov    -0x49(%rdx),%r9
559	mov    %r9,-0x49(%rcx)
560L(P1Q8):
561	mov    -0x41(%rdx),%r11
562	mov    %r11,-0x41(%rcx)
563L(P1Q7):
564	mov    -0x39(%rdx),%r10
565	mov    %r10,-0x39(%rcx)
566L(P1Q6):
567	mov    -0x31(%rdx),%r9
568	mov    %r9,-0x31(%rcx)
569L(P1Q5):
570	mov    -0x29(%rdx),%r11
571	mov    %r11,-0x29(%rcx)
572L(P1Q4):
573	mov    -0x21(%rdx),%r10
574	mov    %r10,-0x21(%rcx)
575L(P1Q3):
576	mov    -0x19(%rdx),%r9
577	mov    %r9,-0x19(%rcx)
578L(P1Q2):
579	mov    -0x11(%rdx),%r11
580	mov    %r11,-0x11(%rcx)
581L(P1Q1):
582	mov    -0x9(%rdx),%r10
583	mov    %r10,-0x9(%rcx)
584L(P1Q0):			# final 1 byte
585	movzbq -0x1(%rdx),%r9
586	mov    %r9b,-0x1(%rcx)
587	ret
588
589	.balign 16
590L(P2QF):			# P2Qn: copy n quadwords then 2 trailing bytes; rdx/rcx point one past the end of src/dst
591	mov    -0x7a(%rdx),%r9
592	mov    %r9,-0x7a(%rcx)
593L(P2QE):
594	mov    -0x72(%rdx),%r11
595	mov    %r11,-0x72(%rcx)
596L(P2QD):
597	mov    -0x6a(%rdx),%r10
598	mov    %r10,-0x6a(%rcx)
599L(P2QC):
600	mov    -0x62(%rdx),%r9
601	mov    %r9,-0x62(%rcx)
602L(P2QB):
603	mov    -0x5a(%rdx),%r11
604	mov    %r11,-0x5a(%rcx)
605L(P2QA):
606	mov    -0x52(%rdx),%r10
607	mov    %r10,-0x52(%rcx)
608L(P2Q9):
609	mov    -0x4a(%rdx),%r9
610	mov    %r9,-0x4a(%rcx)
611L(P2Q8):
612	mov    -0x42(%rdx),%r11
613	mov    %r11,-0x42(%rcx)
614L(P2Q7):
615	mov    -0x3a(%rdx),%r10
616	mov    %r10,-0x3a(%rcx)
617L(P2Q6):
618	mov    -0x32(%rdx),%r9
619	mov    %r9,-0x32(%rcx)
620L(P2Q5):
621	mov    -0x2a(%rdx),%r11
622	mov    %r11,-0x2a(%rcx)
623L(P2Q4):
624	mov    -0x22(%rdx),%r10
625	mov    %r10,-0x22(%rcx)
626L(P2Q3):
627	mov    -0x1a(%rdx),%r9
628	mov    %r9,-0x1a(%rcx)
629L(P2Q2):
630	mov    -0x12(%rdx),%r11
631	mov    %r11,-0x12(%rcx)
632L(P2Q1):
633	mov    -0xa(%rdx),%r10
634	mov    %r10,-0xa(%rcx)
635L(P2Q0):			# final 2 bytes
636	movzwq -0x2(%rdx),%r9
637	mov    %r9w,-0x2(%rcx)
638	ret
639
640	.balign 16
641L(P3QF):			# P3Qn: copy n quadwords then 3 trailing bytes; rdx/rcx point one past the end of src/dst
642	mov    -0x7b(%rdx),%r9
643	mov    %r9,-0x7b(%rcx)
644L(P3QE):
645	mov    -0x73(%rdx),%r11
646	mov    %r11,-0x73(%rcx)
647L(P3QD):
648	mov    -0x6b(%rdx),%r10
649	mov    %r10,-0x6b(%rcx)
650L(P3QC):
651	mov    -0x63(%rdx),%r9
652	mov    %r9,-0x63(%rcx)
653L(P3QB):
654	mov    -0x5b(%rdx),%r11
655	mov    %r11,-0x5b(%rcx)
656L(P3QA):
657	mov    -0x53(%rdx),%r10
658	mov    %r10,-0x53(%rcx)
659L(P3Q9):
660	mov    -0x4b(%rdx),%r9
661	mov    %r9,-0x4b(%rcx)
662L(P3Q8):
663	mov    -0x43(%rdx),%r11
664	mov    %r11,-0x43(%rcx)
665L(P3Q7):
666	mov    -0x3b(%rdx),%r10
667	mov    %r10,-0x3b(%rcx)
668L(P3Q6):
669	mov    -0x33(%rdx),%r9
670	mov    %r9,-0x33(%rcx)
671L(P3Q5):
672	mov    -0x2b(%rdx),%r11
673	mov    %r11,-0x2b(%rcx)
674L(P3Q4):
675	mov    -0x23(%rdx),%r10
676	mov    %r10,-0x23(%rcx)
677L(P3Q3):
678	mov    -0x1b(%rdx),%r9
679	mov    %r9,-0x1b(%rcx)
680L(P3Q2):
681	mov    -0x13(%rdx),%r11
682	mov    %r11,-0x13(%rcx)
683L(P3Q1):
684	mov    -0xb(%rdx),%r10
685	mov    %r10,-0xb(%rcx)
686	/*
687	 * These trailing loads/stores have to do all their loads 1st,
688	 * then do the stores.
689	 */
690L(P3Q0):			# final 3 bytes = 2 + 1
691	movzwq -0x3(%rdx),%r9
692	movzbq -0x1(%rdx),%r10
693	mov    %r9w,-0x3(%rcx)
694	mov    %r10b,-0x1(%rcx)
695	ret
696
697	.balign 16
698L(P4QF):			# P4Qn: copy n quadwords then 4 trailing bytes; rdx/rcx point one past the end of src/dst
699	mov    -0x7c(%rdx),%r9
700	mov    %r9,-0x7c(%rcx)
701L(P4QE):
702	mov    -0x74(%rdx),%r11
703	mov    %r11,-0x74(%rcx)
704L(P4QD):
705	mov    -0x6c(%rdx),%r10
706	mov    %r10,-0x6c(%rcx)
707L(P4QC):
708	mov    -0x64(%rdx),%r9
709	mov    %r9,-0x64(%rcx)
710L(P4QB):
711	mov    -0x5c(%rdx),%r11
712	mov    %r11,-0x5c(%rcx)
713L(P4QA):
714	mov    -0x54(%rdx),%r10
715	mov    %r10,-0x54(%rcx)
716L(P4Q9):
717	mov    -0x4c(%rdx),%r9
718	mov    %r9,-0x4c(%rcx)
719L(P4Q8):
720	mov    -0x44(%rdx),%r11
721	mov    %r11,-0x44(%rcx)
722L(P4Q7):
723	mov    -0x3c(%rdx),%r10
724	mov    %r10,-0x3c(%rcx)
725L(P4Q6):
726	mov    -0x34(%rdx),%r9
727	mov    %r9,-0x34(%rcx)
728L(P4Q5):
729	mov    -0x2c(%rdx),%r11
730	mov    %r11,-0x2c(%rcx)
731L(P4Q4):
732	mov    -0x24(%rdx),%r10
733	mov    %r10,-0x24(%rcx)
734L(P4Q3):
735	mov    -0x1c(%rdx),%r9
736	mov    %r9,-0x1c(%rcx)
737L(P4Q2):
738	mov    -0x14(%rdx),%r11
739	mov    %r11,-0x14(%rcx)
740L(P4Q1):
741	mov    -0xc(%rdx),%r10
742	mov    %r10,-0xc(%rcx)
743L(P4Q0):			# final 4 bytes
744	mov    -0x4(%rdx),%r9d
745	mov    %r9d,-0x4(%rcx)
746	ret
747
748	.balign 16
749L(P5QF):			# P5Qn: copy n quadwords then 5 trailing bytes; rdx/rcx point one past the end of src/dst
750	mov    -0x7d(%rdx),%r9
751	mov    %r9,-0x7d(%rcx)
752L(P5QE):
753	mov    -0x75(%rdx),%r11
754	mov    %r11,-0x75(%rcx)
755L(P5QD):
756	mov    -0x6d(%rdx),%r10
757	mov    %r10,-0x6d(%rcx)
758L(P5QC):
759	mov    -0x65(%rdx),%r9
760	mov    %r9,-0x65(%rcx)
761L(P5QB):
762	mov    -0x5d(%rdx),%r11
763	mov    %r11,-0x5d(%rcx)
764L(P5QA):
765	mov    -0x55(%rdx),%r10
766	mov    %r10,-0x55(%rcx)
767L(P5Q9):
768	mov    -0x4d(%rdx),%r9
769	mov    %r9,-0x4d(%rcx)
770L(P5Q8):
771	mov    -0x45(%rdx),%r11
772	mov    %r11,-0x45(%rcx)
773L(P5Q7):
774	mov    -0x3d(%rdx),%r10
775	mov    %r10,-0x3d(%rcx)
776L(P5Q6):
777	mov    -0x35(%rdx),%r9
778	mov    %r9,-0x35(%rcx)
779L(P5Q5):
780	mov    -0x2d(%rdx),%r11
781	mov    %r11,-0x2d(%rcx)
782L(P5Q4):
783	mov    -0x25(%rdx),%r10
784	mov    %r10,-0x25(%rcx)
785L(P5Q3):
786	mov    -0x1d(%rdx),%r9
787	mov    %r9,-0x1d(%rcx)
788L(P5Q2):
789	mov    -0x15(%rdx),%r11
790	mov    %r11,-0x15(%rcx)
791L(P5Q1):
792	mov    -0xd(%rdx),%r10
793	mov    %r10,-0xd(%rcx)
794	/*
795	 * These trailing loads/stores have to do all their loads 1st,
796	 * then do the stores.
797	 */
798L(P5Q0):			# final 5 bytes = 4 + 1
799	mov    -0x5(%rdx),%r9d
800	movzbq -0x1(%rdx),%r10
801	mov    %r9d,-0x5(%rcx)
802	mov    %r10b,-0x1(%rcx)
803	ret
804
805	.balign 16
806L(P6QF):			# P6Qn: copy n quadwords then 6 trailing bytes; rdx/rcx point one past the end of src/dst
807	mov    -0x7e(%rdx),%r9
808	mov    %r9,-0x7e(%rcx)
809L(P6QE):
810	mov    -0x76(%rdx),%r11
811	mov    %r11,-0x76(%rcx)
812L(P6QD):
813	mov    -0x6e(%rdx),%r10
814	mov    %r10,-0x6e(%rcx)
815L(P6QC):
816	mov    -0x66(%rdx),%r9
817	mov    %r9,-0x66(%rcx)
818L(P6QB):
819	mov    -0x5e(%rdx),%r11
820	mov    %r11,-0x5e(%rcx)
821L(P6QA):
822	mov    -0x56(%rdx),%r10
823	mov    %r10,-0x56(%rcx)
824L(P6Q9):
825	mov    -0x4e(%rdx),%r9
826	mov    %r9,-0x4e(%rcx)
827L(P6Q8):
828	mov    -0x46(%rdx),%r11
829	mov    %r11,-0x46(%rcx)
830L(P6Q7):
831	mov    -0x3e(%rdx),%r10
832	mov    %r10,-0x3e(%rcx)
833L(P6Q6):
834	mov    -0x36(%rdx),%r9
835	mov    %r9,-0x36(%rcx)
836L(P6Q5):
837	mov    -0x2e(%rdx),%r11
838	mov    %r11,-0x2e(%rcx)
839L(P6Q4):
840	mov    -0x26(%rdx),%r10
841	mov    %r10,-0x26(%rcx)
842L(P6Q3):
843	mov    -0x1e(%rdx),%r9
844	mov    %r9,-0x1e(%rcx)
845L(P6Q2):
846	mov    -0x16(%rdx),%r11
847	mov    %r11,-0x16(%rcx)
848L(P6Q1):
849	mov    -0xe(%rdx),%r10
850	mov    %r10,-0xe(%rcx)
851	/*
852	 * These trailing loads/stores have to do all their loads 1st,
853	 * then do the stores.
854	 */
855L(P6Q0):			# final 6 bytes = 4 + 2
856	mov    -0x6(%rdx),%r9d
857	movzwq -0x2(%rdx),%r10
858	mov    %r9d,-0x6(%rcx)
859	mov    %r10w,-0x2(%rcx)
860	ret
861
862	.balign 16
863L(P7QF):			# P7Qn: copy n quadwords then 7 trailing bytes; rdx/rcx point one past the end of src/dst
864	mov    -0x7f(%rdx),%r9
865	mov    %r9,-0x7f(%rcx)
866L(P7QE):
867	mov    -0x77(%rdx),%r11
868	mov    %r11,-0x77(%rcx)
869L(P7QD):
870	mov    -0x6f(%rdx),%r10
871	mov    %r10,-0x6f(%rcx)
872L(P7QC):
873	mov    -0x67(%rdx),%r9
874	mov    %r9,-0x67(%rcx)
875L(P7QB):
876	mov    -0x5f(%rdx),%r11
877	mov    %r11,-0x5f(%rcx)
878L(P7QA):
879	mov    -0x57(%rdx),%r10
880	mov    %r10,-0x57(%rcx)
881L(P7Q9):
882	mov    -0x4f(%rdx),%r9
883	mov    %r9,-0x4f(%rcx)
884L(P7Q8):
885	mov    -0x47(%rdx),%r11
886	mov    %r11,-0x47(%rcx)
887L(P7Q7):
888	mov    -0x3f(%rdx),%r10
889	mov    %r10,-0x3f(%rcx)
890L(P7Q6):
891	mov    -0x37(%rdx),%r9
892	mov    %r9,-0x37(%rcx)
893L(P7Q5):
894	mov    -0x2f(%rdx),%r11
895	mov    %r11,-0x2f(%rcx)
896L(P7Q4):
897	mov    -0x27(%rdx),%r10
898	mov    %r10,-0x27(%rcx)
899L(P7Q3):
900	mov    -0x1f(%rdx),%r9
901	mov    %r9,-0x1f(%rcx)
902L(P7Q2):
903	mov    -0x17(%rdx),%r11
904	mov    %r11,-0x17(%rcx)
905L(P7Q1):
906	mov    -0xf(%rdx),%r10
907	mov    %r10,-0xf(%rcx)
908	/*
909	 * These trailing loads/stores have to do all their loads 1st,
910	 * then do the stores.
911	 */
912L(P7Q0):			# final 7 bytes = 4 + 2 + 1
913	mov    -0x7(%rdx),%r9d
914	movzwq -0x3(%rdx),%r10
915	movzbq -0x1(%rdx),%r11
916	mov    %r9d,-0x7(%rcx)
917	mov    %r10w,-0x3(%rcx)
918	mov    %r11b,-0x1(%rcx)
919	ret
920
921	.balign 16
922L(ck_use_sse2):			# large-copy path; rcx = dst, rdx = src, r8 = len
923	/*
924	 * Align dest to 16 byte boundary.
925	 */
926	test   $0xf,%rcx
927	jnz    L(ShrtAlignNew)		# copy 1..15 bytes first, then come back to now_qw_aligned
928
929L(now_qw_aligned):			# dst (rcx) is 16-byte aligned from here on
930	cmpl   $NO_SSE,.memops_method(%rip)
931	je     L(Loop8byte_pre)
932
933	/*
934	 * The fall-through path is to do SSE2 16-byte load/stores
935	 */
936
937	/*
938	 * If current move size is larger than half of the highest level cache
939	 * size, then do non-temporal moves.
940	 */
941	mov    .largest_level_cache_size(%rip),%r9d	# 32-bit load, zero-extended into r9
942	shr    %r9		# take half of it
943	cmp    %r9,%r8
944	ja     L(sse2_nt_move)		# unsigned compare: both operands are byte counts
945
946	/*
947	 * If both the source and dest are aligned, then use the both aligned
948	 * logic. Well aligned data should reap the rewards.
949	 */
950	test   $0xf,%rdx
951	jz     L(pre_both_aligned)
952
953	lea    L(SSE_src)(%rip),%r10		# SSE2 (default)
954	testl  $USE_SSSE3,.memops_method(%rip)
955	jz     1f
956	lea    L(SSSE3_src)(%rip),%r10		# SSSE3
957
9581:
959	/*
960	 * if the src is not 16 byte aligned...
961	 */
962	mov    %rdx,%r11
963	and    $0xf,%r11		# r11 = src & 0xf, the misalignment and the table index
964	movdqu (%rdx),%xmm0		# copy the first 16 bytes with an unaligned load
965	movdqa %xmm0,(%rcx)
966	add    $0x10,%rdx
967	sub    %r11,%rdx		# rdx = (src + 16) rounded down to a 16-byte boundary
968	add    $0x10,%rcx
969	sub    $0x10,%r8
970	movdqa (%rdx),%xmm1		# xmm1 = aligned block holding the next source bytes
971
972	movslq (%r10,%r11,4),%r9	# r9 = signed 32-bit offset of the loop from the table base
973	lea    (%r9,%r10,1),%r10	# r10 = address of the loop for this misalignment
974	jmpq   *%r10
975
976	    .balign 16
977L(SSSE3_src): .int	L(pre_both_aligned)-L(SSSE3_src)	# indexed by src & 0xf; entries are offsets from this table base
978	    .int        L(mov3dqa1) -L(SSSE3_src)
979	    .int        L(mov3dqa2) -L(SSSE3_src)
980	    .int        L(mov3dqa3) -L(SSSE3_src)
981	    .int        L(mov3dqa4) -L(SSSE3_src)
982	    .int        L(mov3dqa5) -L(SSSE3_src)
983	    .int        L(mov3dqa6) -L(SSSE3_src)
984	    .int        L(mov3dqa7) -L(SSSE3_src)
985	    .int        L(movdqa8)  -L(SSSE3_src)	# offset 8 shares the SSE2 loop
986	    .int        L(mov3dqa9) -L(SSSE3_src)
987	    .int        L(mov3dqa10)-L(SSSE3_src)
988	    .int        L(mov3dqa11)-L(SSSE3_src)
989	    .int        L(mov3dqa12)-L(SSSE3_src)
990	    .int        L(mov3dqa13)-L(SSSE3_src)
991	    .int        L(mov3dqa14)-L(SSSE3_src)
992	    .int        L(mov3dqa15)-L(SSSE3_src)
993L(SSE_src): .int    L(pre_both_aligned)-L(SSE_src)	# SSE2 variant, indexed by src & 0xf; offsets from this table base
994	    .int        L(movdqa1) -L(SSE_src)
995	    .int        L(movdqa2) -L(SSE_src)
996	    .int        L(movdqa3) -L(SSE_src)
997	    .int        L(movdqa4) -L(SSE_src)
998	    .int        L(movdqa5) -L(SSE_src)
999	    .int        L(movdqa6) -L(SSE_src)
1000	    .int        L(movdqa7) -L(SSE_src)
1001	    .int        L(movdqa8) -L(SSE_src)
1002	    .int        L(movdqa9) -L(SSE_src)
1003	    .int        L(movdqa10)-L(SSE_src)
1004	    .int        L(movdqa11)-L(SSE_src)
1005	    .int        L(movdqa12)-L(SSE_src)
1006	    .int        L(movdqa13)-L(SSE_src)
1007	    .int        L(movdqa14)-L(SSE_src)
1008	    .int        L(movdqa15)-L(SSE_src)
1009
1010	.balign 16
1011L(movdqa1):			# src is 1 byte past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1012	movdqa 0x10(%rdx),%xmm3 # load the upper source buffer
1013	movdqa 0x20(%rdx),%xmm0 # load the upper source buffer
1014	lea    0x20(%rdx),%rdx
1015	lea    -0x20(%r8),%r8
1016
1017	psrldq $0x1,%xmm1  # shift right prev buffer (saved from last iteration)
1018	movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration
1019	pslldq $0xf,%xmm3  # shift the current buffer left (shift in zeros)
1020	por    %xmm1,%xmm3 # OR them together
1021	cmp    $0x20,%r8   # flags are consumed by the jge below; nothing in between modifies them
1022
1023	psrldq $0x1,%xmm2  # shift right prev buffer (saved from last iteration)
1024	movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration
1025	pslldq $0xf,%xmm0  # shift the current buffer left (shift in zeros)
1026	por    %xmm2,%xmm0 # OR them together
1027	movdqa %xmm3,(%rcx)     # store it
1028	movdqa %xmm0,0x10(%rcx) # store it
1029	lea    0x20(%rcx),%rcx
1030
1031	jge    L(movdqa1)       # loop while at least 32 bytes remain
1032	jmp    L(movdqa_epi)
1033
1034	.balign 16
1035L(movdqa2):			# src is 2 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1036	sub    $0x20,%r8
1037	movdqa 0x10(%rdx),%xmm3
1038	movdqa 0x20(%rdx),%xmm0
1039	add    $0x20,%rdx
1040
1041	psrldq $0x2,%xmm1		# keep the upper 14 bytes of the previous block
1042	movdqa %xmm3,%xmm2
1043	pslldq $0xe,%xmm3		# low 2 bytes of the next block move to the top
1044	por    %xmm1,%xmm3		# 16 contiguous source bytes
1045
1046	psrldq $0x2,%xmm2
1047	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1048	pslldq $0xe,%xmm0
1049	por    %xmm2,%xmm0
1050	movdqa %xmm3,(%rcx)
1051	movdqa %xmm0,0x10(%rcx)
1052
1053	add    $0x20,%rcx
1054	cmp    $0x20,%r8
1055	jge    L(movdqa2)		# loop while at least 32 bytes remain
1056	jmp    L(movdqa_epi)
1057
1058	.balign 16
1059L(movdqa3):			# src is 3 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1060	sub    $0x20,%r8
1061	movdqa 0x10(%rdx),%xmm3
1062	movdqa 0x20(%rdx),%xmm0
1063	add    $0x20,%rdx
1064
1065	psrldq $0x3,%xmm1		# keep the upper 13 bytes of the previous block
1066	movdqa %xmm3,%xmm2
1067	pslldq $0xd,%xmm3		# low 3 bytes of the next block move to the top
1068	por    %xmm1,%xmm3		# 16 contiguous source bytes
1069
1070	psrldq $0x3,%xmm2
1071	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1072	pslldq $0xd,%xmm0
1073	por    %xmm2,%xmm0
1074	movdqa %xmm3,(%rcx)
1075	movdqa %xmm0,0x10(%rcx)
1076
1077	add    $0x20,%rcx
1078	cmp    $0x20,%r8
1079	jge    L(movdqa3)		# loop while at least 32 bytes remain
1080	jmp    L(movdqa_epi)
1081
1082	.balign 16
1083L(movdqa4):			# src is 4 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1084	sub    $0x20,%r8
1085	movdqa 0x10(%rdx),%xmm3
1086	movdqa 0x20(%rdx),%xmm0
1087	add    $0x20,%rdx
1088
1089	psrldq $0x4,%xmm1		# keep the upper 12 bytes of the previous block
1090	movdqa %xmm3,%xmm2
1091	pslldq $0xc,%xmm3		# low 4 bytes of the next block move to the top
1092	por    %xmm1,%xmm3		# 16 contiguous source bytes
1093
1094	psrldq $0x4,%xmm2
1095	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1096	pslldq $0xc,%xmm0
1097	por    %xmm2,%xmm0
1098
1099	movdqa %xmm3,(%rcx)
1100	movdqa %xmm0,0x10(%rcx)
1101
1102	add    $0x20,%rcx
1103	cmp    $0x20,%r8
1104	jge    L(movdqa4)		# loop while at least 32 bytes remain
1105	jmp    L(movdqa_epi)
1106
1107	.balign 16
1108L(movdqa5):			# src is 5 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1109	sub    $0x20,%r8
1110	movdqa 0x10(%rdx),%xmm3
1111	movdqa 0x20(%rdx),%xmm0
1112	add    $0x20,%rdx
1113
1114	psrldq $0x5,%xmm1		# keep the upper 11 bytes of the previous block
1115	movdqa %xmm3,%xmm2
1116	pslldq $0xb,%xmm3		# low 5 bytes of the next block move to the top
1117	por    %xmm1,%xmm3		# 16 contiguous source bytes
1118
1119	psrldq $0x5,%xmm2
1120	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1121	pslldq $0xb,%xmm0
1122	por    %xmm2,%xmm0
1123
1124	movdqa %xmm3,(%rcx)
1125	movdqa %xmm0,0x10(%rcx)
1126
1127	add    $0x20,%rcx
1128	cmp    $0x20,%r8
1129	jge    L(movdqa5)		# loop while at least 32 bytes remain
1130	jmp    L(movdqa_epi)
1131
1132	.balign 16
1133L(movdqa6):			# src is 6 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1134	sub    $0x20,%r8
1135	movdqa 0x10(%rdx),%xmm3
1136	movdqa 0x20(%rdx),%xmm0
1137	add    $0x20,%rdx
1138
1139	psrldq $0x6,%xmm1		# keep the upper 10 bytes of the previous block
1140	movdqa %xmm3,%xmm2
1141	pslldq $0xa,%xmm3		# low 6 bytes of the next block move to the top
1142	por    %xmm1,%xmm3		# 16 contiguous source bytes
1143
1144	psrldq $0x6,%xmm2
1145	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1146	pslldq $0xa,%xmm0
1147	por    %xmm2,%xmm0
1148	movdqa %xmm3,(%rcx)
1149	movdqa %xmm0,0x10(%rcx)
1150
1151	add    $0x20,%rcx
1152	cmp    $0x20,%r8
1153	jge    L(movdqa6)		# loop while at least 32 bytes remain
1154	jmp    L(movdqa_epi)
1155
1156	.balign 16
1157L(movdqa7):			# src is 7 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1158	sub    $0x20,%r8
1159	movdqa 0x10(%rdx),%xmm3
1160	movdqa 0x20(%rdx),%xmm0
1161	add    $0x20,%rdx
1162
1163	psrldq $0x7,%xmm1		# keep the upper 9 bytes of the previous block
1164	movdqa %xmm3,%xmm2
1165	pslldq $0x9,%xmm3		# low 7 bytes of the next block move to the top
1166	por    %xmm1,%xmm3		# 16 contiguous source bytes
1167
1168	psrldq $0x7,%xmm2
1169	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1170	pslldq $0x9,%xmm0
1171	por    %xmm2,%xmm0
1172	movdqa %xmm3,(%rcx)
1173	movdqa %xmm0,0x10(%rcx)
1174
1175	add    $0x20,%rcx
1176	cmp    $0x20,%r8
1177	jge    L(movdqa7)		# loop while at least 32 bytes remain
1178	jmp    L(movdqa_epi)
1179
1180	.balign 16
1181L(movdqa8):			# src is 8 bytes past 16-byte alignment; 48 bytes per iteration, xmm1 = previous aligned block
1182	movdqa 0x10(%rdx),%xmm3
1183	sub    $0x30,%r8
1184	movdqa 0x20(%rdx),%xmm0
1185	movdqa 0x30(%rdx),%xmm5
1186	lea    0x30(%rdx),%rdx
1187
1188	shufpd $0x1,%xmm3,%xmm1		# xmm1 = high qword of xmm1 followed by low qword of xmm3
1189	movdqa %xmm1,(%rcx)
1190
1191	cmp    $0x30,%r8		# flags are consumed by the jge below; nothing in between modifies them
1192
1193	shufpd $0x1,%xmm0,%xmm3		# xmm3 = high qword of xmm3 followed by low qword of xmm0
1194	movdqa %xmm3,0x10(%rcx)
1195
1196	movdqa %xmm5,%xmm1		# carry the newest block into the next iteration
1197	shufpd $0x1,%xmm5,%xmm0		# xmm0 = high qword of xmm0 followed by low qword of xmm5
1198	movdqa %xmm0,0x20(%rcx)
1199
1200	lea    0x30(%rcx),%rcx
1201
1202	jge    L(movdqa8)		# loop while at least 48 bytes remain
1203	jmp    L(movdqa_epi)
1204
1205	.balign 16
1206L(movdqa9):			# src is 9 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1207	sub    $0x20,%r8
1208	movdqa 0x10(%rdx),%xmm3
1209	movdqa 0x20(%rdx),%xmm0
1210	add    $0x20,%rdx
1211
1212	psrldq $0x9,%xmm1		# keep the upper 7 bytes of the previous block
1213	movdqa %xmm3,%xmm2
1214	pslldq $0x7,%xmm3		# low 9 bytes of the next block move to the top
1215	por    %xmm1,%xmm3		# 16 contiguous source bytes
1216
1217	psrldq $0x9,%xmm2
1218	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1219	pslldq $0x7,%xmm0
1220	por    %xmm2,%xmm0
1221	movdqa %xmm3,(%rcx)
1222	movdqa %xmm0,0x10(%rcx)
1223
1224	add    $0x20,%rcx
1225	cmp    $0x20,%r8
1226	jge    L(movdqa9)		# loop while at least 32 bytes remain
1227	jmp    L(movdqa_epi)
1228
1229	.balign 16
1230L(movdqa10):			# src is 10 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1231	sub    $0x20,%r8
1232	movdqa 0x10(%rdx),%xmm3
1233	movdqa 0x20(%rdx),%xmm0
1234	add    $0x20,%rdx
1235
1236	psrldq $0xa,%xmm1		# keep the upper 6 bytes of the previous block
1237	movdqa %xmm3,%xmm2
1238	pslldq $0x6,%xmm3		# low 10 bytes of the next block move to the top
1239	por    %xmm1,%xmm3		# 16 contiguous source bytes
1240
1241	psrldq $0xa,%xmm2
1242	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1243	pslldq $0x6,%xmm0
1244	por    %xmm2,%xmm0
1245	movdqa %xmm3,(%rcx)
1246	movdqa %xmm0,0x10(%rcx)
1247
1248	add    $0x20,%rcx
1249	cmp    $0x20,%r8
1250	jge    L(movdqa10)		# loop while at least 32 bytes remain
1251	jmp    L(movdqa_epi)
1252
1253	.balign 16
1254L(movdqa11):			# src is 11 bytes past 16-byte alignment; 32 bytes per iteration, xmm1 = previous aligned block
1255	sub    $0x20,%r8
1256	movdqa 0x10(%rdx),%xmm3
1257	movdqa 0x20(%rdx),%xmm0
1258	add    $0x20,%rdx
1259
1260	psrldq $0xb,%xmm1		# keep the upper 5 bytes of the previous block
1261	movdqa %xmm3,%xmm2
1262	pslldq $0x5,%xmm3		# low 11 bytes of the next block move to the top
1263	por    %xmm1,%xmm3		# 16 contiguous source bytes
1264
1265	psrldq $0xb,%xmm2
1266	movdqa %xmm0,%xmm1		# carry the newest block into the next iteration
1267	pslldq $0x5,%xmm0
1268	por    %xmm2,%xmm0
1269	movdqa %xmm3,(%rcx)
1270	movdqa %xmm0,0x10(%rcx)
1271
1272	add    $0x20,%rcx
1273	cmp    $0x20,%r8
1274	jge    L(movdqa11)		# loop while at least 32 bytes remain
1275	jmp    L(movdqa_epi)
1276
1277	.balign 16
1278L(movdqa12):
	/*
	 * Forward copy loop, 32 bytes per iteration, for data that starts
	 * 12 bytes into the aligned 16-byte block at (%rdx).  Each stored
	 * block is the top 4 bytes of the earlier load (psrldq $0xc) OR-ed
	 * with the low 12 bytes of the later load (pslldq $0x4).
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1279	sub    $0x20,%r8
1280	movdqa 0x10(%rdx),%xmm3
1281	movdqa 0x20(%rdx),%xmm0
1282	add    $0x20,%rdx
1283
1284	psrldq $0xc,%xmm1
1285	movdqa %xmm3,%xmm2
1286	pslldq $0x4,%xmm3
1287	por    %xmm1,%xmm3
1288
1289	psrldq $0xc,%xmm2
	/* keep an unshifted copy of the last load for the next iteration */
1290	movdqa %xmm0,%xmm1
1291	pslldq $0x4,%xmm0
1292	por    %xmm2,%xmm0
1293	movdqa %xmm3,(%rcx)
1294	movdqa %xmm0,0x10(%rcx)
1295
1296	add    $0x20,%rcx
1297	cmp    $0x20,%r8
1298	jge    L(movdqa12)
1299	jmp    L(movdqa_epi)
1300
1301	.balign 16
1302L(movdqa13):
	/*
	 * Forward copy loop, 32 bytes per iteration, for data that starts
	 * 13 bytes into the aligned 16-byte block at (%rdx).  Each stored
	 * block is the top 3 bytes of the earlier load (psrldq $0xd) OR-ed
	 * with the low 13 bytes of the later load (pslldq $0x3).
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1303	sub    $0x20,%r8
1304	movdqa 0x10(%rdx),%xmm3
1305	movdqa 0x20(%rdx),%xmm0
1306	add    $0x20,%rdx
1307
1308	psrldq $0xd,%xmm1
1309	movdqa %xmm3,%xmm2
1310	pslldq $0x3,%xmm3
1311	por    %xmm1,%xmm3
1312
1313	psrldq $0xd,%xmm2
	/* keep an unshifted copy of the last load for the next iteration */
1314	movdqa %xmm0,%xmm1
1315	pslldq $0x3,%xmm0
1316	por    %xmm2,%xmm0
1317	movdqa %xmm3,(%rcx)
1318	movdqa %xmm0,0x10(%rcx)
1319
1320	add    $0x20,%rcx
1321	cmp    $0x20,%r8
1322	jge    L(movdqa13)
1323	jmp    L(movdqa_epi)
1324
1325	.balign 16
1326L(movdqa14):
	/*
	 * Forward copy loop, 32 bytes per iteration, for data that starts
	 * 14 bytes into the aligned 16-byte block at (%rdx).  Each stored
	 * block is the top 2 bytes of the earlier load (psrldq $0xe) OR-ed
	 * with the low 14 bytes of the later load (pslldq $0x2).
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1327	sub    $0x20,%r8
1328	movdqa 0x10(%rdx),%xmm3
1329	movdqa 0x20(%rdx),%xmm0
1330	add    $0x20,%rdx
1331
1332	psrldq $0xe,%xmm1
1333	movdqa %xmm3,%xmm2
1334	pslldq $0x2,%xmm3
1335	por    %xmm1,%xmm3
1336
1337	psrldq $0xe,%xmm2
	/* keep an unshifted copy of the last load for the next iteration */
1338	movdqa %xmm0,%xmm1
1339	pslldq $0x2,%xmm0
1340	por    %xmm2,%xmm0
1341	movdqa %xmm3,(%rcx)
1342	movdqa %xmm0,0x10(%rcx)
1343
1344	add    $0x20,%rcx
1345	cmp    $0x20,%r8
1346	jge    L(movdqa14)
1347	jmp    L(movdqa_epi)
1348
1349	.balign 16
1350L(movdqa15):
	/*
	 * Forward copy loop, 32 bytes per iteration, for data that starts
	 * 15 bytes into the aligned 16-byte block at (%rdx).  Each stored
	 * block is the top byte of the earlier load (psrldq $0xf) OR-ed
	 * with the low 15 bytes of the later load (pslldq $0x1).
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * Unlike the sibling loops there is no closing jmp: execution falls
	 * through into L(movdqa_epi), which follows directly.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1351	sub    $0x20,%r8
1352	movdqa 0x10(%rdx),%xmm3
1353	movdqa 0x20(%rdx),%xmm0
1354	add    $0x20,%rdx
1355
1356	psrldq $0xf,%xmm1
1357	movdqa %xmm3,%xmm2
1358	pslldq $0x1,%xmm3
1359	por    %xmm1,%xmm3
1360
1361	psrldq $0xf,%xmm2
	/* keep an unshifted copy of the last load for the next iteration */
1362	movdqa %xmm0,%xmm1
1363	pslldq $0x1,%xmm0
1364	por    %xmm2,%xmm0
1365	movdqa %xmm3,(%rcx)
1366	movdqa %xmm0,0x10(%rcx)
1367
1368	add    $0x20,%rcx
1369	cmp    $0x20,%r8
1370	jge    L(movdqa15)
1371	#jmp   L(movdqa_epi)
1372
1373	.balign 16
1374L(movdqa_epi):
	/*
	 * Common exit of the shifted-copy loops: hand the last %r8 bytes to
	 * the L(fwdPxQx) jump table.  The table is read as 32-bit signed
	 * offsets relative to its own base, indexed by %r8.
	 * Both pointers are first advanced by %r8, i.e. to the end of the
	 * data (the table handlers are outside this view; presumably they
	 * address backwards from the end -- confirm).
	 * NOTE(review): %r11 is added to %rdx before the dispatch; presumably
	 * it holds the source misalignment that was stripped off so the
	 * loops could use aligned loads -- confirm where %r11 is set.
	 */
1375	lea    L(fwdPxQx)(%rip),%r10
1376	add    %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop)
1377	add    %r8,%rcx
1378	add    %r8,%rdx
1379
	/* %r10 = table base + table[%r8] */
1380	movslq (%r10,%r8,4),%r9
1381	lea    (%r9,%r10,1),%r10
1382	jmpq   *%r10
1383
1384	.balign 16
1385L(mov3dqa1):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x1: each stored block is the
	 * top 15 bytes of the earlier register followed by the low 1 byte
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1386	movdqa	0x10(%rdx),%xmm3 # load the upper source buffer
1387	sub	$0x30,%r8
1388	movdqa	0x20(%rdx),%xmm0 # load the upper source buffer
1389	movdqa	0x30(%rdx),%xmm5 # load the upper source buffer
1390	lea	0x30(%rdx),%rdx
1391	cmp	$0x30,%r8
1392
1393	movdqa	%xmm3,%xmm2       # store off xmm reg for use next iteration
1394	#palignr	$0x1,%xmm1,%xmm3
1395	.byte	0x66,0x0f,0x3a,0x0f
1396	.byte	0xd9,0x01
1397	movdqa	%xmm3,(%rcx)      # store it
1398
1399	movdqa	%xmm0,%xmm4       # store off xmm reg for use next iteration
1400	#palignr	$0x1,%xmm2,%xmm0
1401	.byte	0x66,0x0f,0x3a,0x0f
1402	.byte	0xc2,0x01
1403	movdqa	%xmm0,0x10(%rcx)  # store it
1404
1405	movdqa	%xmm5,%xmm1       # store off xmm reg for use next iteration
1406	#palignr	$0x1,%xmm4,%xmm5
1407	.byte	0x66,0x0f,0x3a,0x0f
1408	.byte	0xec,0x01
1409	movdqa	%xmm5,0x20(%rcx)  # store it
1410
1411	lea	0x30(%rcx),%rcx
1412	jge	L(mov3dqa1)
1413
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1414	cmp	$0x10,%r8
1415	jl	L(movdqa_epi)
1416	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1417	sub	$0x10,%r8
1418	lea	0x10(%rdx),%rdx
1419	movdqa	%xmm3,%xmm2		# save for use next concat
1420	#palignr	$0x1,%xmm1,%xmm3
1421	.byte	0x66,0x0f,0x3a,0x0f
1422	.byte	0xd9,0x01
1423
1424	cmp	$0x10,%r8
1425	movdqa	%xmm3,(%rcx)      	# store it
1426	lea	0x10(%rcx),%rcx
1427	jl	L(movdqa_epi)
1428
1429	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1430	sub	$0x10,%r8
1431	lea	0x10(%rdx),%rdx
1432	#palignr	$0x1,%xmm2,%xmm0
1433	.byte	0x66,0x0f,0x3a,0x0f
1434	.byte	0xc2,0x01
1435	movdqa	%xmm0,(%rcx)      	# store it
1436	lea	0x10(%rcx),%rcx
1437	jmp	L(movdqa_epi)
1438
1439	.balign 16
1440L(mov3dqa2):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x2: each stored block is the
	 * top 14 bytes of the earlier register followed by the low 2 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1441	movdqa	0x10(%rdx),%xmm3
1442	sub	$0x30,%r8
1443	movdqa	0x20(%rdx),%xmm0
1444	movdqa	0x30(%rdx),%xmm5
1445	lea	0x30(%rdx),%rdx
1446	cmp	$0x30,%r8
1447
1448	movdqa	%xmm3,%xmm2
1449	#palignr	$0x2,%xmm1,%xmm3
1450	.byte	0x66,0x0f,0x3a,0x0f
1451	.byte	0xd9,0x02
1452	movdqa	%xmm3,(%rcx)
1453
1454	movdqa	%xmm0,%xmm4
1455	#palignr	$0x2,%xmm2,%xmm0
1456	.byte	0x66,0x0f,0x3a,0x0f
1457	.byte	0xc2,0x02
1458	movdqa	%xmm0,0x10(%rcx)
1459
1460	movdqa	%xmm5,%xmm1
1461	#palignr	$0x2,%xmm4,%xmm5
1462	.byte	0x66,0x0f,0x3a,0x0f
1463	.byte	0xec,0x02
1464	movdqa	%xmm5,0x20(%rcx)
1465
1466	lea	0x30(%rcx),%rcx
1467	jge	L(mov3dqa2)
1468
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1469	cmp	$0x10,%r8
1470	jl	L(movdqa_epi)
1471	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1472	sub	$0x10,%r8
1473	lea	0x10(%rdx),%rdx
1474	movdqa	%xmm3,%xmm2		# save for use next concat
1475	#palignr	$0x2,%xmm1,%xmm3
1476	.byte	0x66,0x0f,0x3a,0x0f
1477	.byte	0xd9,0x02
1478
1479	cmp	$0x10,%r8
1480	movdqa	%xmm3,(%rcx)      	# store it
1481	lea	0x10(%rcx),%rcx
1482	jl	L(movdqa_epi)
1483
1484	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1485	sub	$0x10,%r8
1486	lea	0x10(%rdx),%rdx
1487	#palignr	$0x2,%xmm2,%xmm0
1488	.byte	0x66,0x0f,0x3a,0x0f
1489	.byte	0xc2,0x02
1490	movdqa	%xmm0,(%rcx)      	# store it
1491	lea	0x10(%rcx),%rcx
1492	jmp	L(movdqa_epi)
1493
1494	.balign 16
1495L(mov3dqa3):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x3: each stored block is the
	 * top 13 bytes of the earlier register followed by the low 3 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1496	movdqa	0x10(%rdx),%xmm3
1497	sub	$0x30,%r8
1498	movdqa	0x20(%rdx),%xmm0
1499	movdqa	0x30(%rdx),%xmm5
1500	lea	0x30(%rdx),%rdx
1501	cmp	$0x30,%r8
1502
1503	movdqa	%xmm3,%xmm2
1504	#palignr	$0x3,%xmm1,%xmm3
1505	.byte	0x66,0x0f,0x3a,0x0f
1506	.byte	0xd9,0x03
1507	movdqa	%xmm3,(%rcx)
1508
1509	movdqa	%xmm0,%xmm4
1510	#palignr	$0x3,%xmm2,%xmm0
1511	.byte	0x66,0x0f,0x3a,0x0f
1512	.byte	0xc2,0x03
1513	movdqa	%xmm0,0x10(%rcx)
1514
1515	movdqa	%xmm5,%xmm1
1516	#palignr	$0x3,%xmm4,%xmm5
1517	.byte	0x66,0x0f,0x3a,0x0f
1518	.byte	0xec,0x03
1519	movdqa	%xmm5,0x20(%rcx)
1520
1521	lea	0x30(%rcx),%rcx
1522	jge	L(mov3dqa3)
1523
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1524	cmp	$0x10,%r8
1525	jl	L(movdqa_epi)
1526	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1527	sub	$0x10,%r8
1528	lea	0x10(%rdx),%rdx
1529	movdqa	%xmm3,%xmm2		# save for use next concat
1530	#palignr	$0x3,%xmm1,%xmm3
1531	.byte	0x66,0x0f,0x3a,0x0f
1532	.byte	0xd9,0x03
1533
1534	cmp	$0x10,%r8
1535	movdqa	%xmm3,(%rcx)      	# store it
1536	lea	0x10(%rcx),%rcx
1537	jl	L(movdqa_epi)
1538
1539	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1540	sub	$0x10,%r8
1541	lea	0x10(%rdx),%rdx
1542	#palignr	$0x3,%xmm2,%xmm0
1543	.byte	0x66,0x0f,0x3a,0x0f
1544	.byte	0xc2,0x03
1545	movdqa	%xmm0,(%rcx)      	# store it
1546	lea	0x10(%rcx),%rcx
1547	jmp	L(movdqa_epi)
1548
1549	.balign 16
1550L(mov3dqa4):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x4: each stored block is the
	 * top 12 bytes of the earlier register followed by the low 4 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1551	movdqa	0x10(%rdx),%xmm3
1552	sub	$0x30,%r8
1553	movdqa	0x20(%rdx),%xmm0
1554	movdqa	0x30(%rdx),%xmm5
1555	lea	0x30(%rdx),%rdx
1556	cmp	$0x30,%r8
1557
1558	movdqa	%xmm3,%xmm2
1559	#palignr	$0x4,%xmm1,%xmm3
1560	.byte	0x66,0x0f,0x3a,0x0f
1561	.byte	0xd9,0x04
1562	movdqa	%xmm3,(%rcx)
1563
1564	movdqa	%xmm0,%xmm4
1565	#palignr	$0x4,%xmm2,%xmm0
1566	.byte	0x66,0x0f,0x3a,0x0f
1567	.byte	0xc2,0x04
1568	movdqa	%xmm0,0x10(%rcx)
1569
1570	movdqa	%xmm5,%xmm1
1571	#palignr	$0x4,%xmm4,%xmm5
1572	.byte	0x66,0x0f,0x3a,0x0f
1573	.byte	0xec,0x04
1574	movdqa	%xmm5,0x20(%rcx)
1575
1576	lea	0x30(%rcx),%rcx
1577	jge	L(mov3dqa4)
1578
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1579	cmp	$0x10,%r8
1580	jl	L(movdqa_epi)
1581	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1582	sub	$0x10,%r8
1583	lea	0x10(%rdx),%rdx
1584	movdqa	%xmm3,%xmm2		# save for use next concat
1585	#palignr	$0x4,%xmm1,%xmm3
1586	.byte	0x66,0x0f,0x3a,0x0f
1587	.byte	0xd9,0x04
1588
1589	cmp	$0x10,%r8
1590	movdqa	%xmm3,(%rcx)      	# store it
1591	lea	0x10(%rcx),%rcx
1592	jl	L(movdqa_epi)
1593
1594	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1595	sub	$0x10,%r8
1596	lea	0x10(%rdx),%rdx
1597	#palignr	$0x4,%xmm2,%xmm0
1598	.byte	0x66,0x0f,0x3a,0x0f
1599	.byte	0xc2,0x04
1600	movdqa	%xmm0,(%rcx)      	# store it
1601	lea	0x10(%rcx),%rcx
1602	jmp	L(movdqa_epi)
1603
1604	.balign 16
1605L(mov3dqa5):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x5: each stored block is the
	 * top 11 bytes of the earlier register followed by the low 5 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1606	movdqa	0x10(%rdx),%xmm3
1607	sub	$0x30,%r8
1608	movdqa	0x20(%rdx),%xmm0
1609	movdqa	0x30(%rdx),%xmm5
1610	lea	0x30(%rdx),%rdx
1611	cmp	$0x30,%r8
1612
1613	movdqa	%xmm3,%xmm2
1614	#palignr	$0x5,%xmm1,%xmm3
1615	.byte	0x66,0x0f,0x3a,0x0f
1616	.byte	0xd9,0x05
1617	movdqa	%xmm3,(%rcx)
1618
1619	movdqa	%xmm0,%xmm4
1620	#palignr	$0x5,%xmm2,%xmm0
1621	.byte	0x66,0x0f,0x3a,0x0f
1622	.byte	0xc2,0x05
1623	movdqa	%xmm0,0x10(%rcx)
1624
1625	movdqa	%xmm5,%xmm1
1626	#palignr	$0x5,%xmm4,%xmm5
1627	.byte	0x66,0x0f,0x3a,0x0f
1628	.byte	0xec,0x05
1629	movdqa	%xmm5,0x20(%rcx)
1630
1631	lea	0x30(%rcx),%rcx
1632	jge	L(mov3dqa5)
1633
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1634	cmp	$0x10,%r8
1635	jl	L(movdqa_epi)
1636	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1637	sub	$0x10,%r8
1638	lea	0x10(%rdx),%rdx
1639	movdqa	%xmm3,%xmm2		# save for use next concat
1640	#palignr	$0x5,%xmm1,%xmm3
1641	.byte	0x66,0x0f,0x3a,0x0f
1642	.byte	0xd9,0x05
1643
1644	cmp	$0x10,%r8
1645	movdqa	%xmm3,(%rcx)      	# store it
1646	lea	0x10(%rcx),%rcx
1647	jl	L(movdqa_epi)
1648
1649	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1650	sub	$0x10,%r8
1651	lea	0x10(%rdx),%rdx
1652	#palignr	$0x5,%xmm2,%xmm0
1653	.byte	0x66,0x0f,0x3a,0x0f
1654	.byte	0xc2,0x05
1655	movdqa	%xmm0,(%rcx)      	# store it
1656	lea	0x10(%rcx),%rcx
1657	jmp	L(movdqa_epi)
1658
1659	.balign 16
1660L(mov3dqa6):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x6: each stored block is the
	 * top 10 bytes of the earlier register followed by the low 6 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1661	movdqa	0x10(%rdx),%xmm3
1662	sub	$0x30,%r8
1663	movdqa	0x20(%rdx),%xmm0
1664	movdqa	0x30(%rdx),%xmm5
1665	lea	0x30(%rdx),%rdx
1666	cmp	$0x30,%r8
1667
1668	movdqa	%xmm3,%xmm2
1669	#palignr	$0x6,%xmm1,%xmm3
1670	.byte	0x66,0x0f,0x3a,0x0f
1671	.byte	0xd9,0x06
1672	movdqa	%xmm3,(%rcx)
1673
1674	movdqa	%xmm0,%xmm4
1675	#palignr	$0x6,%xmm2,%xmm0
1676	.byte	0x66,0x0f,0x3a,0x0f
1677	.byte	0xc2,0x06
1678	movdqa	%xmm0,0x10(%rcx)
1679
1680	movdqa	%xmm5,%xmm1
1681	#palignr	$0x6,%xmm4,%xmm5
1682	.byte	0x66,0x0f,0x3a,0x0f
1683	.byte	0xec,0x06
1684	movdqa	%xmm5,0x20(%rcx)
1685
1686	lea	0x30(%rcx),%rcx
1687	jge	L(mov3dqa6)
1688
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1689	cmp	$0x10,%r8
1690	jl	L(movdqa_epi)
1691	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1692	sub	$0x10,%r8
1693	lea	0x10(%rdx),%rdx
1694	movdqa	%xmm3,%xmm2		# save for use next concat
1695	#palignr	$0x6,%xmm1,%xmm3
1696	.byte	0x66,0x0f,0x3a,0x0f
1697	.byte	0xd9,0x06
1698
1699	cmp	$0x10,%r8
1700	movdqa	%xmm3,(%rcx)      	# store it
1701	lea	0x10(%rcx),%rcx
1702	jl	L(movdqa_epi)
1703
1704	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1705	sub	$0x10,%r8
1706	lea	0x10(%rdx),%rdx
1707	#palignr	$0x6,%xmm2,%xmm0
1708	.byte	0x66,0x0f,0x3a,0x0f
1709	.byte	0xc2,0x06
1710	movdqa	%xmm0,(%rcx)      	# store it
1711	lea	0x10(%rcx),%rcx
1712	jmp	L(movdqa_epi)
1713
1714	.balign 16
1715L(mov3dqa7):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x7: each stored block is the
	 * top 9 bytes of the earlier register followed by the low 7 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1716	movdqa	0x10(%rdx),%xmm3
1717	sub	$0x30,%r8
1718	movdqa	0x20(%rdx),%xmm0
1719	movdqa	0x30(%rdx),%xmm5
1720	lea	0x30(%rdx),%rdx
1721	cmp	$0x30,%r8
1722
1723	movdqa	%xmm3,%xmm2
1724	#palignr	$0x7,%xmm1,%xmm3
1725	.byte	0x66,0x0f,0x3a,0x0f
1726	.byte	0xd9,0x07
1727	movdqa	%xmm3,(%rcx)
1728
1729	movdqa	%xmm0,%xmm4
1730	#palignr	$0x7,%xmm2,%xmm0
1731	.byte	0x66,0x0f,0x3a,0x0f
1732	.byte	0xc2,0x07
1733	movdqa	%xmm0,0x10(%rcx)
1734
1735	movdqa	%xmm5,%xmm1
1736	#palignr	$0x7,%xmm4,%xmm5
1737	.byte	0x66,0x0f,0x3a,0x0f
1738	.byte	0xec,0x07
1739	movdqa	%xmm5,0x20(%rcx)
1740
1741	lea	0x30(%rcx),%rcx
1742	jge	L(mov3dqa7)
1743
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1744	cmp	$0x10,%r8
1745	jl	L(movdqa_epi)
1746	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1747	sub	$0x10,%r8
1748	lea	0x10(%rdx),%rdx
1749	movdqa	%xmm3,%xmm2		# save for use next concat
1750	#palignr	$0x7,%xmm1,%xmm3
1751	.byte	0x66,0x0f,0x3a,0x0f
1752	.byte	0xd9,0x07
1753
1754	cmp	$0x10,%r8
1755	movdqa	%xmm3,(%rcx)      	# store it
1756	lea	0x10(%rcx),%rcx
1757	jl	L(movdqa_epi)
1758
1759	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1760	sub	$0x10,%r8
1761	lea	0x10(%rdx),%rdx
1762	#palignr	$0x7,%xmm2,%xmm0
1763	.byte	0x66,0x0f,0x3a,0x0f
1764	.byte	0xc2,0x07
1765	movdqa	%xmm0,(%rcx)      	# store it
1766	lea	0x10(%rcx),%rcx
1767	jmp	L(movdqa_epi)
1768
1769	.balign 16
1770L(mov3dqa9):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0x9: each stored block is the
	 * top 7 bytes of the earlier register followed by the low 9 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1771	movdqa	0x10(%rdx),%xmm3
1772	sub	$0x30,%r8
1773	movdqa	0x20(%rdx),%xmm0
1774	movdqa	0x30(%rdx),%xmm5
1775	lea	0x30(%rdx),%rdx
1776	cmp	$0x30,%r8
1777
1778	movdqa	%xmm3,%xmm2
1779	#palignr	$0x9,%xmm1,%xmm3
1780	.byte	0x66,0x0f,0x3a,0x0f
1781	.byte	0xd9,0x09
1782	movdqa	%xmm3,(%rcx)
1783
1784	movdqa	%xmm0,%xmm4
1785	#palignr	$0x9,%xmm2,%xmm0
1786	.byte	0x66,0x0f,0x3a,0x0f
1787	.byte	0xc2,0x09
1788	movdqa	%xmm0,0x10(%rcx)
1789
1790	movdqa	%xmm5,%xmm1
1791	#palignr	$0x9,%xmm4,%xmm5
1792	.byte	0x66,0x0f,0x3a,0x0f
1793	.byte	0xec,0x09
1794	movdqa	%xmm5,0x20(%rcx)
1795
1796	lea	0x30(%rcx),%rcx
1797	jge	L(mov3dqa9)
1798
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1799	cmp	$0x10,%r8
1800	jl	L(movdqa_epi)
1801	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1802	sub	$0x10,%r8
1803	lea	0x10(%rdx),%rdx
1804	movdqa	%xmm3,%xmm2		# save for use next concat
1805	#palignr	$0x9,%xmm1,%xmm3
1806	.byte	0x66,0x0f,0x3a,0x0f
1807	.byte	0xd9,0x09
1808
1809	cmp	$0x10,%r8
1810	movdqa	%xmm3,(%rcx)      	# store it
1811	lea	0x10(%rcx),%rcx
1812	jl	L(movdqa_epi)
1813
1814	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1815	sub	$0x10,%r8
1816	lea	0x10(%rdx),%rdx
1817	#palignr	$0x9,%xmm2,%xmm0
1818	.byte	0x66,0x0f,0x3a,0x0f
1819	.byte	0xc2,0x09
1820	movdqa	%xmm0,(%rcx)      	# store it
1821	lea	0x10(%rcx),%rcx
1822	jmp	L(movdqa_epi)
1823
1824	.balign 16
1825L(mov3dqa10):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0xa: each stored block is the
	 * top 6 bytes of the earlier register followed by the low 10 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1826	movdqa	0x10(%rdx),%xmm3
1827	sub	$0x30,%r8
1828	movdqa	0x20(%rdx),%xmm0
1829	movdqa	0x30(%rdx),%xmm5
1830	lea	0x30(%rdx),%rdx
1831	cmp	$0x30,%r8
1832
1833	movdqa	%xmm3,%xmm2
1834	#palignr	$0xa,%xmm1,%xmm3
1835	.byte	0x66,0x0f,0x3a,0x0f
1836	.byte	0xd9,0x0a
1837	movdqa	%xmm3,(%rcx)
1838
1839	movdqa	%xmm0,%xmm4
1840	#palignr	$0xa,%xmm2,%xmm0
1841	.byte	0x66,0x0f,0x3a,0x0f
1842	.byte	0xc2,0x0a
1843	movdqa	%xmm0,0x10(%rcx)
1844
1845	movdqa	%xmm5,%xmm1
1846	#palignr	$0xa,%xmm4,%xmm5
1847	.byte	0x66,0x0f,0x3a,0x0f
1848	.byte	0xec,0x0a
1849	movdqa	%xmm5,0x20(%rcx)
1850
1851	lea	0x30(%rcx),%rcx
1852	jge	L(mov3dqa10)
1853
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1854	cmp	$0x10,%r8
1855	jl	L(movdqa_epi)
1856	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1857	sub	$0x10,%r8
1858	lea	0x10(%rdx),%rdx
1859	movdqa	%xmm3,%xmm2		# save for use next concat
1860	#palignr	$0xa,%xmm1,%xmm3
1861	.byte	0x66,0x0f,0x3a,0x0f
1862	.byte	0xd9,0x0a
1863
1864	cmp	$0x10,%r8
1865	movdqa	%xmm3,(%rcx)      	# store it
1866	lea	0x10(%rcx),%rcx
1867	jl	L(movdqa_epi)
1868
1869	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1870	sub	$0x10,%r8
1871	lea	0x10(%rdx),%rdx
1872	#palignr	$0xa,%xmm2,%xmm0
1873	.byte	0x66,0x0f,0x3a,0x0f
1874	.byte	0xc2,0x0a
1875	movdqa	%xmm0,(%rcx)      	# store it
1876	lea	0x10(%rcx),%rcx
1877	jmp	L(movdqa_epi)
1878
1879	.balign 16
1880L(mov3dqa11):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0xb: each stored block is the
	 * top 5 bytes of the earlier register followed by the low 11 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1881	movdqa	0x10(%rdx),%xmm3
1882	sub	$0x30,%r8
1883	movdqa	0x20(%rdx),%xmm0
1884	movdqa	0x30(%rdx),%xmm5
1885	lea	0x30(%rdx),%rdx
1886	cmp	$0x30,%r8
1887
1888	movdqa	%xmm3,%xmm2
1889	#palignr	$0xb,%xmm1,%xmm3
1890	.byte	0x66,0x0f,0x3a,0x0f
1891	.byte	0xd9,0x0b
1892	movdqa	%xmm3,(%rcx)
1893
1894	movdqa	%xmm0,%xmm4
1895	#palignr	$0xb,%xmm2,%xmm0
1896	.byte	0x66,0x0f,0x3a,0x0f
1897	.byte	0xc2,0x0b
1898	movdqa	%xmm0,0x10(%rcx)
1899
1900	movdqa	%xmm5,%xmm1
1901	#palignr	$0xb,%xmm4,%xmm5
1902	.byte	0x66,0x0f,0x3a,0x0f
1903	.byte	0xec,0x0b
1904	movdqa	%xmm5,0x20(%rcx)
1905
1906	lea	0x30(%rcx),%rcx
1907	jge	L(mov3dqa11)
1908
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1909	cmp	$0x10,%r8
1910	jl	L(movdqa_epi)
1911	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1912	sub	$0x10,%r8
1913	lea	0x10(%rdx),%rdx
1914	movdqa	%xmm3,%xmm2		# save for use next concat
1915	#palignr	$0xb,%xmm1,%xmm3
1916	.byte	0x66,0x0f,0x3a,0x0f
1917	.byte	0xd9,0x0b
1918
1919	cmp	$0x10,%r8
1920	movdqa	%xmm3,(%rcx)      	# store it
1921	lea	0x10(%rcx),%rcx
1922	jl	L(movdqa_epi)
1923
1924	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1925	sub	$0x10,%r8
1926	lea	0x10(%rdx),%rdx
1927	#palignr	$0xb,%xmm2,%xmm0
1928	.byte	0x66,0x0f,0x3a,0x0f
1929	.byte	0xc2,0x0b
1930	movdqa	%xmm0,(%rcx)      	# store it
1931	lea	0x10(%rcx),%rcx
1932	jmp	L(movdqa_epi)
1933
1934	.balign 16
1935L(mov3dqa12):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0xc: each stored block is the
	 * top 4 bytes of the earlier register followed by the low 12 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1936	movdqa	0x10(%rdx),%xmm3
1937	sub	$0x30,%r8
1938	movdqa	0x20(%rdx),%xmm0
1939	movdqa	0x30(%rdx),%xmm5
1940	lea	0x30(%rdx),%rdx
1941	cmp	$0x30,%r8
1942
1943	movdqa	%xmm3,%xmm2
1944	#palignr	$0xc,%xmm1,%xmm3
1945	.byte	0x66,0x0f,0x3a,0x0f
1946	.byte	0xd9,0x0c
1947	movdqa	%xmm3,(%rcx)
1948
1949	movdqa	%xmm0,%xmm4
1950	#palignr	$0xc,%xmm2,%xmm0
1951	.byte	0x66,0x0f,0x3a,0x0f
1952	.byte	0xc2,0x0c
1953	movdqa	%xmm0,0x10(%rcx)
1954
1955	movdqa	%xmm5,%xmm1
1956	#palignr	$0xc,%xmm4,%xmm5
1957	.byte	0x66,0x0f,0x3a,0x0f
1958	.byte	0xec,0x0c
1959	movdqa	%xmm5,0x20(%rcx)
1960
1961	lea	0x30(%rcx),%rcx
1962	jge	L(mov3dqa12)
1963
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
1964	cmp	$0x10,%r8
1965	jl	L(movdqa_epi)
1966	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
1967	sub	$0x10,%r8
1968	lea	0x10(%rdx),%rdx
1969	movdqa	%xmm3,%xmm2		# save for use next concat
1970	#palignr	$0xc,%xmm1,%xmm3
1971	.byte	0x66,0x0f,0x3a,0x0f
1972	.byte	0xd9,0x0c
1973
1974	cmp	$0x10,%r8
1975	movdqa	%xmm3,(%rcx)      	# store it
1976	lea	0x10(%rcx),%rcx
1977	jl	L(movdqa_epi)
1978
1979	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
1980	sub	$0x10,%r8
1981	lea	0x10(%rdx),%rdx
1982	#palignr	$0xc,%xmm2,%xmm0
1983	.byte	0x66,0x0f,0x3a,0x0f
1984	.byte	0xc2,0x0c
1985	movdqa	%xmm0,(%rcx)      	# store it
1986	lea	0x10(%rcx),%rcx
1987	jmp	L(movdqa_epi)
1988
1989	.balign 16
1990L(mov3dqa13):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0xd: each stored block is the
	 * top 3 bytes of the earlier register followed by the low 13 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
1991	movdqa	0x10(%rdx),%xmm3
1992	sub	$0x30,%r8
1993	movdqa	0x20(%rdx),%xmm0
1994	movdqa	0x30(%rdx),%xmm5
1995	lea	0x30(%rdx),%rdx
1996	cmp	$0x30,%r8
1997
1998	movdqa	%xmm3,%xmm2
1999	#palignr	$0xd,%xmm1,%xmm3
2000	.byte	0x66,0x0f,0x3a,0x0f
2001	.byte	0xd9,0x0d
2002	movdqa	%xmm3,(%rcx)
2003
2004	movdqa	%xmm0,%xmm4
2005	#palignr	$0xd,%xmm2,%xmm0
2006	.byte	0x66,0x0f,0x3a,0x0f
2007	.byte	0xc2,0x0d
2008	movdqa	%xmm0,0x10(%rcx)
2009
2010	movdqa	%xmm5,%xmm1
2011	#palignr	$0xd,%xmm4,%xmm5
2012	.byte	0x66,0x0f,0x3a,0x0f
2013	.byte	0xec,0x0d
2014	movdqa	%xmm5,0x20(%rcx)
2015
2016	lea	0x30(%rcx),%rcx
2017	jge	L(mov3dqa13)
2018
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
2019	cmp	$0x10,%r8
2020	jl	L(movdqa_epi)
2021	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2022	sub	$0x10,%r8
2023	lea	0x10(%rdx),%rdx
2024	movdqa	%xmm3,%xmm2		# save for use next concat
2025	#palignr	$0xd,%xmm1,%xmm3
2026	.byte	0x66,0x0f,0x3a,0x0f
2027	.byte	0xd9,0x0d
2028
2029	cmp	$0x10,%r8
2030	movdqa	%xmm3,(%rcx)      	# store it
2031	lea	0x10(%rcx),%rcx
2032	jl	L(movdqa_epi)
2033
2034	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2035	sub	$0x10,%r8
2036	lea	0x10(%rdx),%rdx
2037	#palignr	$0xd,%xmm2,%xmm0
2038	.byte	0x66,0x0f,0x3a,0x0f
2039	.byte	0xc2,0x0d
2040	movdqa	%xmm0,(%rcx)      	# store it
2041	lea	0x10(%rcx),%rcx
2042	jmp	L(movdqa_epi)
2043
2044	.balign 16
2045L(mov3dqa14):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0xe: each stored block is the
	 * top 2 bytes of the earlier register followed by the low 14 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
2046	movdqa	0x10(%rdx),%xmm3
2047	sub	$0x30,%r8
2048	movdqa	0x20(%rdx),%xmm0
2049	movdqa	0x30(%rdx),%xmm5
2050	lea	0x30(%rdx),%rdx
2051	cmp	$0x30,%r8
2052
2053	movdqa	%xmm3,%xmm2
2054	#palignr	$0xe,%xmm1,%xmm3
2055	.byte	0x66,0x0f,0x3a,0x0f
2056	.byte	0xd9,0x0e
2057	movdqa	%xmm3,(%rcx)
2058
2059	movdqa	%xmm0,%xmm4
2060	#palignr	$0xe,%xmm2,%xmm0
2061	.byte	0x66,0x0f,0x3a,0x0f
2062	.byte	0xc2,0x0e
2063	movdqa	%xmm0,0x10(%rcx)
2064
2065	movdqa	%xmm5,%xmm1
2066	#palignr	$0xe,%xmm4,%xmm5
2067	.byte	0x66,0x0f,0x3a,0x0f
2068	.byte	0xec,0x0e
2069	movdqa	%xmm5,0x20(%rcx)
2070
2071	lea	0x30(%rcx),%rcx
2072	jge	L(mov3dqa14)
2073
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
2074	cmp	$0x10,%r8
2075	jl	L(movdqa_epi)
2076	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2077	sub	$0x10,%r8
2078	lea	0x10(%rdx),%rdx
2079	movdqa	%xmm3,%xmm2		# save for use next concat
2080	#palignr	$0xe,%xmm1,%xmm3
2081	.byte	0x66,0x0f,0x3a,0x0f
2082	.byte	0xd9,0x0e
2083
2084	cmp	$0x10,%r8
2085	movdqa	%xmm3,(%rcx)      	# store it
2086	lea	0x10(%rcx),%rcx
2087	jl	L(movdqa_epi)
2088
2089	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2090	sub	$0x10,%r8
2091	lea	0x10(%rdx),%rdx
2092	#palignr	$0xe,%xmm2,%xmm0
2093	.byte	0x66,0x0f,0x3a,0x0f
2094	.byte	0xc2,0x0e
2095	movdqa	%xmm0,(%rcx)      	# store it
2096	lea	0x10(%rcx),%rcx
2097	jmp	L(movdqa_epi)
2098
2099	.balign 16
2100L(mov3dqa15):
	/*
	 * Forward copy loop, 48 bytes per iteration, splicing consecutive
	 * aligned 16-byte loads with palignr $0xf: each stored block is the
	 * top byte of the earlier register followed by the low 15 bytes
	 * of the later one.  palignr is emitted as raw .byte opcodes
	 * (presumably because the assembler in use lacked SSSE3 support).
	 * The cmp near the top sets the flags tested by the jge at the
	 * bottom; the SSE instructions and lea in between leave flags alone.
	 * After the loop up to two more 16-byte blocks are copied the same
	 * way, then L(movdqa_epi) finishes the last %r8 (< 16) bytes.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * NOTE(review): %xmm1 is read before it is written here, so it must
	 * be preloaded by the caller (presumably with the 16 bytes at
	 * (%rdx)) -- confirm at the dispatch site.
	 */
2101	movdqa	0x10(%rdx),%xmm3
2102	sub	$0x30,%r8
2103	movdqa	0x20(%rdx),%xmm0
2104	movdqa	0x30(%rdx),%xmm5
2105	lea	0x30(%rdx),%rdx
2106	cmp	$0x30,%r8
2107
2108	movdqa	%xmm3,%xmm2
2109	#palignr	$0xf,%xmm1,%xmm3
2110	.byte	0x66,0x0f,0x3a,0x0f
2111	.byte	0xd9,0x0f
2112	movdqa	%xmm3,(%rcx)
2113
2114	movdqa	%xmm0,%xmm4
2115	#palignr	$0xf,%xmm2,%xmm0
2116	.byte	0x66,0x0f,0x3a,0x0f
2117	.byte	0xc2,0x0f
2118	movdqa	%xmm0,0x10(%rcx)
2119
2120	movdqa	%xmm5,%xmm1
2121	#palignr	$0xf,%xmm4,%xmm5
2122	.byte	0x66,0x0f,0x3a,0x0f
2123	.byte	0xec,0x0f
2124	movdqa	%xmm5,0x20(%rcx)
2125
2126	lea	0x30(%rcx),%rcx
2127	jge	L(mov3dqa15)
2128
	/* fewer than 0x30 bytes left: copy up to two more 16-byte blocks */
2129	cmp	$0x10,%r8
2130	jl	L(movdqa_epi)
2131	movdqa	0x10(%rdx),%xmm3	# load the upper source buffer
2132	sub	$0x10,%r8
2133	lea	0x10(%rdx),%rdx
2134	movdqa	%xmm3,%xmm2		# save for use next concat
2135	#palignr	$0xf,%xmm1,%xmm3
2136	.byte	0x66,0x0f,0x3a,0x0f
2137	.byte	0xd9,0x0f
2138
2139	cmp	$0x10,%r8
2140	movdqa	%xmm3,(%rcx)      	# store it
2141	lea	0x10(%rcx),%rcx
2142	jl	L(movdqa_epi)
2143
2144	movdqa	0x10(%rdx),%xmm0	# load the upper source buffer
2145	sub	$0x10,%r8
2146	lea	0x10(%rdx),%rdx
2147	#palignr	$0xf,%xmm2,%xmm0
2148	.byte	0x66,0x0f,0x3a,0x0f
2149	.byte	0xc2,0x0f
2150	movdqa	%xmm0,(%rcx)      	# store it
2151	lea	0x10(%rcx),%rcx
2152	jmp	L(movdqa_epi)
2153
2154	.balign 16
2155L(sse2_nt_move):
	/*
	 * Forward copy with non-temporal stores, 64 bytes per iteration:
	 * unaligned loads (movdqu) from %rdx, movntdq stores to %rcx.
	 * The pointers and the count are stepped first, so the loop body
	 * addresses the current 64 bytes with negative offsets.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * The cmp sets the flags tested by the closing jge; the SSE moves in
	 * between do not alter flags.
	 * NOTE(review): movntdq needs a 16-byte aligned %rcx; the alignment
	 * is presumably established before this label -- confirm.
	 */
2156	lea	0x40(%rcx),%rcx
2157	lea	0x40(%rdx),%rdx
2158	lea	-0x40(%r8),%r8
2159
2160	/*
2161	 * doesn't matter if source is aligned for stuff out of cache.
2162	 * the mis-aligned penalty is masked by the slowness of main memory.
2163	 */
2164	prefetchnta 0x180(%rdx)
2165	movdqu	-0x40(%rdx),%xmm0
2166	movdqu	-0x30(%rdx),%xmm1
2167
2168	cmp	$0x40,%r8
2169	movntdq	%xmm0,-0x40(%rcx)
2170	movntdq	%xmm1,-0x30(%rcx)
2171
2172	movdqu	-0x20(%rdx),%xmm2
2173	movdqu	-0x10(%rdx),%xmm3
2174
2175	movntdq	%xmm2,-0x20(%rcx)
2176	movntdq	%xmm3,-0x10(%rcx)
2177
2178	jge	L(sse2_nt_move)
2179
	/*
	 * Fewer than 0x40 bytes left.  %r9 = the part of %r8 made of whole
	 * 16-byte chunks; step both pointers past those chunks, leave the
	 * sub-16 residue in %r8, and turn %r9 into a chunk count used to
	 * index L(Fix16EndTable).  sfence orders the non-temporal stores
	 * ahead of the ordinary stores that follow.
	 */
2180	lea	L(Fix16EndTable)(%rip),%r10
2181	mov	%r8,%r9
2182	and	$0xFFFFFFFFFFFFFFF0,%r9
2183	add	%r9,%rcx
2184	add	%r9,%rdx
2185	sub	%r9,%r8
2186	shr	$0x4,%r9
2187	sfence
2188
2189	movslq	(%r10,%r9,4),%r11
2190	lea	(%r11,%r10,1),%r10
2191	jmpq	*%r10
2192
2193	.balign 16
2194L(Fix16EndTable):
	/*
	 * Jump table indexed by the number of whole 16-byte chunks still to
	 * copy (0..3).  Entries are 32-bit offsets of the L(fix16_N) labels
	 * relative to the start of this table.
	 */
2195	.int    L(fix16_0)-L(Fix16EndTable)
2196	.int    L(fix16_1)-L(Fix16EndTable)
2197	.int    L(fix16_2)-L(Fix16EndTable)
2198	.int    L(fix16_3)-L(Fix16EndTable)
2199
2200	.balign 16
2201L(fix16_3):
	/*
	 * Fall-through chain entered via L(Fix16EndTable): L(fix16_N) copies
	 * the N 16-byte chunks that lie just below %rdx / %rcx (unaligned
	 * load, aligned store), then L(fix16_0) hands the remaining %r8
	 * bytes to the L(fwdPxQx) jump table after advancing both pointers
	 * by %r8.  When reached from L(sse2_nt_move), %r8 is below 16 here.
	 */
2202	movdqu -0x30(%rdx),%xmm1
2203	movdqa %xmm1,-0x30(%rcx)
2204L(fix16_2):
2205	movdqu -0x20(%rdx),%xmm2
2206	movdqa %xmm2,-0x20(%rcx)
2207L(fix16_1):
2208	movdqu -0x10(%rdx),%xmm3
2209	movdqa %xmm3,-0x10(%rcx)
2210L(fix16_0):
2211	lea    L(fwdPxQx)(%rip),%r10
2212	add    %r8,%rdx
2213	add    %r8,%rcx
2214
	/* %r10 = table base + table[%r8] */
2215	movslq (%r10,%r8,4),%r9
2216	lea    (%r9,%r10,1),%r10
2217	jmpq   *%r10
2218
2219	.balign 16
2220L(pre_both_aligned):
	/*
	 * Guard in front of L(both_aligned): with fewer than 0x80 bytes
	 * left, skip the 128-byte loop and go straight to L(fix_16b);
	 * otherwise fall through into the loop.
	 */
2221	cmp    $0x80,%r8
2222	jl     L(fix_16b)
2223
2224	.balign 16
2225L(both_aligned):
2226
2227	/*
2228	 * this 'paired' load/load/store/store seems to do best.
2229	 *
	 * Forward copy loop, 128 bytes per iteration, using aligned 16-byte
	 * loads and stores (movdqa) on both %rdx (source) and %rcx
	 * (destination).  %r8 = bytes remaining; it is reduced with lea so
	 * that the later cmp alone determines the flags tested by the jge
	 * at the bottom (movdqa and lea do not touch flags).
	 */
2230	movdqa (%rdx),%xmm0
2231	movdqa 0x10(%rdx),%xmm1
2232
2233	movdqa %xmm0,(%rcx)
2234	movdqa %xmm1,0x10(%rcx)
2235	lea    -0x80(%r8),%r8
2236
2237	movdqa 0x20(%rdx),%xmm2
2238	movdqa 0x30(%rdx),%xmm3
2239
2240	movdqa %xmm2,0x20(%rcx)
2241	movdqa %xmm3,0x30(%rcx)
2242
2243	movdqa 0x40(%rdx),%xmm0
2244	movdqa 0x50(%rdx),%xmm1
2245	cmp    $0x80,%r8
2246
2247	movdqa %xmm0,0x40(%rcx)
2248	movdqa %xmm1,0x50(%rcx)
2249
2250	movdqa 0x60(%rdx),%xmm2
2251	movdqa 0x70(%rdx),%xmm3
2252	lea    0x80(%rdx),%rdx
2253	movdqa %xmm2,0x60(%rcx)
2254	movdqa %xmm3,0x70(%rcx)
2255	lea    0x80(%rcx),%rcx
2256	jge    L(both_aligned)
2257
2258L(fix_16b):
	/*
	 * Finish the remaining %r8 bytes: advance both pointers by %r8 and
	 * jump through L(fwdPxQx), whose entries are read as 32-bit signed
	 * offsets relative to the table base.
	 */
2259	add    %r8,%rcx
2260	lea    L(fwdPxQx)(%rip),%r10
2261	add    %r8,%rdx
2262
2263	movslq (%r10,%r8,4),%r9
2264	lea    (%r9,%r10,1),%r10
2265	jmpq   *%r10
2266
2267	.balign 16
2268L(Loop8byte_pre):
	/*
	 * Strategy selection for the 8-byte (non-SSE) forward copy, based on
	 * the byte count in %r8:
	 *   - more than half of .largest_level_cache_size: non-temporal
	 *     stores, L(byte8_nt_top);
	 *   - at most 4096 bytes: plain 8-byte loop, L(byte8_top);
	 *   - above 4096 but at most .amd64cache1half: rep movsq, L(use_rep);
	 *   - anything else falls through into L(byte8_top).
	 */
2269	# Use 8-byte moves
2270	mov    .largest_level_cache_size(%rip),%r9d
2271	shr    %r9		# take half of it
2272	cmp    %r9,%r8
2273	jg     L(byte8_nt_top)
2274	# Find out whether to use rep movsq
2275	cmp    $4096,%r8
2276	jle    L(byte8_top)
2277	mov    .amd64cache1half(%rip),%r9d	# half of l1 cache
2278	cmp    %r9,%r8
2279	jle    L(use_rep)
2280
2281	.balign     16
2282L(byte8_top):
	/*
	 * Forward copy loop, 64 bytes per iteration, as eight 8-byte
	 * load/store pairs through %r9, %r10 and %r11.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * The count is reduced with lea and the pointers are stepped with
	 * lea, so the flags from the cmp survive to the jg at the bottom.
	 * The loop repeats only while more than 0x40 bytes remain, then
	 * falls through into L(byte8_end).
	 */
2283	mov    (%rdx),%r9
2284	mov    0x8(%rdx),%r10
2285	lea    -0x40(%r8),%r8
2286	mov    %r9,(%rcx)
2287	mov    %r10,0x8(%rcx)
2288	mov    0x10(%rdx),%r11
2289	mov    0x18(%rdx),%r9
2290	mov    %r11,0x10(%rcx)
2291	mov    %r9,0x18(%rcx)
2292
2293	cmp    $0x40,%r8
2294	mov    0x20(%rdx),%r10
2295	mov    0x28(%rdx),%r11
2296	mov    %r10,0x20(%rcx)
2297	mov    %r11,0x28(%rcx)
2298	mov    0x30(%rdx),%r9
2299	mov    0x38(%rdx),%r10
2300	lea    0x40(%rdx),%rdx
2301	mov    %r9,0x30(%rcx)
2302	mov    %r10,0x38(%rcx)
2303	lea    0x40(%rcx),%rcx
2304	jg     L(byte8_top)
2305
2306L(byte8_end):
	/*
	 * Finish the remaining %r8 bytes of an 8-byte-path copy: advance
	 * both pointers by %r8 and jump through L(fwdPxQx), whose entries
	 * are read as 32-bit signed offsets relative to the table base.
	 */
2307	lea    L(fwdPxQx)(%rip),%r10
2308	lea    (%rdx,%r8,1),%rdx
2309	lea    (%rcx,%r8,1),%rcx
2310
2311	movslq (%r10,%r8,4),%r9
2312	lea    (%r9,%r10,1),%r10
2313	jmpq   *%r10
2314
2315	.balign	16
2316L(use_rep):
	/*
	 * Forward copy with rep movsq.  The working registers are moved
	 * into the string-instruction registers (%rsi, %rdi, %rcx), %r8 / 8
	 * quadwords are copied, and the advanced pointers are moved back.
	 * A remainder of 1..7 bytes is finished by L(byte8_end); with no
	 * remainder the routine returns directly.
	 * NOTE(review): %rax is not set here; presumably the return value
	 * was loaded before this point -- confirm at the function entry.
	 */
2317	mov    %rdx,%rsi		# %rsi = source
2318	mov    %rcx,%rdi		# %rdi = destination
2319	mov    %r8,%rcx			# %rcx = count
2320	shrq   $3,%rcx			# 8-byte word count
2321	rep
2322	  movsq
2323	mov    %rsi,%rdx		# source
2324	mov    %rdi,%rcx		# destination
2325	andq   $7,%r8			# remainder
2326	jnz    L(byte8_end)
2327	ret
2328
2329	.balign 16
2330L(byte8_nt_top):
	/*
	 * Forward copy loop, 64 bytes per iteration, using 8-byte loads and
	 * non-temporal stores (movnti), with a prefetchnta 0x180 bytes ahead
	 * of the source pointer.
	 * %rcx = destination, %rdx = source, %r8 = bytes remaining.
	 * Once fewer than 0x40 bytes remain, sfence orders the non-temporal
	 * stores and L(byte8_end) copies the rest.
	 */
2331	sub    $0x40,%r8
2332	prefetchnta 0x180(%rdx)
2333	mov    (%rdx),%r9
2334	movnti %r9,(%rcx)
2335	mov    0x8(%rdx),%r10
2336	movnti %r10,0x8(%rcx)
2337	mov    0x10(%rdx),%r11
2338	movnti %r11,0x10(%rcx)
2339	mov    0x18(%rdx),%r9
2340	movnti %r9,0x18(%rcx)
2341	mov    0x20(%rdx),%r10
2342	movnti %r10,0x20(%rcx)
2343	mov    0x28(%rdx),%r11
2344	movnti %r11,0x28(%rcx)
2345	mov    0x30(%rdx),%r9
2346	movnti %r9,0x30(%rcx)
2347	mov    0x38(%rdx),%r10
2348	movnti %r10,0x38(%rcx)
2349
2350	lea    0x40(%rdx),%rdx
2351	lea    0x40(%rcx),%rcx
2352	cmp    $0x40,%r8
2353	jge    L(byte8_nt_top)
2354	sfence
2355	jmp    L(byte8_end)
2356
2357	SET_SIZE(memcpy)
2358
2359	.balign 16
2360L(CopyBackwards):
	/*
	 * Entry of the backward (high address to low address) copy path.
	 * Takes the C argument registers as they arrive: %rdi = destination,
	 * %rsi = source, %rdx = length.  Sets up the working registers
	 * (%rcx = destination, %rdx = source, %r8 = length, %rax = return
	 * value) and moves both pointers to one past the last byte.
	 * If the destination end is not 8-byte aligned, L(bk_align) is
	 * taken; otherwise execution falls through into L(bk_qw_aligned).
	 * The lea between test and jne does not disturb the flags.
	 * NOTE(review): presumably reached from memmove when the regions
	 * overlap such that a forward copy is unsafe -- confirm at the
	 * caller.
	 */
2361	mov    %rdx,%r8
2362	mov    %rdi,%rcx
2363	mov    %rsi,%rdx
2364	mov    %rdi,%rax		# return value
2365
2366	# ck alignment of last byte
2367	lea    (%rcx,%r8,1),%rcx
2368	test   $0x7,%rcx
2369	lea    (%rdx,%r8,1),%rdx
2370	jne    L(bk_align)
2371
2372L(bk_qw_aligned):
	/*
	 * Backward copy dispatch.  %rcx / %rdx point one past the end of
	 * destination / source and %r8 is the byte count.
	 * More than 0x90 (144) bytes go to L(bk_ck_sse2_alignment).
	 * Otherwise both pointers are moved back to the start of the
	 * remaining data and the copy is completed through the L(bkPxQx)
	 * jump table (32-bit signed offsets relative to the table base,
	 * indexed by %r8).
	 */
2373	lea    L(bkPxQx)(%rip),%r10
2374
2375	cmp    $0x90,%r8		# 144
2376	jg     L(bk_ck_sse2_alignment)
2377
2378	sub    %r8,%rcx
2379	sub    %r8,%rdx
2380
2381	movslq (%r10,%r8,4),%r9
2382	lea    (%r9,%r10,1),%r10
2383	jmpq   *%r10
2384
2385	.balign 16
2386L(bk_align):
	/*
	 * Bring the destination end pointer (%rcx) down to an 8-byte
	 * boundary for the backward copy by copying 1, 2 and/or 4 trailing
	 * bytes, as selected by the low three bits of %rcx.  %rdx (source
	 * end) and %r8 (count) are stepped in lockstep.  Copies of 8 bytes
	 * or fewer skip the alignment and go straight to L(bk_qw_aligned).
	 */
2387	# only align if len > 8
2388	cmp    $8,%r8
2389	jle    L(bk_qw_aligned)
2390	test   $0x1,%rcx
2391	je     L(bk_tst2)
	/* odd end address: copy one byte */
2392	dec    %rcx
2393	dec    %rdx
2394	dec    %r8
2395	mov    (%rdx),%r9b
2396	mov    %r9b,(%rcx)
2397
2398L(bk_tst2):
2399	test   $0x2,%rcx
2400	je     L(bk_tst3)
2401
2402L(bk_got2):
	/* copy one 2-byte word */
2403	sub    $0x2,%rcx
2404	sub    $0x2,%rdx
2405	sub    $0x2,%r8
2406	movzwq (%rdx),%r9
2407	mov    %r9w,(%rcx)
2408
2409L(bk_tst3):
2410	test   $0x4,%rcx
2411	je     L(bk_qw_aligned)
2412
2413L(bk_got3):
	/* copy one 4-byte word */
2414	sub    $0x4,%rcx
2415	sub    $0x4,%rdx
2416	sub    $0x4,%r8
2417	mov    (%rdx),%r9d
2418	mov    %r9d,(%rcx)
2419	jmp    L(bk_qw_aligned)
2420
2421	.balign 16
2422L(bk_ck_sse2_alignment):
	/*
	 * Large backward copy: pick the method.  If .memops_method equals
	 * NO_SSE, use rep movsq (L(bk_use_rep)).  Otherwise go to the SSE2
	 * loop, directly when the destination end (%rcx) is already 16-byte
	 * aligned, or by falling through into L(bk_sse2_align) when it is
	 * not.
	 */
2423	cmpl   $NO_SSE,.memops_method(%rip)
2424	je     L(bk_use_rep)
2425	# check alignment of last byte
2426	test   $0xf,%rcx
2427	jz     L(bk_sse2_cpy)
2428
2429L(bk_sse2_align):
	/*
	 * Copy one trailing quadword so that the destination end pointer,
	 * which is 8-byte but not 16-byte aligned here, becomes 16-byte
	 * aligned.  There is no jump at the end: execution falls through
	 * into L(bk_sse2_cpy), which follows directly.
	 */
2430	# only here if already aligned on at least a qword bndry
2431	sub    $0x8,%rcx
2432	sub    $0x8,%rdx
2433	sub    $0x8,%r8
2434	mov    (%rdx),%r9
2435	mov    %r9,(%rcx)
2436	#jmp   L(bk_sse2_cpy)
2437
2438	.balign 16
2439L(bk_sse2_cpy):
	/*
	 * Backward copy loop, 128 bytes per iteration, highest addresses
	 * first.  Both pointers are moved down by 0x80 and the block is
	 * copied with unaligned loads (movdqu) from %rdx and aligned stores
	 * (movdqa) to %rcx.  %r8 = bytes remaining.
	 * The cmp in the middle sets the flags tested by the jge at the
	 * bottom; the SSE moves in between do not alter flags.
	 * Falls through into L(bk_sse2_cpy_end) when fewer than 0x80 bytes
	 * remain.
	 */
2440	sub    $0x80,%rcx		# 128
2441	sub    $0x80,%rdx
2442	movdqu 0x70(%rdx),%xmm3
2443	movdqu 0x60(%rdx),%xmm2
2444	movdqa %xmm3,0x70(%rcx)
2445	movdqa %xmm2,0x60(%rcx)
2446	sub    $0x80,%r8
2447	movdqu 0x50(%rdx),%xmm1
2448	movdqu 0x40(%rdx),%xmm0
2449	movdqa %xmm1,0x50(%rcx)
2450	movdqa %xmm0,0x40(%rcx)
2451
2452	cmp    $0x80,%r8
2453	movdqu 0x30(%rdx),%xmm3
2454	movdqu 0x20(%rdx),%xmm2
2455	movdqa %xmm3,0x30(%rcx)
2456	movdqa %xmm2,0x20(%rcx)
2457	movdqu 0x10(%rdx),%xmm1
2458	movdqu (%rdx),%xmm0
2459	movdqa %xmm1,0x10(%rcx)
2460	movdqa %xmm0,(%rcx)
2461	jge    L(bk_sse2_cpy)
2462
2463L(bk_sse2_cpy_end):
	/*
	 * Finish the backward SSE2 copy: move both pointers back to the
	 * start of the remaining %r8 bytes and complete the copy through
	 * the L(bkPxQx) jump table (32-bit signed offsets relative to the
	 * table base, indexed by %r8).
	 */
2464	lea    L(bkPxQx)(%rip),%r10
2465	sub    %r8,%rdx
2466	sub    %r8,%rcx
2467	movslq (%r10,%r8,4),%r9
2468	lea    (%r9,%r10,1),%r10
2469	jmpq   *%r10
2470
2471	.balign 16
2472L(bk_use_rep):
	/*
	 * Backward copy with rep movsq and the direction flag set.
	 * On entry %rcx / %rdx point one past the end of destination /
	 * source and %r8 is the byte count.  %r8 / 8 quadwords are copied
	 * from the top down; a remainder of 1..7 bytes is then handed to the
	 * L(bkPxQx) jump table with both pointers moved back to the start
	 * of the buffers.
	 */
	/* park the destination end in %r9; %rcx is needed as the rep count */
2473	xchg   %rcx,%r9
2474	mov    %rdx,%rsi		# source
2475	mov    %r9,%rdi			# destination
2476	mov    %r8,%rcx			# count
	/* with the direction flag set, start at the last quadword */
2477	sub    $8,%rsi
2478	sub    $8,%rdi
2479	shr    $3,%rcx
2480	std				# reverse direction
2481	rep
2482	  movsq
2483	cld				# reset direction flag
2484
	/* swap the parked destination end pointer back into %rcx */
2485	xchg   %rcx,%r9
2486	lea    L(bkPxQx)(%rip),%r10
2487	sub    %r8,%rdx
2488	sub    %r8,%rcx
2489	andq   $7,%r8			# remainder
2490	jz     2f
2491	movslq (%r10,%r8,4),%r9
2492	lea    (%r9,%r10,1),%r10
2493	jmpq   *%r10
24942:
2495	ret
2496
2497	.balign 16
2498L(bkP0QI):
	/*
	 * Backward-copy tail for counts that are a whole number of
	 * quadwords.  Entering at L(bkP0Q<k>) copies k 8-byte words
	 * (k = 0..9, then A..I for 10..18) from %rdx to %rcx at offsets
	 * 8*(k-1) down to 0, highest address first, by falling through the
	 * labels below, and then returns.  %rdx / %rcx point at the start
	 * of the data.  Presumably entered through the L(bkPxQx) jump table
	 * (the table itself is outside this view).
	 */
2499	mov    0x88(%rdx),%r10
2500	mov    %r10,0x88(%rcx)
2501L(bkP0QH):
2502	mov    0x80(%rdx),%r10
2503	mov    %r10,0x80(%rcx)
2504L(bkP0QG):
2505	mov    0x78(%rdx),%r9
2506	mov    %r9,0x78(%rcx)
2507L(bkP0QF):
2508	mov    0x70(%rdx),%r11
2509	mov    %r11,0x70(%rcx)
2510L(bkP0QE):
2511	mov    0x68(%rdx),%r10
2512	mov    %r10,0x68(%rcx)
2513L(bkP0QD):
2514	mov    0x60(%rdx),%r9
2515	mov    %r9,0x60(%rcx)
2516L(bkP0QC):
2517	mov    0x58(%rdx),%r11
2518	mov    %r11,0x58(%rcx)
2519L(bkP0QB):
2520	mov    0x50(%rdx),%r10
2521	mov    %r10,0x50(%rcx)
2522L(bkP0QA):
2523	mov    0x48(%rdx),%r9
2524	mov    %r9,0x48(%rcx)
2525L(bkP0Q9):
2526	mov    0x40(%rdx),%r11
2527	mov    %r11,0x40(%rcx)
2528L(bkP0Q8):
2529	mov    0x38(%rdx),%r10
2530	mov    %r10,0x38(%rcx)
2531L(bkP0Q7):
2532	mov    0x30(%rdx),%r9
2533	mov    %r9,0x30(%rcx)
2534L(bkP0Q6):
2535	mov    0x28(%rdx),%r11
2536	mov    %r11,0x28(%rcx)
2537L(bkP0Q5):
2538	mov    0x20(%rdx),%r10
2539	mov    %r10,0x20(%rcx)
2540L(bkP0Q4):
2541	mov    0x18(%rdx),%r9
2542	mov    %r9,0x18(%rcx)
2543L(bkP0Q3):
2544	mov    0x10(%rdx),%r11
2545	mov    %r11,0x10(%rcx)
2546L(bkP0Q2):
2547	mov    0x8(%rdx),%r10
2548	mov    %r10,0x8(%rcx)
2549L(bkP0Q1):
2550	mov    (%rdx),%r9
2551	mov    %r9,(%rcx)
2552L(bkP0Q0):
2553	ret
2554
2555	.balign 16
2556L(bkP1QI):
	/*
	 * Backward-copy tail for counts of the form 8*k + 1.  Entering at
	 * L(bkP1Q<k>) copies k 8-byte words (k = 0..9, then A..I for
	 * 10..18) from %rdx to %rcx at offsets 8*(k-1)+1 down to 1, highest
	 * address first, by falling through the labels below; L(bkP1Q0)
	 * then copies the single byte at offset 0 and returns.
	 * %rdx / %rcx point at the start of the data.  Presumably entered
	 * through the L(bkPxQx) jump table (outside this view).
	 */
2557	mov    0x89(%rdx),%r10
2558	mov    %r10,0x89(%rcx)
2559L(bkP1QH):
2560	mov    0x81(%rdx),%r11
2561	mov    %r11,0x81(%rcx)
2562L(bkP1QG):
2563	mov    0x79(%rdx),%r10
2564	mov    %r10,0x79(%rcx)
2565L(bkP1QF):
2566	mov    0x71(%rdx),%r9
2567	mov    %r9,0x71(%rcx)
2568L(bkP1QE):
2569	mov    0x69(%rdx),%r11
2570	mov    %r11,0x69(%rcx)
2571L(bkP1QD):
2572	mov    0x61(%rdx),%r10
2573	mov    %r10,0x61(%rcx)
2574L(bkP1QC):
2575	mov    0x59(%rdx),%r9
2576	mov    %r9,0x59(%rcx)
2577L(bkP1QB):
2578	mov    0x51(%rdx),%r11
2579	mov    %r11,0x51(%rcx)
2580L(bkP1QA):
2581	mov    0x49(%rdx),%r10
2582	mov    %r10,0x49(%rcx)
2583L(bkP1Q9):
2584	mov    0x41(%rdx),%r9
2585	mov    %r9,0x41(%rcx)
2586L(bkP1Q8):
2587	mov    0x39(%rdx),%r11
2588	mov    %r11,0x39(%rcx)
2589L(bkP1Q7):
2590	mov    0x31(%rdx),%r10
2591	mov    %r10,0x31(%rcx)
2592L(bkP1Q6):
2593	mov    0x29(%rdx),%r9
2594	mov    %r9,0x29(%rcx)
2595L(bkP1Q5):
2596	mov    0x21(%rdx),%r11
2597	mov    %r11,0x21(%rcx)
2598L(bkP1Q4):
2599	mov    0x19(%rdx),%r10
2600	mov    %r10,0x19(%rcx)
2601L(bkP1Q3):
2602	mov    0x11(%rdx),%r9
2603	mov    %r9,0x11(%rcx)
2604L(bkP1Q2):
2605	mov    0x9(%rdx),%r11
2606	mov    %r11,0x9(%rcx)
2607L(bkP1Q1):
2608	mov    0x1(%rdx),%r10
2609	mov    %r10,0x1(%rcx)
2610L(bkP1Q0):
2611	mov    (%rdx),%r9b
2612	mov    %r9b,(%rcx)
2613	ret
2614
2615	.balign 16
/*
 * bkP2Qn ladder: moves 8*n + 2 bytes from the block addressed by %rdx
 * to the block addressed by %rcx, for n = 0..18 (Q0-Q9, then QA-QI for
 * 10-18).  Every label is an entry point named in the bkPxQx offset
 * table; execution falls through all lower rungs, so quadwords are
 * moved from the highest offset down to offset 2 and the 16-bit word
 * at offset 0 is moved last.  %r9, %r10 and %r11 are used as scratch.
 */
2616L(bkP2QI):
2617	mov    0x8a(%rdx),%r10
2618	mov    %r10,0x8a(%rcx)
2619L(bkP2QH):
2620	mov    0x82(%rdx),%r11
2621	mov    %r11,0x82(%rcx)
2622L(bkP2QG):
2623	mov    0x7a(%rdx),%r10
2624	mov    %r10,0x7a(%rcx)
2625L(bkP2QF):
2626	mov    0x72(%rdx),%r9
2627	mov    %r9,0x72(%rcx)
2628L(bkP2QE):
2629	mov    0x6a(%rdx),%r11
2630	mov    %r11,0x6a(%rcx)
2631L(bkP2QD):
2632	mov    0x62(%rdx),%r10
2633	mov    %r10,0x62(%rcx)
2634L(bkP2QC):
2635	mov    0x5a(%rdx),%r9
2636	mov    %r9,0x5a(%rcx)
2637L(bkP2QB):
2638	mov    0x52(%rdx),%r11
2639	mov    %r11,0x52(%rcx)
2640L(bkP2QA):
2641	mov    0x4a(%rdx),%r10
2642	mov    %r10,0x4a(%rcx)
2643L(bkP2Q9):
2644	mov    0x42(%rdx),%r9
2645	mov    %r9,0x42(%rcx)
2646L(bkP2Q8):
2647	mov    0x3a(%rdx),%r11
2648	mov    %r11,0x3a(%rcx)
2649L(bkP2Q7):
2650	mov    0x32(%rdx),%r10
2651	mov    %r10,0x32(%rcx)
2652L(bkP2Q6):
2653	mov    0x2a(%rdx),%r9
2654	mov    %r9,0x2a(%rcx)
2655L(bkP2Q5):
2656	mov    0x22(%rdx),%r11
2657	mov    %r11,0x22(%rcx)
2658L(bkP2Q4):
2659	mov    0x1a(%rdx),%r10
2660	mov    %r10,0x1a(%rcx)
2661L(bkP2Q3):
2662	mov    0x12(%rdx),%r9
2663	mov    %r9,0x12(%rcx)
2664L(bkP2Q2):
2665	mov    0xa(%rdx),%r11
2666	mov    %r11,0xa(%rcx)
2667L(bkP2Q1):
2668	mov    0x2(%rdx),%r10
2669	mov    %r10,0x2(%rcx)
/* Residual: the remaining 2 bytes at offset 0, moved as one word. */
2670L(bkP2Q0):
2671	mov    (%rdx),%r9w
2672	mov    %r9w,(%rcx)
2673	ret
2674
2675	.balign 16
/*
 * bkP3Qn ladder: moves 8*n + 3 bytes from the block addressed by %rdx
 * to the block addressed by %rcx, for n = 0..18 (Q0-Q9, then QA-QI for
 * 10-18).  Every label is an entry point named in the bkPxQx offset
 * table; execution falls through all lower rungs, so quadwords are
 * moved from the highest offset down to offset 3, then the 3 residual
 * bytes are moved as a word (offset 1) plus a byte (offset 0).
 * %r9, %r10 and %r11 are used as scratch.
 *
 * In the residual tail both loads are issued before either store, as
 * the comment on the Q0 label states, so no tail store can overwrite a
 * source byte that the tail has not read yet.
 */
2676L(bkP3QI):
2677	mov    0x8b(%rdx),%r10
2678	mov    %r10,0x8b(%rcx)
2679L(bkP3QH):
2680	mov    0x83(%rdx),%r11
2681	mov    %r11,0x83(%rcx)
2682L(bkP3QG):
2683	mov    0x7b(%rdx),%r10
2684	mov    %r10,0x7b(%rcx)
2685L(bkP3QF):
2686	mov    0x73(%rdx),%r9
2687	mov    %r9,0x73(%rcx)
2688L(bkP3QE):
2689	mov    0x6b(%rdx),%r11
2690	mov    %r11,0x6b(%rcx)
2691L(bkP3QD):
2692	mov    0x63(%rdx),%r10
2693	mov    %r10,0x63(%rcx)
2694L(bkP3QC):
2695	mov    0x5b(%rdx),%r9
2696	mov    %r9,0x5b(%rcx)
2697L(bkP3QB):
2698	mov    0x53(%rdx),%r11
2699	mov    %r11,0x53(%rcx)
2700L(bkP3QA):
2701	mov    0x4b(%rdx),%r10
2702	mov    %r10,0x4b(%rcx)
2703L(bkP3Q9):
2704	mov    0x43(%rdx),%r9
2705	mov    %r9,0x43(%rcx)
2706L(bkP3Q8):
2707	mov    0x3b(%rdx),%r11
2708	mov    %r11,0x3b(%rcx)
2709L(bkP3Q7):
2710	mov    0x33(%rdx),%r10
2711	mov    %r10,0x33(%rcx)
2712L(bkP3Q6):
2713	mov    0x2b(%rdx),%r9
2714	mov    %r9,0x2b(%rcx)
2715L(bkP3Q5):
2716	mov    0x23(%rdx),%r11
2717	mov    %r11,0x23(%rcx)
2718L(bkP3Q4):
2719	mov    0x1b(%rdx),%r10
2720	mov    %r10,0x1b(%rcx)
2721L(bkP3Q3):
2722	mov    0x13(%rdx),%r9
2723	mov    %r9,0x13(%rcx)
2724L(bkP3Q2):
2725	mov    0xb(%rdx),%r11
2726	mov    %r11,0xb(%rcx)
2727L(bkP3Q1):
2728	mov    0x3(%rdx),%r10
2729	mov    %r10,0x3(%rcx)
2730L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores
2731	mov    0x1(%rdx),%r9w
2732	mov    (%rdx),%r10b
2733	mov    %r9w,0x1(%rcx)
2734	mov    %r10b,(%rcx)
2735	ret
2736
2737	.balign 16
/*
 * bkP4Qn ladder: moves 8*n + 4 bytes from the block addressed by %rdx
 * to the block addressed by %rcx, for n = 0..18 (Q0-Q9, then QA-QI for
 * 10-18).  Every label is an entry point named in the bkPxQx offset
 * table; execution falls through all lower rungs, so quadwords are
 * moved from the highest offset down to offset 4 and the 32-bit word
 * at offset 0 is moved last.  %r9, %r10 and %r11 are used as scratch.
 */
2738L(bkP4QI):
2739	mov    0x8c(%rdx),%r10
2740	mov    %r10,0x8c(%rcx)
2741L(bkP4QH):
2742	mov    0x84(%rdx),%r11
2743	mov    %r11,0x84(%rcx)
2744L(bkP4QG):
2745	mov    0x7c(%rdx),%r10
2746	mov    %r10,0x7c(%rcx)
2747L(bkP4QF):
2748	mov    0x74(%rdx),%r9
2749	mov    %r9,0x74(%rcx)
2750L(bkP4QE):
2751	mov    0x6c(%rdx),%r11
2752	mov    %r11,0x6c(%rcx)
2753L(bkP4QD):
2754	mov    0x64(%rdx),%r10
2755	mov    %r10,0x64(%rcx)
2756L(bkP4QC):
2757	mov    0x5c(%rdx),%r9
2758	mov    %r9,0x5c(%rcx)
2759L(bkP4QB):
2760	mov    0x54(%rdx),%r11
2761	mov    %r11,0x54(%rcx)
2762L(bkP4QA):
2763	mov    0x4c(%rdx),%r10
2764	mov    %r10,0x4c(%rcx)
2765L(bkP4Q9):
2766	mov    0x44(%rdx),%r9
2767	mov    %r9,0x44(%rcx)
2768L(bkP4Q8):
2769	mov    0x3c(%rdx),%r11
2770	mov    %r11,0x3c(%rcx)
2771L(bkP4Q7):
2772	mov    0x34(%rdx),%r10
2773	mov    %r10,0x34(%rcx)
2774L(bkP4Q6):
2775	mov    0x2c(%rdx),%r9
2776	mov    %r9,0x2c(%rcx)
2777L(bkP4Q5):
2778	mov    0x24(%rdx),%r11
2779	mov    %r11,0x24(%rcx)
2780L(bkP4Q4):
2781	mov    0x1c(%rdx),%r10
2782	mov    %r10,0x1c(%rcx)
2783L(bkP4Q3):
2784	mov    0x14(%rdx),%r9
2785	mov    %r9,0x14(%rcx)
2786L(bkP4Q2):
2787	mov    0xc(%rdx),%r11
2788	mov    %r11,0xc(%rcx)
2789L(bkP4Q1):
2790	mov    0x4(%rdx),%r10
2791	mov    %r10,0x4(%rcx)
/* Residual: the remaining 4 bytes at offset 0, moved as one dword. */
2792L(bkP4Q0):
2793	mov    (%rdx),%r9d
2794	mov    %r9d,(%rcx)
2795	ret
2796
2797	.balign 16
/*
 * bkP5Qn ladder: moves 8*n + 5 bytes from the block addressed by %rdx
 * to the block addressed by %rcx, for n = 0..18 (Q0-Q9, then QA-QI for
 * 10-18).  Every label is an entry point named in the bkPxQx offset
 * table; execution falls through all lower rungs, so quadwords are
 * moved from the highest offset down to offset 5, then the 5 residual
 * bytes are moved as a dword (offset 1) plus a byte (offset 0).
 * %r9, %r10 and %r11 are used as scratch.
 *
 * In the residual tail both loads are issued before either store, as
 * the comment on the Q0 label states, so no tail store can overwrite a
 * source byte that the tail has not read yet.
 */
2798L(bkP5QI):
2799	mov    0x8d(%rdx),%r10
2800	mov    %r10,0x8d(%rcx)
2801L(bkP5QH):
2802	mov    0x85(%rdx),%r9
2803	mov    %r9,0x85(%rcx)
2804L(bkP5QG):
2805	mov    0x7d(%rdx),%r11
2806	mov    %r11,0x7d(%rcx)
2807L(bkP5QF):
2808	mov    0x75(%rdx),%r10
2809	mov    %r10,0x75(%rcx)
2810L(bkP5QE):
2811	mov    0x6d(%rdx),%r9
2812	mov    %r9,0x6d(%rcx)
2813L(bkP5QD):
2814	mov    0x65(%rdx),%r11
2815	mov    %r11,0x65(%rcx)
2816L(bkP5QC):
2817	mov    0x5d(%rdx),%r10
2818	mov    %r10,0x5d(%rcx)
2819L(bkP5QB):
2820	mov    0x55(%rdx),%r9
2821	mov    %r9,0x55(%rcx)
2822L(bkP5QA):
2823	mov    0x4d(%rdx),%r11
2824	mov    %r11,0x4d(%rcx)
2825L(bkP5Q9):
2826	mov    0x45(%rdx),%r10
2827	mov    %r10,0x45(%rcx)
2828L(bkP5Q8):
2829	mov    0x3d(%rdx),%r9
2830	mov    %r9,0x3d(%rcx)
2831L(bkP5Q7):
2832	mov    0x35(%rdx),%r11
2833	mov    %r11,0x35(%rcx)
2834L(bkP5Q6):
2835	mov    0x2d(%rdx),%r10
2836	mov    %r10,0x2d(%rcx)
2837L(bkP5Q5):
2838	mov    0x25(%rdx),%r9
2839	mov    %r9,0x25(%rcx)
2840L(bkP5Q4):
2841	mov    0x1d(%rdx),%r11
2842	mov    %r11,0x1d(%rcx)
2843L(bkP5Q3):
2844	mov    0x15(%rdx),%r10
2845	mov    %r10,0x15(%rcx)
2846L(bkP5Q2):
2847	mov    0xd(%rdx),%r9
2848	mov    %r9,0xd(%rcx)
2849L(bkP5Q1):
2850	mov    0x5(%rdx),%r11
2851	mov    %r11,0x5(%rcx)
2852L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores
2853	mov    0x1(%rdx),%r9d
2854	mov    (%rdx),%r10b
2855	mov    %r9d,0x1(%rcx)
2856	mov    %r10b,(%rcx)
2857	ret
2858
2859	.balign 16
/*
 * bkP6Qn ladder: moves 8*n + 6 bytes from the block addressed by %rdx
 * to the block addressed by %rcx, for n = 0..18 (Q0-Q9, then QA-QI for
 * 10-18).  Every label is an entry point named in the bkPxQx offset
 * table; execution falls through all lower rungs, so quadwords are
 * moved from the highest offset down to offset 6, then the 6 residual
 * bytes are moved as a dword (offset 2) plus a word (offset 0).
 * %r9, %r10 and %r11 are used as scratch.
 *
 * In the residual tail both loads are issued before either store, as
 * the comment on the Q0 label states, so no tail store can overwrite a
 * source byte that the tail has not read yet.
 */
2860L(bkP6QI):
2861	mov    0x8e(%rdx),%r10
2862	mov    %r10,0x8e(%rcx)
2863L(bkP6QH):
2864	mov    0x86(%rdx),%r11
2865	mov    %r11,0x86(%rcx)
2866L(bkP6QG):
2867	mov    0x7e(%rdx),%r10
2868	mov    %r10,0x7e(%rcx)
2869L(bkP6QF):
2870	mov    0x76(%rdx),%r9
2871	mov    %r9,0x76(%rcx)
2872L(bkP6QE):
2873	mov    0x6e(%rdx),%r11
2874	mov    %r11,0x6e(%rcx)
2875L(bkP6QD):
2876	mov    0x66(%rdx),%r10
2877	mov    %r10,0x66(%rcx)
2878L(bkP6QC):
2879	mov    0x5e(%rdx),%r9
2880	mov    %r9,0x5e(%rcx)
2881L(bkP6QB):
2882	mov    0x56(%rdx),%r11
2883	mov    %r11,0x56(%rcx)
2884L(bkP6QA):
2885	mov    0x4e(%rdx),%r10
2886	mov    %r10,0x4e(%rcx)
2887L(bkP6Q9):
2888	mov    0x46(%rdx),%r9
2889	mov    %r9,0x46(%rcx)
2890L(bkP6Q8):
2891	mov    0x3e(%rdx),%r11
2892	mov    %r11,0x3e(%rcx)
2893L(bkP6Q7):
2894	mov    0x36(%rdx),%r10
2895	mov    %r10,0x36(%rcx)
2896L(bkP6Q6):
2897	mov    0x2e(%rdx),%r9
2898	mov    %r9,0x2e(%rcx)
2899L(bkP6Q5):
2900	mov    0x26(%rdx),%r11
2901	mov    %r11,0x26(%rcx)
2902L(bkP6Q4):
2903	mov    0x1e(%rdx),%r10
2904	mov    %r10,0x1e(%rcx)
2905L(bkP6Q3):
2906	mov    0x16(%rdx),%r9
2907	mov    %r9,0x16(%rcx)
2908L(bkP6Q2):
2909	mov    0xe(%rdx),%r11
2910	mov    %r11,0xe(%rcx)
2911L(bkP6Q1):
2912	mov    0x6(%rdx),%r10
2913	mov    %r10,0x6(%rcx)
2914L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores
2915	mov    0x2(%rdx),%r9d
2916	mov    (%rdx),%r10w
2917	mov    %r9d,0x2(%rcx)
2918	mov    %r10w,(%rcx)
2919	ret
2920
2921	.balign 16
/*
 * bkP7Qn ladder: moves 8*n + 7 bytes from the block addressed by %rdx
 * to the block addressed by %rcx, for n = 0..18 (Q0-Q9, then QA-QI for
 * 10-18).  Every label is an entry point named in the bkPxQx offset
 * table; execution falls through all lower rungs, so quadwords are
 * moved from the highest offset down to offset 7, then the 7 residual
 * bytes are moved as a dword (offset 3), a word (offset 1) and a byte
 * (offset 0).  %r9, %r10 and %r11 are used as scratch.
 *
 * In the residual tail all three loads are issued before any store, as
 * the comment on the Q0 label states, so no tail store can overwrite a
 * source byte that the tail has not read yet.
 */
2922L(bkP7QI):
2923	mov    0x8f(%rdx),%r10
2924	mov    %r10,0x8f(%rcx)
2925L(bkP7QH):
2926	mov    0x87(%rdx),%r11
2927	mov    %r11,0x87(%rcx)
2928L(bkP7QG):
2929	mov    0x7f(%rdx),%r10
2930	mov    %r10,0x7f(%rcx)
2931L(bkP7QF):
2932	mov    0x77(%rdx),%r9
2933	mov    %r9,0x77(%rcx)
2934L(bkP7QE):
2935	mov    0x6f(%rdx),%r11
2936	mov    %r11,0x6f(%rcx)
2937L(bkP7QD):
2938	mov    0x67(%rdx),%r10
2939	mov    %r10,0x67(%rcx)
2940L(bkP7QC):
2941	mov    0x5f(%rdx),%r9
2942	mov    %r9,0x5f(%rcx)
2943L(bkP7QB):
2944	mov    0x57(%rdx),%r11
2945	mov    %r11,0x57(%rcx)
2946L(bkP7QA):
2947	mov    0x4f(%rdx),%r10
2948	mov    %r10,0x4f(%rcx)
2949L(bkP7Q9):
2950	mov    0x47(%rdx),%r9
2951	mov    %r9,0x47(%rcx)
2952L(bkP7Q8):
2953	mov    0x3f(%rdx),%r11
2954	mov    %r11,0x3f(%rcx)
2955L(bkP7Q7):
2956	mov    0x37(%rdx),%r10
2957	mov    %r10,0x37(%rcx)
2958L(bkP7Q6):
2959	mov    0x2f(%rdx),%r9
2960	mov    %r9,0x2f(%rcx)
2961L(bkP7Q5):
2962	mov    0x27(%rdx),%r11
2963	mov    %r11,0x27(%rcx)
2964L(bkP7Q4):
2965	mov    0x1f(%rdx),%r10
2966	mov    %r10,0x1f(%rcx)
2967L(bkP7Q3):
2968	mov    0x17(%rdx),%r9
2969	mov    %r9,0x17(%rcx)
2970L(bkP7Q2):
2971	mov    0xf(%rdx),%r11
2972	mov    %r11,0xf(%rcx)
2973L(bkP7Q1):
2974	mov    0x7(%rdx),%r10
2975	mov    %r10,0x7(%rcx)
2976L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores
2977	mov    0x3(%rdx),%r9d
2978	mov    0x1(%rdx),%r10w
2979	mov    (%rdx),%r11b
2980	mov    %r9d,0x3(%rcx)
2981	mov    %r10w,0x1(%rcx)
2982	mov    %r11b,(%rcx)
2983	ret
2984
2985		.balign 16
/*
 * bkPxQx: table of 32-bit offsets, each relative to the start of this
 * table, of the bkPnQm entry points.  Entries come in groups of eight,
 * one group per quadword count (Q0 through QI, i.e. 0..18), with P
 * running 0-7 inside each group.  Entry i therefore names label
 * bkP(i mod 8)Q(i div 8), and the ladders reached that way move
 * 8*Q + P = i bytes, for i = 0..151.
 *
 * NOTE(review): the dispatch code that reads this table is not in this
 * part of the file; presumably it indexes by remaining byte count and
 * adds the fetched offset to the table address -- confirm at the
 * jump site.
 */
2986L(bkPxQx):	.int L(bkP0Q0)-L(bkPxQx)
2987		.int L(bkP1Q0)-L(bkPxQx)
2988		.int L(bkP2Q0)-L(bkPxQx)
2989		.int L(bkP3Q0)-L(bkPxQx)
2990		.int L(bkP4Q0)-L(bkPxQx)
2991		.int L(bkP5Q0)-L(bkPxQx)
2992		.int L(bkP6Q0)-L(bkPxQx)
2993		.int L(bkP7Q0)-L(bkPxQx)
2994
2995		.int L(bkP0Q1)-L(bkPxQx)
2996		.int L(bkP1Q1)-L(bkPxQx)
2997		.int L(bkP2Q1)-L(bkPxQx)
2998		.int L(bkP3Q1)-L(bkPxQx)
2999		.int L(bkP4Q1)-L(bkPxQx)
3000		.int L(bkP5Q1)-L(bkPxQx)
3001		.int L(bkP6Q1)-L(bkPxQx)
3002		.int L(bkP7Q1)-L(bkPxQx)
3003
3004		.int L(bkP0Q2)-L(bkPxQx)
3005		.int L(bkP1Q2)-L(bkPxQx)
3006		.int L(bkP2Q2)-L(bkPxQx)
3007		.int L(bkP3Q2)-L(bkPxQx)
3008		.int L(bkP4Q2)-L(bkPxQx)
3009		.int L(bkP5Q2)-L(bkPxQx)
3010		.int L(bkP6Q2)-L(bkPxQx)
3011		.int L(bkP7Q2)-L(bkPxQx)
3012
3013		.int L(bkP0Q3)-L(bkPxQx)
3014		.int L(bkP1Q3)-L(bkPxQx)
3015		.int L(bkP2Q3)-L(bkPxQx)
3016		.int L(bkP3Q3)-L(bkPxQx)
3017		.int L(bkP4Q3)-L(bkPxQx)
3018		.int L(bkP5Q3)-L(bkPxQx)
3019		.int L(bkP6Q3)-L(bkPxQx)
3020		.int L(bkP7Q3)-L(bkPxQx)
3021
3022		.int L(bkP0Q4)-L(bkPxQx)
3023		.int L(bkP1Q4)-L(bkPxQx)
3024		.int L(bkP2Q4)-L(bkPxQx)
3025		.int L(bkP3Q4)-L(bkPxQx)
3026		.int L(bkP4Q4)-L(bkPxQx)
3027		.int L(bkP5Q4)-L(bkPxQx)
3028		.int L(bkP6Q4)-L(bkPxQx)
3029		.int L(bkP7Q4)-L(bkPxQx)
3030
3031		.int L(bkP0Q5)-L(bkPxQx)
3032		.int L(bkP1Q5)-L(bkPxQx)
3033		.int L(bkP2Q5)-L(bkPxQx)
3034		.int L(bkP3Q5)-L(bkPxQx)
3035		.int L(bkP4Q5)-L(bkPxQx)
3036		.int L(bkP5Q5)-L(bkPxQx)
3037		.int L(bkP6Q5)-L(bkPxQx)
3038		.int L(bkP7Q5)-L(bkPxQx)
3039
3040		.int L(bkP0Q6)-L(bkPxQx)
3041		.int L(bkP1Q6)-L(bkPxQx)
3042		.int L(bkP2Q6)-L(bkPxQx)
3043		.int L(bkP3Q6)-L(bkPxQx)
3044		.int L(bkP4Q6)-L(bkPxQx)
3045		.int L(bkP5Q6)-L(bkPxQx)
3046		.int L(bkP6Q6)-L(bkPxQx)
3047		.int L(bkP7Q6)-L(bkPxQx)
3048
3049		.int L(bkP0Q7)-L(bkPxQx)
3050		.int L(bkP1Q7)-L(bkPxQx)
3051		.int L(bkP2Q7)-L(bkPxQx)
3052		.int L(bkP3Q7)-L(bkPxQx)
3053		.int L(bkP4Q7)-L(bkPxQx)
3054		.int L(bkP5Q7)-L(bkPxQx)
3055		.int L(bkP6Q7)-L(bkPxQx)
3056		.int L(bkP7Q7)-L(bkPxQx)
3057
3058		.int L(bkP0Q8)-L(bkPxQx)
3059		.int L(bkP1Q8)-L(bkPxQx)
3060		.int L(bkP2Q8)-L(bkPxQx)
3061		.int L(bkP3Q8)-L(bkPxQx)
3062		.int L(bkP4Q8)-L(bkPxQx)
3063		.int L(bkP5Q8)-L(bkPxQx)
3064		.int L(bkP6Q8)-L(bkPxQx)
3065		.int L(bkP7Q8)-L(bkPxQx)
3066
3067		.int L(bkP0Q9)-L(bkPxQx)
3068		.int L(bkP1Q9)-L(bkPxQx)
3069		.int L(bkP2Q9)-L(bkPxQx)
3070		.int L(bkP3Q9)-L(bkPxQx)
3071		.int L(bkP4Q9)-L(bkPxQx)
3072		.int L(bkP5Q9)-L(bkPxQx)
3073		.int L(bkP6Q9)-L(bkPxQx)
3074		.int L(bkP7Q9)-L(bkPxQx)
3075
3076		.int L(bkP0QA)-L(bkPxQx)
3077		.int L(bkP1QA)-L(bkPxQx)
3078		.int L(bkP2QA)-L(bkPxQx)
3079		.int L(bkP3QA)-L(bkPxQx)
3080		.int L(bkP4QA)-L(bkPxQx)
3081		.int L(bkP5QA)-L(bkPxQx)
3082		.int L(bkP6QA)-L(bkPxQx)
3083		.int L(bkP7QA)-L(bkPxQx)
3084
3085		.int L(bkP0QB)-L(bkPxQx)
3086		.int L(bkP1QB)-L(bkPxQx)
3087		.int L(bkP2QB)-L(bkPxQx)
3088		.int L(bkP3QB)-L(bkPxQx)
3089		.int L(bkP4QB)-L(bkPxQx)
3090		.int L(bkP5QB)-L(bkPxQx)
3091		.int L(bkP6QB)-L(bkPxQx)
3092		.int L(bkP7QB)-L(bkPxQx)
3093
3094		.int L(bkP0QC)-L(bkPxQx)
3095		.int L(bkP1QC)-L(bkPxQx)
3096		.int L(bkP2QC)-L(bkPxQx)
3097		.int L(bkP3QC)-L(bkPxQx)
3098		.int L(bkP4QC)-L(bkPxQx)
3099		.int L(bkP5QC)-L(bkPxQx)
3100		.int L(bkP6QC)-L(bkPxQx)
3101		.int L(bkP7QC)-L(bkPxQx)
3102
3103		.int L(bkP0QD)-L(bkPxQx)
3104		.int L(bkP1QD)-L(bkPxQx)
3105		.int L(bkP2QD)-L(bkPxQx)
3106		.int L(bkP3QD)-L(bkPxQx)
3107		.int L(bkP4QD)-L(bkPxQx)
3108		.int L(bkP5QD)-L(bkPxQx)
3109		.int L(bkP6QD)-L(bkPxQx)
3110		.int L(bkP7QD)-L(bkPxQx)
3111
3112		.int L(bkP0QE)-L(bkPxQx)
3113		.int L(bkP1QE)-L(bkPxQx)
3114		.int L(bkP2QE)-L(bkPxQx)
3115		.int L(bkP3QE)-L(bkPxQx)
3116		.int L(bkP4QE)-L(bkPxQx)
3117		.int L(bkP5QE)-L(bkPxQx)
3118		.int L(bkP6QE)-L(bkPxQx)
3119		.int L(bkP7QE)-L(bkPxQx)
3120
3121		.int L(bkP0QF)-L(bkPxQx)
3122		.int L(bkP1QF)-L(bkPxQx)
3123		.int L(bkP2QF)-L(bkPxQx)
3124		.int L(bkP3QF)-L(bkPxQx)
3125		.int L(bkP4QF)-L(bkPxQx)
3126		.int L(bkP5QF)-L(bkPxQx)
3127		.int L(bkP6QF)-L(bkPxQx)
3128		.int L(bkP7QF)-L(bkPxQx)
3129
3130		.int L(bkP0QG)-L(bkPxQx)
3131		.int L(bkP1QG)-L(bkPxQx)
3132		.int L(bkP2QG)-L(bkPxQx)
3133		.int L(bkP3QG)-L(bkPxQx)
3134		.int L(bkP4QG)-L(bkPxQx)
3135		.int L(bkP5QG)-L(bkPxQx)
3136		.int L(bkP6QG)-L(bkPxQx)
3137		.int L(bkP7QG)-L(bkPxQx)
3138
3139		.int L(bkP0QH)-L(bkPxQx)
3140		.int L(bkP1QH)-L(bkPxQx)
3141		.int L(bkP2QH)-L(bkPxQx)
3142		.int L(bkP3QH)-L(bkPxQx)
3143		.int L(bkP4QH)-L(bkPxQx)
3144		.int L(bkP5QH)-L(bkPxQx)
3145		.int L(bkP6QH)-L(bkPxQx)
3146		.int L(bkP7QH)-L(bkPxQx)
3147
3148		.int L(bkP0QI)-L(bkPxQx)
3149		.int L(bkP1QI)-L(bkPxQx)
3150		.int L(bkP2QI)-L(bkPxQx)
3151		.int L(bkP3QI)-L(bkPxQx)
3152		.int L(bkP4QI)-L(bkPxQx)
3153		.int L(bkP5QI)-L(bkPxQx)
3154		.int L(bkP6QI)-L(bkPxQx)
3155		.int L(bkP7QI)-L(bkPxQx)
3156
/*
 * SET_SIZE is supplied by <sys/asm_linkage.h>; presumably it records
 * the size of the memmove symbol opened earlier in the file -- confirm
 * against the matching ENTRY.
 */
3157	SET_SIZE(memmove)
3158