xref: /titanic_41/usr/src/lib/libc/amd64/gen/memset.s (revision fad5204e207119133cdc503293923b09417b233b)
17c478bd9Sstevel@tonic-gate/*
2d0b3732eSbholler * CDDL HEADER START
3d0b3732eSbholler *
4d0b3732eSbholler * The contents of this file are subject to the terms of the
5d0b3732eSbholler * Common Development and Distribution License (the "License").
6d0b3732eSbholler * You may not use this file except in compliance with the License.
7d0b3732eSbholler *
8d0b3732eSbholler * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9d0b3732eSbholler * or http://www.opensolaris.org/os/licensing.
10d0b3732eSbholler * See the License for the specific language governing permissions
11d0b3732eSbholler * and limitations under the License.
12d0b3732eSbholler *
13d0b3732eSbholler * When distributing Covered Code, include this CDDL HEADER in each
14d0b3732eSbholler * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15d0b3732eSbholler * If applicable, add the following below this CDDL HEADER, with the
16d0b3732eSbholler * fields enclosed by brackets "[]" replaced with your own identifying
17d0b3732eSbholler * information: Portions Copyright [yyyy] [name of copyright owner]
18d0b3732eSbholler *
19d0b3732eSbholler * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate
227c478bd9Sstevel@tonic-gate/*
23*fad5204eSbostrovs * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
247257d1b4Sraf * Use is subject to license terms.
257257d1b4Sraf */
267257d1b4Sraf
277257d1b4Sraf/*
28d0b3732eSbholler * Copyright (c) 2008, Intel Corporation
297c478bd9Sstevel@tonic-gate * All rights reserved.
307c478bd9Sstevel@tonic-gate */
317257d1b4Sraf
32*fad5204eSbostrovs/*
33*fad5204eSbostrovs * Portions Copyright 2009 Advanced Micro Devices, Inc.
34*fad5204eSbostrovs */
35*fad5204eSbostrovs
369a70fc3bSMark J. Nelson	.file	"memset.s"
377c478bd9Sstevel@tonic-gate
387c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
397c478bd9Sstevel@tonic-gate
407c478bd9Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memset,function)
417c478bd9Sstevel@tonic-gate
427c478bd9Sstevel@tonic-gate#include "cache.h"
43d0b3732eSbholler#include "proc64_id.h"
447c478bd9Sstevel@tonic-gate
45d0b3732eSbholler#define L(s) .memset/**/s
467c478bd9Sstevel@tonic-gate
/*
 * memset algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
 * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
 * future AMD processors.
 *
 *
 * If (size < 144 bytes) {
 *	do unrolled code (primarily 8-byte stores) regardless of alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > largest level cache) {
 *			Use 8-byte non-temporal stores (64-bytes/loop)
 *		} else {
 *			if (size >= 2K) {
 *				Use rep stosq
 *			} else {
 *				Use 8-byte stores (128 bytes per loop)
 *			}
 *		}
 *
 *	} else { **USE SSE**
 *		If (remaining size < 192 bytes) {
 *			do unrolled code using primarily 16-byte stores (SSE2)
 *		} else {
 *			If (size > largest level cache) {
 *				Use 16-byte non-temporal stores (128-bytes/loop)
 *			} else {
 *				Use 16-byte stores (128 bytes per loop)
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 */
907c478bd9Sstevel@tonic-gate
91d0b3732eSbholler		ENTRY(memset)		# (void *, const void*, size_t)
92d0b3732eSbholler		cmp    $0x1,%rdx
93d0b3732eSbholler		mov    %rdi,%rax	# memset returns the dest address
94d0b3732eSbholler		jne    L(ck2)
957c478bd9Sstevel@tonic-gate		mov    %sil,(%rdi)
96d0b3732eSbholler		ret
97d0b3732eSbhollerL(ck2):
98d0b3732eSbholler		mov    $0x0101010101010101,%r9
99d0b3732eSbholler		mov    %rdx,%r8
100d0b3732eSbholler		movzbq %sil,%rdx
101d0b3732eSbholler		imul   %r9,%rdx		# clone value 8 times
1027c478bd9Sstevel@tonic-gate
103d0b3732eSbholler		cmp    $0x90,%r8	# 144
104d0b3732eSbholler		jge    L(ck_align)
1057c478bd9Sstevel@tonic-gate
106d0b3732eSbholler		lea    L(setPxQx)(%rip),%r11
107d0b3732eSbholler		add    %r8,%rdi
1087c478bd9Sstevel@tonic-gate
109d0b3732eSbholler		movslq (%r11,%r8,4),%rcx
110d0b3732eSbholler		lea    (%rcx,%r11,1),%r11
111d0b3732eSbholler		jmpq   *%r11
1127c478bd9Sstevel@tonic-gate
113d0b3732eSbholler		.balign 16
114d0b3732eSbhollerL(setPxQx):	.int       L(P0Q0)-L(setPxQx)
115d0b3732eSbholler		.int       L(P1Q0)-L(setPxQx)
116d0b3732eSbholler		.int       L(P2Q0)-L(setPxQx)
117d0b3732eSbholler		.int       L(P3Q0)-L(setPxQx)
118d0b3732eSbholler		.int       L(P4Q0)-L(setPxQx)
119d0b3732eSbholler		.int       L(P5Q0)-L(setPxQx)
120d0b3732eSbholler		.int       L(P6Q0)-L(setPxQx)
121d0b3732eSbholler		.int       L(P7Q0)-L(setPxQx)
1227c478bd9Sstevel@tonic-gate
123d0b3732eSbholler		.int       L(P0Q1)-L(setPxQx)
124d0b3732eSbholler		.int       L(P1Q1)-L(setPxQx)
125d0b3732eSbholler		.int       L(P2Q1)-L(setPxQx)
126d0b3732eSbholler		.int       L(P3Q1)-L(setPxQx)
127d0b3732eSbholler		.int       L(P4Q1)-L(setPxQx)
128d0b3732eSbholler		.int       L(P5Q1)-L(setPxQx)
129d0b3732eSbholler		.int       L(P6Q1)-L(setPxQx)
130d0b3732eSbholler		.int       L(P7Q1)-L(setPxQx)
1317c478bd9Sstevel@tonic-gate
132d0b3732eSbholler		.int       L(P0Q2)-L(setPxQx)
133d0b3732eSbholler		.int       L(P1Q2)-L(setPxQx)
134d0b3732eSbholler		.int       L(P2Q2)-L(setPxQx)
135d0b3732eSbholler		.int       L(P3Q2)-L(setPxQx)
136d0b3732eSbholler		.int       L(P4Q2)-L(setPxQx)
137d0b3732eSbholler		.int       L(P5Q2)-L(setPxQx)
138d0b3732eSbholler		.int       L(P6Q2)-L(setPxQx)
139d0b3732eSbholler		.int       L(P7Q2)-L(setPxQx)
1407c478bd9Sstevel@tonic-gate
141d0b3732eSbholler		.int       L(P0Q3)-L(setPxQx)
142d0b3732eSbholler		.int       L(P1Q3)-L(setPxQx)
143d0b3732eSbholler		.int       L(P2Q3)-L(setPxQx)
144d0b3732eSbholler		.int       L(P3Q3)-L(setPxQx)
145d0b3732eSbholler		.int       L(P4Q3)-L(setPxQx)
146d0b3732eSbholler		.int       L(P5Q3)-L(setPxQx)
147d0b3732eSbholler		.int       L(P6Q3)-L(setPxQx)
148d0b3732eSbholler		.int       L(P7Q3)-L(setPxQx)
1497c478bd9Sstevel@tonic-gate
150d0b3732eSbholler		.int       L(P0Q4)-L(setPxQx)
151d0b3732eSbholler		.int       L(P1Q4)-L(setPxQx)
152d0b3732eSbholler		.int       L(P2Q4)-L(setPxQx)
153d0b3732eSbholler		.int       L(P3Q4)-L(setPxQx)
154d0b3732eSbholler		.int       L(P4Q4)-L(setPxQx)
155d0b3732eSbholler		.int       L(P5Q4)-L(setPxQx)
156d0b3732eSbholler		.int       L(P6Q4)-L(setPxQx)
157d0b3732eSbholler		.int       L(P7Q4)-L(setPxQx)
1587c478bd9Sstevel@tonic-gate
159d0b3732eSbholler		.int       L(P0Q5)-L(setPxQx)
160d0b3732eSbholler		.int       L(P1Q5)-L(setPxQx)
161d0b3732eSbholler		.int       L(P2Q5)-L(setPxQx)
162d0b3732eSbholler		.int       L(P3Q5)-L(setPxQx)
163d0b3732eSbholler		.int       L(P4Q5)-L(setPxQx)
164d0b3732eSbholler		.int       L(P5Q5)-L(setPxQx)
165d0b3732eSbholler		.int       L(P6Q5)-L(setPxQx)
166d0b3732eSbholler		.int       L(P7Q5)-L(setPxQx)
1677c478bd9Sstevel@tonic-gate
168d0b3732eSbholler		.int       L(P0Q6)-L(setPxQx)
169d0b3732eSbholler		.int       L(P1Q6)-L(setPxQx)
170d0b3732eSbholler		.int       L(P2Q6)-L(setPxQx)
171d0b3732eSbholler		.int       L(P3Q6)-L(setPxQx)
172d0b3732eSbholler		.int       L(P4Q6)-L(setPxQx)
173d0b3732eSbholler		.int       L(P5Q6)-L(setPxQx)
174d0b3732eSbholler		.int       L(P6Q6)-L(setPxQx)
175d0b3732eSbholler		.int       L(P7Q6)-L(setPxQx)
1767c478bd9Sstevel@tonic-gate
177d0b3732eSbholler		.int       L(P0Q7)-L(setPxQx)
178d0b3732eSbholler		.int       L(P1Q7)-L(setPxQx)
179d0b3732eSbholler		.int       L(P2Q7)-L(setPxQx)
180d0b3732eSbholler		.int       L(P3Q7)-L(setPxQx)
181d0b3732eSbholler		.int       L(P4Q7)-L(setPxQx)
182d0b3732eSbholler		.int       L(P5Q7)-L(setPxQx)
183d0b3732eSbholler		.int       L(P6Q7)-L(setPxQx)
184d0b3732eSbholler		.int       L(P7Q7)-L(setPxQx)
1857c478bd9Sstevel@tonic-gate
186d0b3732eSbholler		.int       L(P0Q8)-L(setPxQx)
187d0b3732eSbholler		.int       L(P1Q8)-L(setPxQx)
188d0b3732eSbholler		.int       L(P2Q8)-L(setPxQx)
189d0b3732eSbholler		.int       L(P3Q8)-L(setPxQx)
190d0b3732eSbholler		.int       L(P4Q8)-L(setPxQx)
191d0b3732eSbholler		.int       L(P5Q8)-L(setPxQx)
192d0b3732eSbholler		.int       L(P6Q8)-L(setPxQx)
193d0b3732eSbholler		.int       L(P7Q8)-L(setPxQx)
1947c478bd9Sstevel@tonic-gate
195d0b3732eSbholler		.int       L(P0Q9)-L(setPxQx)
196d0b3732eSbholler		.int       L(P1Q9)-L(setPxQx)
197d0b3732eSbholler		.int       L(P2Q9)-L(setPxQx)
198d0b3732eSbholler		.int       L(P3Q9)-L(setPxQx)
199d0b3732eSbholler		.int       L(P4Q9)-L(setPxQx)
200d0b3732eSbholler		.int       L(P5Q9)-L(setPxQx)
201d0b3732eSbholler		.int       L(P6Q9)-L(setPxQx)
202d0b3732eSbholler		.int       L(P7Q9)-L(setPxQx)
203d0b3732eSbholler
204d0b3732eSbholler		.int       L(P0QA)-L(setPxQx)
205d0b3732eSbholler		.int       L(P1QA)-L(setPxQx)
206d0b3732eSbholler		.int       L(P2QA)-L(setPxQx)
207d0b3732eSbholler		.int       L(P3QA)-L(setPxQx)
208d0b3732eSbholler		.int       L(P4QA)-L(setPxQx)
209d0b3732eSbholler		.int       L(P5QA)-L(setPxQx)
210d0b3732eSbholler		.int       L(P6QA)-L(setPxQx)
211d0b3732eSbholler		.int       L(P7QA)-L(setPxQx)
212d0b3732eSbholler
213d0b3732eSbholler		.int       L(P0QB)-L(setPxQx)
214d0b3732eSbholler		.int       L(P1QB)-L(setPxQx)
215d0b3732eSbholler		.int       L(P2QB)-L(setPxQx)
216d0b3732eSbholler		.int       L(P3QB)-L(setPxQx)
217d0b3732eSbholler		.int       L(P4QB)-L(setPxQx)
218d0b3732eSbholler		.int       L(P5QB)-L(setPxQx)
219d0b3732eSbholler		.int       L(P6QB)-L(setPxQx)
220d0b3732eSbholler		.int       L(P7QB)-L(setPxQx)
221d0b3732eSbholler
222d0b3732eSbholler		.int       L(P0QC)-L(setPxQx)
223d0b3732eSbholler		.int       L(P1QC)-L(setPxQx)
224d0b3732eSbholler		.int       L(P2QC)-L(setPxQx)
225d0b3732eSbholler		.int       L(P3QC)-L(setPxQx)
226d0b3732eSbholler		.int       L(P4QC)-L(setPxQx)
227d0b3732eSbholler		.int       L(P5QC)-L(setPxQx)
228d0b3732eSbholler		.int       L(P6QC)-L(setPxQx)
229d0b3732eSbholler		.int       L(P7QC)-L(setPxQx)
230d0b3732eSbholler
231d0b3732eSbholler		.int       L(P0QD)-L(setPxQx)
232d0b3732eSbholler		.int       L(P1QD)-L(setPxQx)
233d0b3732eSbholler		.int       L(P2QD)-L(setPxQx)
234d0b3732eSbholler		.int       L(P3QD)-L(setPxQx)
235d0b3732eSbholler		.int       L(P4QD)-L(setPxQx)
236d0b3732eSbholler		.int       L(P5QD)-L(setPxQx)
237d0b3732eSbholler		.int       L(P6QD)-L(setPxQx)
238d0b3732eSbholler		.int       L(P7QD)-L(setPxQx)
239d0b3732eSbholler
240d0b3732eSbholler		.int       L(P0QE)-L(setPxQx)	# 112
241d0b3732eSbholler		.int       L(P1QE)-L(setPxQx)
242d0b3732eSbholler		.int       L(P2QE)-L(setPxQx)
243d0b3732eSbholler		.int       L(P3QE)-L(setPxQx)
244d0b3732eSbholler		.int       L(P4QE)-L(setPxQx)
245d0b3732eSbholler		.int       L(P5QE)-L(setPxQx)
246d0b3732eSbholler		.int       L(P6QE)-L(setPxQx)
247d0b3732eSbholler		.int       L(P7QE)-L(setPxQx)
248d0b3732eSbholler
249d0b3732eSbholler		.int       L(P0QF)-L(setPxQx)	#120
250d0b3732eSbholler		.int       L(P1QF)-L(setPxQx)
251d0b3732eSbholler		.int       L(P2QF)-L(setPxQx)
252d0b3732eSbholler		.int       L(P3QF)-L(setPxQx)
253d0b3732eSbholler		.int       L(P4QF)-L(setPxQx)
254d0b3732eSbholler		.int       L(P5QF)-L(setPxQx)
255d0b3732eSbholler		.int       L(P6QF)-L(setPxQx)
256d0b3732eSbholler		.int       L(P7QF)-L(setPxQx)
257d0b3732eSbholler
258d0b3732eSbholler		.int       L(P0QG)-L(setPxQx)	#128
259d0b3732eSbholler		.int       L(P1QG)-L(setPxQx)
260d0b3732eSbholler		.int       L(P2QG)-L(setPxQx)
261d0b3732eSbholler		.int       L(P3QG)-L(setPxQx)
262d0b3732eSbholler		.int       L(P4QG)-L(setPxQx)
263d0b3732eSbholler		.int       L(P5QG)-L(setPxQx)
264d0b3732eSbholler		.int       L(P6QG)-L(setPxQx)
265d0b3732eSbholler		.int       L(P7QG)-L(setPxQx)
266d0b3732eSbholler
267d0b3732eSbholler		.int       L(P0QH)-L(setPxQx)	#136
268d0b3732eSbholler		.int       L(P1QH)-L(setPxQx)
269d0b3732eSbholler		.int       L(P2QH)-L(setPxQx)
270d0b3732eSbholler		.int       L(P3QH)-L(setPxQx)
271d0b3732eSbholler		.int       L(P4QH)-L(setPxQx)
272d0b3732eSbholler		.int       L(P5QH)-L(setPxQx)
273d0b3732eSbholler		.int       L(P6QH)-L(setPxQx)
274d0b3732eSbholler		.int       L(P7QH)-L(setPxQx)	#143
275d0b3732eSbholler
276d0b3732eSbholler		.balign 16
277d0b3732eSbhollerL(P1QH):	mov    %rdx,-0x89(%rdi)
278d0b3732eSbhollerL(P1QG):	mov    %rdx,-0x81(%rdi)
279d0b3732eSbholler		.balign 16
280d0b3732eSbhollerL(P1QF):	mov    %rdx,-0x79(%rdi)
281d0b3732eSbhollerL(P1QE):	mov    %rdx,-0x71(%rdi)
282d0b3732eSbhollerL(P1QD):	mov    %rdx,-0x69(%rdi)
283d0b3732eSbhollerL(P1QC):	mov    %rdx,-0x61(%rdi)
284d0b3732eSbhollerL(P1QB):	mov    %rdx,-0x59(%rdi)
285d0b3732eSbhollerL(P1QA):	mov    %rdx,-0x51(%rdi)
286d0b3732eSbhollerL(P1Q9):	mov    %rdx,-0x49(%rdi)
287d0b3732eSbhollerL(P1Q8):	mov    %rdx,-0x41(%rdi)
288d0b3732eSbhollerL(P1Q7):	mov    %rdx,-0x39(%rdi)
289d0b3732eSbhollerL(P1Q6):	mov    %rdx,-0x31(%rdi)
290d0b3732eSbhollerL(P1Q5):	mov    %rdx,-0x29(%rdi)
291d0b3732eSbhollerL(P1Q4):	mov    %rdx,-0x21(%rdi)
292d0b3732eSbhollerL(P1Q3):	mov    %rdx,-0x19(%rdi)
293d0b3732eSbhollerL(P1Q2):	mov    %rdx,-0x11(%rdi)
294d0b3732eSbhollerL(P1Q1):	mov    %rdx,-0x9(%rdi)
295d0b3732eSbhollerL(P1Q0):	mov    %dl,-0x1(%rdi)
2967c478bd9Sstevel@tonic-gate		ret
2977c478bd9Sstevel@tonic-gate
298d0b3732eSbholler		.balign 16
299d0b3732eSbhollerL(P0QH):	mov    %rdx,-0x88(%rdi)
300d0b3732eSbholler		.balign 16
301d0b3732eSbhollerL(P0QG):	mov    %rdx,-0x80(%rdi)
302d0b3732eSbhollerL(P0QF):	mov    %rdx,-0x78(%rdi)
303d0b3732eSbhollerL(P0QE):	mov    %rdx,-0x70(%rdi)
304d0b3732eSbhollerL(P0QD):	mov    %rdx,-0x68(%rdi)
305d0b3732eSbhollerL(P0QC):	mov    %rdx,-0x60(%rdi)
306d0b3732eSbhollerL(P0QB):	mov    %rdx,-0x58(%rdi)
307d0b3732eSbhollerL(P0QA):	mov    %rdx,-0x50(%rdi)
308d0b3732eSbhollerL(P0Q9):	mov    %rdx,-0x48(%rdi)
309d0b3732eSbhollerL(P0Q8):	mov    %rdx,-0x40(%rdi)
310d0b3732eSbhollerL(P0Q7):	mov    %rdx,-0x38(%rdi)
311d0b3732eSbhollerL(P0Q6):	mov    %rdx,-0x30(%rdi)
312d0b3732eSbhollerL(P0Q5):	mov    %rdx,-0x28(%rdi)
313d0b3732eSbhollerL(P0Q4):	mov    %rdx,-0x20(%rdi)
314d0b3732eSbhollerL(P0Q3):	mov    %rdx,-0x18(%rdi)
315d0b3732eSbhollerL(P0Q2):	mov    %rdx,-0x10(%rdi)
316d0b3732eSbhollerL(P0Q1):	mov    %rdx,-0x8(%rdi)
317d0b3732eSbhollerL(P0Q0):	ret
3187c478bd9Sstevel@tonic-gate
319d0b3732eSbholler		.balign 16
320d0b3732eSbhollerL(P2QH):	mov    %rdx,-0x8a(%rdi)
321d0b3732eSbhollerL(P2QG):	mov    %rdx,-0x82(%rdi)
322d0b3732eSbholler		.balign 16
323d0b3732eSbhollerL(P2QF):	mov    %rdx,-0x7a(%rdi)
324d0b3732eSbhollerL(P2QE):	mov    %rdx,-0x72(%rdi)
325d0b3732eSbhollerL(P2QD):	mov    %rdx,-0x6a(%rdi)
326d0b3732eSbhollerL(P2QC):	mov    %rdx,-0x62(%rdi)
327d0b3732eSbhollerL(P2QB):	mov    %rdx,-0x5a(%rdi)
328d0b3732eSbhollerL(P2QA):	mov    %rdx,-0x52(%rdi)
329d0b3732eSbhollerL(P2Q9):	mov    %rdx,-0x4a(%rdi)
330d0b3732eSbhollerL(P2Q8):	mov    %rdx,-0x42(%rdi)
331d0b3732eSbhollerL(P2Q7):	mov    %rdx,-0x3a(%rdi)
332d0b3732eSbhollerL(P2Q6):	mov    %rdx,-0x32(%rdi)
333d0b3732eSbhollerL(P2Q5):	mov    %rdx,-0x2a(%rdi)
334d0b3732eSbhollerL(P2Q4):	mov    %rdx,-0x22(%rdi)
335d0b3732eSbhollerL(P2Q3):	mov    %rdx,-0x1a(%rdi)
336d0b3732eSbhollerL(P2Q2):	mov    %rdx,-0x12(%rdi)
337d0b3732eSbhollerL(P2Q1):	mov    %rdx,-0xa(%rdi)
338d0b3732eSbhollerL(P2Q0):	mov    %dx,-0x2(%rdi)
3397c478bd9Sstevel@tonic-gate		ret
3407c478bd9Sstevel@tonic-gate
341d0b3732eSbholler		.balign 16
342d0b3732eSbhollerL(P3QH):	mov    %rdx,-0x8b(%rdi)
343d0b3732eSbhollerL(P3QG):	mov    %rdx,-0x83(%rdi)
344d0b3732eSbholler		.balign 16
345d0b3732eSbhollerL(P3QF):	mov    %rdx,-0x7b(%rdi)
346d0b3732eSbhollerL(P3QE):	mov    %rdx,-0x73(%rdi)
347d0b3732eSbhollerL(P3QD):	mov    %rdx,-0x6b(%rdi)
348d0b3732eSbhollerL(P3QC):	mov    %rdx,-0x63(%rdi)
349d0b3732eSbhollerL(P3QB):	mov    %rdx,-0x5b(%rdi)
350d0b3732eSbhollerL(P3QA):	mov    %rdx,-0x53(%rdi)
351d0b3732eSbhollerL(P3Q9):	mov    %rdx,-0x4b(%rdi)
352d0b3732eSbhollerL(P3Q8):	mov    %rdx,-0x43(%rdi)
353d0b3732eSbhollerL(P3Q7):	mov    %rdx,-0x3b(%rdi)
354d0b3732eSbhollerL(P3Q6):	mov    %rdx,-0x33(%rdi)
355d0b3732eSbhollerL(P3Q5):	mov    %rdx,-0x2b(%rdi)
356d0b3732eSbhollerL(P3Q4):	mov    %rdx,-0x23(%rdi)
357d0b3732eSbhollerL(P3Q3):	mov    %rdx,-0x1b(%rdi)
358d0b3732eSbhollerL(P3Q2):	mov    %rdx,-0x13(%rdi)
359d0b3732eSbhollerL(P3Q1):	mov    %rdx,-0xb(%rdi)
360d0b3732eSbhollerL(P3Q0):	mov    %dx,-0x3(%rdi)
361d0b3732eSbholler		mov    %dl,-0x1(%rdi)
3627c478bd9Sstevel@tonic-gate		ret
3637c478bd9Sstevel@tonic-gate
364d0b3732eSbholler		.balign 16
365d0b3732eSbhollerL(P4QH):	mov    %rdx,-0x8c(%rdi)
366d0b3732eSbhollerL(P4QG):	mov    %rdx,-0x84(%rdi)
367d0b3732eSbholler		.balign 16
368d0b3732eSbhollerL(P4QF):	mov    %rdx,-0x7c(%rdi)
369d0b3732eSbhollerL(P4QE):	mov    %rdx,-0x74(%rdi)
370d0b3732eSbhollerL(P4QD):	mov    %rdx,-0x6c(%rdi)
371d0b3732eSbhollerL(P4QC):	mov    %rdx,-0x64(%rdi)
372d0b3732eSbhollerL(P4QB):	mov    %rdx,-0x5c(%rdi)
373d0b3732eSbhollerL(P4QA):	mov    %rdx,-0x54(%rdi)
374d0b3732eSbhollerL(P4Q9):	mov    %rdx,-0x4c(%rdi)
375d0b3732eSbhollerL(P4Q8):	mov    %rdx,-0x44(%rdi)
376d0b3732eSbhollerL(P4Q7):	mov    %rdx,-0x3c(%rdi)
377d0b3732eSbhollerL(P4Q6):	mov    %rdx,-0x34(%rdi)
378d0b3732eSbhollerL(P4Q5):	mov    %rdx,-0x2c(%rdi)
379d0b3732eSbhollerL(P4Q4):	mov    %rdx,-0x24(%rdi)
380d0b3732eSbhollerL(P4Q3):	mov    %rdx,-0x1c(%rdi)
381d0b3732eSbhollerL(P4Q2):	mov    %rdx,-0x14(%rdi)
382d0b3732eSbhollerL(P4Q1):	mov    %rdx,-0xc(%rdi)
383d0b3732eSbhollerL(P4Q0):	mov    %edx,-0x4(%rdi)
3847c478bd9Sstevel@tonic-gate		ret
3857c478bd9Sstevel@tonic-gate
386d0b3732eSbholler		.balign 16
387d0b3732eSbhollerL(P5QH):	mov    %rdx,-0x8d(%rdi)
388d0b3732eSbhollerL(P5QG):	mov    %rdx,-0x85(%rdi)
389d0b3732eSbholler		.balign 16
390d0b3732eSbhollerL(P5QF):	mov    %rdx,-0x7d(%rdi)
391d0b3732eSbhollerL(P5QE):	mov    %rdx,-0x75(%rdi)
392d0b3732eSbhollerL(P5QD):	mov    %rdx,-0x6d(%rdi)
393d0b3732eSbhollerL(P5QC):	mov    %rdx,-0x65(%rdi)
394d0b3732eSbhollerL(P5QB):	mov    %rdx,-0x5d(%rdi)
395d0b3732eSbhollerL(P5QA):	mov    %rdx,-0x55(%rdi)
396d0b3732eSbhollerL(P5Q9):	mov    %rdx,-0x4d(%rdi)
397d0b3732eSbhollerL(P5Q8):	mov    %rdx,-0x45(%rdi)
398d0b3732eSbhollerL(P5Q7):	mov    %rdx,-0x3d(%rdi)
399d0b3732eSbhollerL(P5Q6):	mov    %rdx,-0x35(%rdi)
400d0b3732eSbhollerL(P5Q5):	mov    %rdx,-0x2d(%rdi)
401d0b3732eSbhollerL(P5Q4):	mov    %rdx,-0x25(%rdi)
402d0b3732eSbhollerL(P5Q3):	mov    %rdx,-0x1d(%rdi)
403d0b3732eSbhollerL(P5Q2):	mov    %rdx,-0x15(%rdi)
404d0b3732eSbhollerL(P5Q1):	mov    %rdx,-0xd(%rdi)
405d0b3732eSbhollerL(P5Q0):	mov    %edx,-0x5(%rdi)
406d0b3732eSbholler		mov    %dl,-0x1(%rdi)
4077c478bd9Sstevel@tonic-gate		ret
4087c478bd9Sstevel@tonic-gate
409d0b3732eSbholler		.balign 16
410d0b3732eSbhollerL(P6QH):	mov    %rdx,-0x8e(%rdi)
411d0b3732eSbhollerL(P6QG):	mov    %rdx,-0x86(%rdi)
412d0b3732eSbholler		.balign 16
413d0b3732eSbhollerL(P6QF):	mov    %rdx,-0x7e(%rdi)
414d0b3732eSbhollerL(P6QE):	mov    %rdx,-0x76(%rdi)
415d0b3732eSbhollerL(P6QD):	mov    %rdx,-0x6e(%rdi)
416d0b3732eSbhollerL(P6QC):	mov    %rdx,-0x66(%rdi)
417d0b3732eSbhollerL(P6QB):	mov    %rdx,-0x5e(%rdi)
418d0b3732eSbhollerL(P6QA):	mov    %rdx,-0x56(%rdi)
419d0b3732eSbhollerL(P6Q9):	mov    %rdx,-0x4e(%rdi)
420d0b3732eSbhollerL(P6Q8):	mov    %rdx,-0x46(%rdi)
421d0b3732eSbhollerL(P6Q7):	mov    %rdx,-0x3e(%rdi)
422d0b3732eSbhollerL(P6Q6):	mov    %rdx,-0x36(%rdi)
423d0b3732eSbhollerL(P6Q5):	mov    %rdx,-0x2e(%rdi)
424d0b3732eSbhollerL(P6Q4):	mov    %rdx,-0x26(%rdi)
425d0b3732eSbhollerL(P6Q3):	mov    %rdx,-0x1e(%rdi)
426d0b3732eSbhollerL(P6Q2):	mov    %rdx,-0x16(%rdi)
427d0b3732eSbhollerL(P6Q1):	mov    %rdx,-0xe(%rdi)
428d0b3732eSbhollerL(P6Q0):	mov    %edx,-0x6(%rdi)
429d0b3732eSbholler		mov    %dx,-0x2(%rdi)
430d0b3732eSbholler		ret
431d0b3732eSbholler
432d0b3732eSbholler		.balign 16
433d0b3732eSbhollerL(P7QH):	mov    %rdx,-0x8f(%rdi)
434d0b3732eSbhollerL(P7QG):	mov    %rdx,-0x87(%rdi)
435d0b3732eSbholler		.balign 16
436d0b3732eSbhollerL(P7QF):	mov    %rdx,-0x7f(%rdi)
437d0b3732eSbhollerL(P7QE):	mov    %rdx,-0x77(%rdi)
438d0b3732eSbhollerL(P7QD):	mov    %rdx,-0x6f(%rdi)
439d0b3732eSbhollerL(P7QC):	mov    %rdx,-0x67(%rdi)
440d0b3732eSbhollerL(P7QB):	mov    %rdx,-0x5f(%rdi)
441d0b3732eSbhollerL(P7QA):	mov    %rdx,-0x57(%rdi)
442d0b3732eSbhollerL(P7Q9):	mov    %rdx,-0x4f(%rdi)
443d0b3732eSbhollerL(P7Q8):	mov    %rdx,-0x47(%rdi)
444d0b3732eSbhollerL(P7Q7):	mov    %rdx,-0x3f(%rdi)
445d0b3732eSbhollerL(P7Q6):	mov    %rdx,-0x37(%rdi)
446d0b3732eSbhollerL(P7Q5):	mov    %rdx,-0x2f(%rdi)
447d0b3732eSbhollerL(P7Q4):	mov    %rdx,-0x27(%rdi)
448d0b3732eSbhollerL(P7Q3):	mov    %rdx,-0x1f(%rdi)
449d0b3732eSbhollerL(P7Q2):	mov    %rdx,-0x17(%rdi)
450d0b3732eSbhollerL(P7Q1):	mov    %rdx,-0xf(%rdi)
451d0b3732eSbhollerL(P7Q0):	mov    %edx,-0x7(%rdi)
452d0b3732eSbholler		mov    %dx,-0x3(%rdi)
453d0b3732eSbholler		mov    %dl,-0x1(%rdi)
454d0b3732eSbholler		ret
455d0b3732eSbholler
456d0b3732eSbholler		.balign 16
457d0b3732eSbhollerL(ck_align):
458d0b3732eSbholler		/*
459d0b3732eSbholler		 * Align to 16 byte boundary first
460d0b3732eSbholler		 */
461d0b3732eSbholler	 	lea    L(AliPxQx)(%rip),%r11
462d0b3732eSbholler	 	mov    $0x10,%r10
463d0b3732eSbholler	 	mov    %rdi,%r9
464d0b3732eSbholler	 	and    $0xf,%r9
465d0b3732eSbholler	 	sub    %r9,%r10
466d0b3732eSbholler	 	and    $0xf,%r10
467d0b3732eSbholler	 	add    %r10,%rdi
468d0b3732eSbholler	 	sub    %r10,%r8
469d0b3732eSbholler
470d0b3732eSbholler		movslq (%r11,%r10,4),%rcx
471d0b3732eSbholler		lea    (%rcx,%r11,1),%r11
472d0b3732eSbholler		jmpq   *%r11			# align dest to 16-byte boundary
473d0b3732eSbholler
474d0b3732eSbholler		.balign 16
475d0b3732eSbhollerL(AliPxQx):	.int	L(aligned_now)-L(AliPxQx)
476d0b3732eSbholler		.int	L(A1Q0)-L(AliPxQx)
477d0b3732eSbholler		.int	L(A2Q0)-L(AliPxQx)
478d0b3732eSbholler		.int	L(A3Q0)-L(AliPxQx)
479d0b3732eSbholler		.int	L(A4Q0)-L(AliPxQx)
480d0b3732eSbholler		.int	L(A5Q0)-L(AliPxQx)
481d0b3732eSbholler		.int	L(A6Q0)-L(AliPxQx)
482d0b3732eSbholler		.int	L(A7Q0)-L(AliPxQx)
483d0b3732eSbholler
484d0b3732eSbholler		.int	L(A0Q1)-L(AliPxQx)
485d0b3732eSbholler		.int	L(A1Q1)-L(AliPxQx)
486d0b3732eSbholler		.int	L(A2Q1)-L(AliPxQx)
487d0b3732eSbholler		.int	L(A3Q1)-L(AliPxQx)
488d0b3732eSbholler		.int	L(A4Q1)-L(AliPxQx)
489d0b3732eSbholler		.int	L(A5Q1)-L(AliPxQx)
490d0b3732eSbholler		.int	L(A6Q1)-L(AliPxQx)
491d0b3732eSbholler		.int	L(A7Q1)-L(AliPxQx)
492d0b3732eSbholler
493d0b3732eSbholler		.balign 16
494d0b3732eSbhollerL(A5Q1):	mov    %dl,-0xd(%rdi)
495d0b3732eSbhollerL(A4Q1):	mov    %edx,-0xc(%rdi)
496d0b3732eSbhollerL(A0Q1):	mov    %rdx,-0x8(%rdi)
497d0b3732eSbholler		jmp     L(aligned_now)
498d0b3732eSbholler
499d0b3732eSbholler		.balign 16
500d0b3732eSbhollerL(A1Q1):	mov    %dl,-0x9(%rdi)
501d0b3732eSbholler		mov    %rdx,-0x8(%rdi)
502d0b3732eSbholler		jmp    L(aligned_now)
503d0b3732eSbholler
504d0b3732eSbholler		.balign 16
505d0b3732eSbhollerL(A1Q0):	mov    %dl,-0x1(%rdi)
506d0b3732eSbholler		jmp    L(aligned_now)
507d0b3732eSbholler
508d0b3732eSbholler		.balign 16
509d0b3732eSbhollerL(A3Q1):	mov    %dl,-0xb(%rdi)
510d0b3732eSbhollerL(A2Q1):	mov    %dx,-0xa(%rdi)
511d0b3732eSbholler		mov    %rdx,-0x8(%rdi)
512d0b3732eSbholler		jmp    L(aligned_now)
513d0b3732eSbholler
514d0b3732eSbholler		.balign 16
515d0b3732eSbhollerL(A3Q0):	mov    %dl,-0x3(%rdi)
516d0b3732eSbhollerL(A2Q0):	mov    %dx,-0x2(%rdi)
517d0b3732eSbholler		jmp    L(aligned_now)
518d0b3732eSbholler
519d0b3732eSbholler		.balign 16
520d0b3732eSbhollerL(A5Q0):	mov    %dl,-0x5(%rdi)
521d0b3732eSbhollerL(A4Q0):	mov    %edx,-0x4(%rdi)
522d0b3732eSbholler		jmp    L(aligned_now)
523d0b3732eSbholler
524d0b3732eSbholler		.balign 16
525d0b3732eSbhollerL(A7Q1):	mov    %dl,-0xf(%rdi)
526d0b3732eSbhollerL(A6Q1):	mov    %dx,-0xe(%rdi)
527d0b3732eSbholler		mov    %edx,-0xc(%rdi)
528d0b3732eSbholler		mov    %rdx,-0x8(%rdi)
529d0b3732eSbholler		jmp    L(aligned_now)
530d0b3732eSbholler
531d0b3732eSbholler		.balign 16
532d0b3732eSbhollerL(A7Q0):	mov    %dl,-0x7(%rdi)
533d0b3732eSbhollerL(A6Q0):	mov    %dx,-0x6(%rdi)
534d0b3732eSbholler		mov    %edx,-0x4(%rdi)
535d0b3732eSbholler		#jmp    L(aligned_now)		# Fall thru...
536d0b3732eSbholler
537d0b3732eSbholler		.balign 16
538d0b3732eSbhollerL(aligned_now):
539d0b3732eSbholler		/*
540d0b3732eSbholler		 * Check memops method
541d0b3732eSbholler		 */
542d0b3732eSbholler		cmpl   $NO_SSE,.memops_method(%rip)
543d0b3732eSbholler		je     L(Loop8byte_pre)
544d0b3732eSbholler
545d0b3732eSbholler		/*
546d0b3732eSbholler		 * Use SSE2 instructions
547d0b3732eSbholler		 */
548d0b3732eSbholler	 	movd   %rdx,%xmm0
549d0b3732eSbholler		lea    L(SSExDx)(%rip),%r9	# after dest alignment
550d0b3732eSbholler	 	punpcklqdq %xmm0,%xmm0		# fill RegXMM0 with the pattern
551d0b3732eSbholler		cmp    $0xc0,%r8		# 192
552d0b3732eSbholler		jge    L(byte32sse2_pre)
553d0b3732eSbholler
554d0b3732eSbholler		add    %r8,%rdi
555d0b3732eSbholler
556d0b3732eSbholler		movslq (%r9,%r8,4),%rcx
557d0b3732eSbholler		lea    (%rcx,%r9,1),%r9
558d0b3732eSbholler		jmpq   *%r9
559d0b3732eSbholler
560d0b3732eSbholler		.balign 16
561d0b3732eSbhollerL(SSE0QB):	movdqa %xmm0,-0xb0(%rdi)
562d0b3732eSbhollerL(SSE0QA):	movdqa %xmm0,-0xa0(%rdi)
563d0b3732eSbhollerL(SSE0Q9):	movdqa %xmm0,-0x90(%rdi)
564d0b3732eSbhollerL(SSE0Q8):	movdqa %xmm0,-0x80(%rdi)
565d0b3732eSbhollerL(SSE0Q7):	movdqa %xmm0,-0x70(%rdi)
566d0b3732eSbhollerL(SSE0Q6):	movdqa %xmm0,-0x60(%rdi)
567d0b3732eSbhollerL(SSE0Q5):	movdqa %xmm0,-0x50(%rdi)
568d0b3732eSbhollerL(SSE0Q4):	movdqa %xmm0,-0x40(%rdi)
569d0b3732eSbhollerL(SSE0Q3):	movdqa %xmm0,-0x30(%rdi)
570d0b3732eSbhollerL(SSE0Q2):	movdqa %xmm0,-0x20(%rdi)
571d0b3732eSbhollerL(SSE0Q1):	movdqa %xmm0,-0x10(%rdi)
572d0b3732eSbhollerL(SSE0Q0):	ret
573d0b3732eSbholler
574d0b3732eSbholler		.balign 16
575d0b3732eSbhollerL(SSE1QB):	movdqa %xmm0,-0xb1(%rdi)
576d0b3732eSbhollerL(SSE1QA):	movdqa %xmm0,-0xa1(%rdi)
577d0b3732eSbhollerL(SSE1Q9):	movdqa %xmm0,-0x91(%rdi)
578d0b3732eSbhollerL(SSE1Q8):	movdqa %xmm0,-0x81(%rdi)
579d0b3732eSbhollerL(SSE1Q7):	movdqa %xmm0,-0x71(%rdi)
580d0b3732eSbhollerL(SSE1Q6):	movdqa %xmm0,-0x61(%rdi)
581d0b3732eSbhollerL(SSE1Q5):	movdqa %xmm0,-0x51(%rdi)
582d0b3732eSbhollerL(SSE1Q4):	movdqa %xmm0,-0x41(%rdi)
583d0b3732eSbhollerL(SSE1Q3):	movdqa %xmm0,-0x31(%rdi)
584d0b3732eSbhollerL(SSE1Q2):	movdqa %xmm0,-0x21(%rdi)
585d0b3732eSbhollerL(SSE1Q1):	movdqa %xmm0,-0x11(%rdi)
586d0b3732eSbhollerL(SSE1Q0):	mov    %dl,-0x1(%rdi)
587d0b3732eSbholler		ret
588d0b3732eSbholler
589d0b3732eSbholler		.balign 16
590d0b3732eSbhollerL(SSE2QB):	movdqa %xmm0,-0xb2(%rdi)
591d0b3732eSbhollerL(SSE2QA):	movdqa %xmm0,-0xa2(%rdi)
592d0b3732eSbhollerL(SSE2Q9):	movdqa %xmm0,-0x92(%rdi)
593d0b3732eSbhollerL(SSE2Q8):	movdqa %xmm0,-0x82(%rdi)
594d0b3732eSbhollerL(SSE2Q7):	movdqa %xmm0,-0x72(%rdi)
595d0b3732eSbhollerL(SSE2Q6):	movdqa %xmm0,-0x62(%rdi)
596d0b3732eSbhollerL(SSE2Q5):	movdqa %xmm0,-0x52(%rdi)
597d0b3732eSbhollerL(SSE2Q4):	movdqa %xmm0,-0x42(%rdi)
598d0b3732eSbhollerL(SSE2Q3):	movdqa %xmm0,-0x32(%rdi)
599d0b3732eSbhollerL(SSE2Q2):	movdqa %xmm0,-0x22(%rdi)
600d0b3732eSbhollerL(SSE2Q1):	movdqa %xmm0,-0x12(%rdi)
601d0b3732eSbhollerL(SSE2Q0):	mov    %dx,-0x2(%rdi)
602d0b3732eSbholler		ret
603d0b3732eSbholler
604d0b3732eSbholler		.balign 16
605d0b3732eSbhollerL(SSE3QB):	movdqa %xmm0,-0xb3(%rdi)
606d0b3732eSbhollerL(SSE3QA):	movdqa %xmm0,-0xa3(%rdi)
607d0b3732eSbhollerL(SSE3Q9):	movdqa %xmm0,-0x93(%rdi)
608d0b3732eSbhollerL(SSE3Q8):	movdqa %xmm0,-0x83(%rdi)
609d0b3732eSbhollerL(SSE3Q7):	movdqa %xmm0,-0x73(%rdi)
610d0b3732eSbhollerL(SSE3Q6):	movdqa %xmm0,-0x63(%rdi)
611d0b3732eSbhollerL(SSE3Q5):	movdqa %xmm0,-0x53(%rdi)
612d0b3732eSbhollerL(SSE3Q4):	movdqa %xmm0,-0x43(%rdi)
613d0b3732eSbhollerL(SSE3Q3):	movdqa %xmm0,-0x33(%rdi)
614d0b3732eSbhollerL(SSE3Q2):	movdqa %xmm0,-0x23(%rdi)
615d0b3732eSbhollerL(SSE3Q1):	movdqa %xmm0,-0x13(%rdi)
616d0b3732eSbhollerL(SSE3Q0):	mov    %dx,-0x3(%rdi)
617d0b3732eSbholler		mov    %dl,-0x1(%rdi)
618d0b3732eSbholler		ret
619d0b3732eSbholler
620d0b3732eSbholler		.balign 16
621d0b3732eSbhollerL(SSE4QB):	movdqa %xmm0,-0xb4(%rdi)
622d0b3732eSbhollerL(SSE4QA):	movdqa %xmm0,-0xa4(%rdi)
623d0b3732eSbhollerL(SSE4Q9):	movdqa %xmm0,-0x94(%rdi)
624d0b3732eSbhollerL(SSE4Q8):	movdqa %xmm0,-0x84(%rdi)
625d0b3732eSbhollerL(SSE4Q7):	movdqa %xmm0,-0x74(%rdi)
626d0b3732eSbhollerL(SSE4Q6):	movdqa %xmm0,-0x64(%rdi)
627d0b3732eSbhollerL(SSE4Q5):	movdqa %xmm0,-0x54(%rdi)
628d0b3732eSbhollerL(SSE4Q4):	movdqa %xmm0,-0x44(%rdi)
629d0b3732eSbhollerL(SSE4Q3):	movdqa %xmm0,-0x34(%rdi)
630d0b3732eSbhollerL(SSE4Q2):	movdqa %xmm0,-0x24(%rdi)
631d0b3732eSbhollerL(SSE4Q1):	movdqa %xmm0,-0x14(%rdi)
632d0b3732eSbhollerL(SSE4Q0):	mov    %edx,-0x4(%rdi)
633d0b3732eSbholler		ret
634d0b3732eSbholler
635d0b3732eSbholler		.balign 16
636d0b3732eSbhollerL(SSE5QB):	movdqa %xmm0,-0xb5(%rdi)
637d0b3732eSbhollerL(SSE5QA):	movdqa %xmm0,-0xa5(%rdi)
638d0b3732eSbhollerL(SSE5Q9):	movdqa %xmm0,-0x95(%rdi)
639d0b3732eSbhollerL(SSE5Q8):	movdqa %xmm0,-0x85(%rdi)
640d0b3732eSbhollerL(SSE5Q7):	movdqa %xmm0,-0x75(%rdi)
641d0b3732eSbhollerL(SSE5Q6):	movdqa %xmm0,-0x65(%rdi)
642d0b3732eSbhollerL(SSE5Q5):	movdqa %xmm0,-0x55(%rdi)
643d0b3732eSbhollerL(SSE5Q4):	movdqa %xmm0,-0x45(%rdi)
644d0b3732eSbhollerL(SSE5Q3):	movdqa %xmm0,-0x35(%rdi)
645d0b3732eSbhollerL(SSE5Q2):	movdqa %xmm0,-0x25(%rdi)
646d0b3732eSbhollerL(SSE5Q1):	movdqa %xmm0,-0x15(%rdi)
647d0b3732eSbhollerL(SSE5Q0):	mov    %edx,-0x5(%rdi)
648d0b3732eSbholler		mov    %dl,-0x1(%rdi)
649d0b3732eSbholler		ret
650d0b3732eSbholler
651d0b3732eSbholler		.balign 16
652d0b3732eSbhollerL(SSE6QB):	movdqa %xmm0,-0xb6(%rdi)
653d0b3732eSbhollerL(SSE6QA):	movdqa %xmm0,-0xa6(%rdi)
654d0b3732eSbhollerL(SSE6Q9):	movdqa %xmm0,-0x96(%rdi)
655d0b3732eSbhollerL(SSE6Q8):	movdqa %xmm0,-0x86(%rdi)
656d0b3732eSbhollerL(SSE6Q7):	movdqa %xmm0,-0x76(%rdi)
657d0b3732eSbhollerL(SSE6Q6):	movdqa %xmm0,-0x66(%rdi)
658d0b3732eSbhollerL(SSE6Q5):	movdqa %xmm0,-0x56(%rdi)
659d0b3732eSbhollerL(SSE6Q4):	movdqa %xmm0,-0x46(%rdi)
660d0b3732eSbhollerL(SSE6Q3):	movdqa %xmm0,-0x36(%rdi)
661d0b3732eSbhollerL(SSE6Q2):	movdqa %xmm0,-0x26(%rdi)
662d0b3732eSbhollerL(SSE6Q1):	movdqa %xmm0,-0x16(%rdi)
663d0b3732eSbhollerL(SSE6Q0):	mov    %edx,-0x6(%rdi)
664d0b3732eSbholler		mov    %dx,-0x2(%rdi)
665d0b3732eSbholler		ret
666d0b3732eSbholler
667d0b3732eSbholler		.balign 16
668d0b3732eSbhollerL(SSE7QB):	movdqa %xmm0,-0xb7(%rdi)
669d0b3732eSbhollerL(SSE7QA):	movdqa %xmm0,-0xa7(%rdi)
670d0b3732eSbhollerL(SSE7Q9):	movdqa %xmm0,-0x97(%rdi)
671d0b3732eSbhollerL(SSE7Q8):	movdqa %xmm0,-0x87(%rdi)
672d0b3732eSbhollerL(SSE7Q7):	movdqa %xmm0,-0x77(%rdi)
673d0b3732eSbhollerL(SSE7Q6):	movdqa %xmm0,-0x67(%rdi)
674d0b3732eSbhollerL(SSE7Q5):	movdqa %xmm0,-0x57(%rdi)
675d0b3732eSbhollerL(SSE7Q4):	movdqa %xmm0,-0x47(%rdi)
676d0b3732eSbhollerL(SSE7Q3):	movdqa %xmm0,-0x37(%rdi)
677d0b3732eSbhollerL(SSE7Q2):	movdqa %xmm0,-0x27(%rdi)
678d0b3732eSbhollerL(SSE7Q1):	movdqa %xmm0,-0x17(%rdi)
679d0b3732eSbhollerL(SSE7Q0):	mov    %edx,-0x7(%rdi)
680d0b3732eSbholler		mov    %dx,-0x3(%rdi)
681d0b3732eSbholler		mov    %dl,-0x1(%rdi)
682d0b3732eSbholler		ret
683d0b3732eSbholler
684d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 8 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE8Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE8Q0), which writes the
		 * last 8 bytes with a single 8-byte store of %rdx.
		 * movdqa needs (%rdi - 8) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
685d0b3732eSbhollerL(SSE8QB):	movdqa %xmm0,-0xb8(%rdi)
686d0b3732eSbhollerL(SSE8QA):	movdqa %xmm0,-0xa8(%rdi)
687d0b3732eSbhollerL(SSE8Q9):	movdqa %xmm0,-0x98(%rdi)
688d0b3732eSbhollerL(SSE8Q8):	movdqa %xmm0,-0x88(%rdi)
689d0b3732eSbhollerL(SSE8Q7):	movdqa %xmm0,-0x78(%rdi)
690d0b3732eSbhollerL(SSE8Q6):	movdqa %xmm0,-0x68(%rdi)
691d0b3732eSbhollerL(SSE8Q5):	movdqa %xmm0,-0x58(%rdi)
692d0b3732eSbhollerL(SSE8Q4):	movdqa %xmm0,-0x48(%rdi)
693d0b3732eSbhollerL(SSE8Q3):	movdqa %xmm0,-0x38(%rdi)
694d0b3732eSbhollerL(SSE8Q2):	movdqa %xmm0,-0x28(%rdi)
695d0b3732eSbhollerL(SSE8Q1):	movdqa %xmm0,-0x18(%rdi)
696d0b3732eSbhollerL(SSE8Q0):	mov    %rdx,-0x8(%rdi)
697d0b3732eSbholler		ret
698d0b3732eSbholler
699d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 9 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE9Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE9Q0), which writes the
		 * last 9 bytes as 8 (%rdx) + 1 (%dl).
		 * movdqa needs (%rdi - 9) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
700d0b3732eSbhollerL(SSE9QB):	movdqa %xmm0,-0xb9(%rdi)
701d0b3732eSbhollerL(SSE9QA):	movdqa %xmm0,-0xa9(%rdi)
702d0b3732eSbhollerL(SSE9Q9):	movdqa %xmm0,-0x99(%rdi)
703d0b3732eSbhollerL(SSE9Q8):	movdqa %xmm0,-0x89(%rdi)
704d0b3732eSbhollerL(SSE9Q7):	movdqa %xmm0,-0x79(%rdi)
705d0b3732eSbhollerL(SSE9Q6):	movdqa %xmm0,-0x69(%rdi)
706d0b3732eSbhollerL(SSE9Q5):	movdqa %xmm0,-0x59(%rdi)
707d0b3732eSbhollerL(SSE9Q4):	movdqa %xmm0,-0x49(%rdi)
708d0b3732eSbhollerL(SSE9Q3):	movdqa %xmm0,-0x39(%rdi)
709d0b3732eSbhollerL(SSE9Q2):	movdqa %xmm0,-0x29(%rdi)
710d0b3732eSbhollerL(SSE9Q1):	movdqa %xmm0,-0x19(%rdi)
711d0b3732eSbhollerL(SSE9Q0):	mov    %rdx,-0x9(%rdi)
712d0b3732eSbholler		mov    %dl,-0x1(%rdi)
713d0b3732eSbholler		ret
714d0b3732eSbholler
715d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 10 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE10Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE10Q0), which writes the
		 * last 10 bytes as 8 (%rdx) + 2 (%dx).
		 * movdqa needs (%rdi - 10) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
716d0b3732eSbhollerL(SSE10QB):	movdqa %xmm0,-0xba(%rdi)
717d0b3732eSbhollerL(SSE10QA):	movdqa %xmm0,-0xaa(%rdi)
718d0b3732eSbhollerL(SSE10Q9):	movdqa %xmm0,-0x9a(%rdi)
719d0b3732eSbhollerL(SSE10Q8):	movdqa %xmm0,-0x8a(%rdi)
720d0b3732eSbhollerL(SSE10Q7):	movdqa %xmm0,-0x7a(%rdi)
721d0b3732eSbhollerL(SSE10Q6):	movdqa %xmm0,-0x6a(%rdi)
722d0b3732eSbhollerL(SSE10Q5):	movdqa %xmm0,-0x5a(%rdi)
723d0b3732eSbhollerL(SSE10Q4):	movdqa %xmm0,-0x4a(%rdi)
724d0b3732eSbhollerL(SSE10Q3):	movdqa %xmm0,-0x3a(%rdi)
725d0b3732eSbhollerL(SSE10Q2):	movdqa %xmm0,-0x2a(%rdi)
726d0b3732eSbhollerL(SSE10Q1):	movdqa %xmm0,-0x1a(%rdi)
727d0b3732eSbhollerL(SSE10Q0):	mov    %rdx,-0xa(%rdi)
728d0b3732eSbholler		mov    %dx,-0x2(%rdi)
729d0b3732eSbholler		ret
730d0b3732eSbholler
731d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 11 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE11Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE11Q0), which writes the
		 * last 11 bytes as 8 (%rdx) + 2 (%dx) + 1 (%dl).
		 * movdqa needs (%rdi - 11) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
732d0b3732eSbhollerL(SSE11QB):	movdqa %xmm0,-0xbb(%rdi)
733d0b3732eSbhollerL(SSE11QA):	movdqa %xmm0,-0xab(%rdi)
734d0b3732eSbhollerL(SSE11Q9):	movdqa %xmm0,-0x9b(%rdi)
735d0b3732eSbhollerL(SSE11Q8):	movdqa %xmm0,-0x8b(%rdi)
736d0b3732eSbhollerL(SSE11Q7):	movdqa %xmm0,-0x7b(%rdi)
737d0b3732eSbhollerL(SSE11Q6):	movdqa %xmm0,-0x6b(%rdi)
738d0b3732eSbhollerL(SSE11Q5):	movdqa %xmm0,-0x5b(%rdi)
739d0b3732eSbhollerL(SSE11Q4):	movdqa %xmm0,-0x4b(%rdi)
740d0b3732eSbhollerL(SSE11Q3):	movdqa %xmm0,-0x3b(%rdi)
741d0b3732eSbhollerL(SSE11Q2):	movdqa %xmm0,-0x2b(%rdi)
742d0b3732eSbhollerL(SSE11Q1):	movdqa %xmm0,-0x1b(%rdi)
743d0b3732eSbhollerL(SSE11Q0):	mov    %rdx,-0xb(%rdi)
744d0b3732eSbholler		mov    %dx,-0x3(%rdi)
745d0b3732eSbholler		mov    %dl,-0x1(%rdi)
746d0b3732eSbholler		ret
747d0b3732eSbholler
748d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 12 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE12Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE12Q0), which writes the
		 * last 12 bytes as 8 (%rdx) + 4 (%edx).
		 * movdqa needs (%rdi - 12) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
749d0b3732eSbhollerL(SSE12QB):	movdqa %xmm0,-0xbc(%rdi)
750d0b3732eSbhollerL(SSE12QA):	movdqa %xmm0,-0xac(%rdi)
751d0b3732eSbhollerL(SSE12Q9):	movdqa %xmm0,-0x9c(%rdi)
752d0b3732eSbhollerL(SSE12Q8):	movdqa %xmm0,-0x8c(%rdi)
753d0b3732eSbhollerL(SSE12Q7):	movdqa %xmm0,-0x7c(%rdi)
754d0b3732eSbhollerL(SSE12Q6):	movdqa %xmm0,-0x6c(%rdi)
755d0b3732eSbhollerL(SSE12Q5):	movdqa %xmm0,-0x5c(%rdi)
756d0b3732eSbhollerL(SSE12Q4):	movdqa %xmm0,-0x4c(%rdi)
757d0b3732eSbhollerL(SSE12Q3):	movdqa %xmm0,-0x3c(%rdi)
758d0b3732eSbhollerL(SSE12Q2):	movdqa %xmm0,-0x2c(%rdi)
759d0b3732eSbhollerL(SSE12Q1):	movdqa %xmm0,-0x1c(%rdi)
760d0b3732eSbhollerL(SSE12Q0):	mov    %rdx,-0xc(%rdi)
761d0b3732eSbholler		mov    %edx,-0x4(%rdi)
762d0b3732eSbholler		ret
763d0b3732eSbholler
764d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 13 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE13Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE13Q0), which writes the
		 * last 13 bytes as 8 (%rdx) + 4 (%edx) + 1 (%dl).
		 * movdqa needs (%rdi - 13) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
765d0b3732eSbhollerL(SSE13QB):	movdqa %xmm0,-0xbd(%rdi)
766d0b3732eSbhollerL(SSE13QA):	movdqa %xmm0,-0xad(%rdi)
767d0b3732eSbhollerL(SSE13Q9):	movdqa %xmm0,-0x9d(%rdi)
768d0b3732eSbhollerL(SSE13Q8):	movdqa %xmm0,-0x8d(%rdi)
769d0b3732eSbhollerL(SSE13Q7):	movdqa %xmm0,-0x7d(%rdi)
770d0b3732eSbhollerL(SSE13Q6):	movdqa %xmm0,-0x6d(%rdi)
771d0b3732eSbhollerL(SSE13Q5):	movdqa %xmm0,-0x5d(%rdi)
772d0b3732eSbhollerL(SSE13Q4):	movdqa %xmm0,-0x4d(%rdi)
773d0b3732eSbhollerL(SSE13Q3):	movdqa %xmm0,-0x3d(%rdi)
774d0b3732eSbhollerL(SSE13Q2):	movdqa %xmm0,-0x2d(%rdi)
775d0b3732eSbhollerL(SSE13Q1):	movdqa %xmm0,-0x1d(%rdi)
776d0b3732eSbhollerL(SSE13Q0):	mov    %rdx,-0xd(%rdi)
777d0b3732eSbholler		mov    %edx,-0x5(%rdi)
778d0b3732eSbholler		mov    %dl,-0x1(%rdi)
779d0b3732eSbholler		ret
780d0b3732eSbholler
781d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 14 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE14Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE14Q0), which writes the
		 * last 14 bytes as 8 (%rdx) + 4 (%edx) + 2 (%dx).
		 * movdqa needs (%rdi - 14) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
782d0b3732eSbhollerL(SSE14QB):	movdqa %xmm0,-0xbe(%rdi)
783d0b3732eSbhollerL(SSE14QA):	movdqa %xmm0,-0xae(%rdi)
784d0b3732eSbhollerL(SSE14Q9):	movdqa %xmm0,-0x9e(%rdi)
785d0b3732eSbhollerL(SSE14Q8):	movdqa %xmm0,-0x8e(%rdi)
786d0b3732eSbhollerL(SSE14Q7):	movdqa %xmm0,-0x7e(%rdi)
787d0b3732eSbhollerL(SSE14Q6):	movdqa %xmm0,-0x6e(%rdi)
788d0b3732eSbhollerL(SSE14Q5):	movdqa %xmm0,-0x5e(%rdi)
789d0b3732eSbhollerL(SSE14Q4):	movdqa %xmm0,-0x4e(%rdi)
790d0b3732eSbhollerL(SSE14Q3):	movdqa %xmm0,-0x3e(%rdi)
791d0b3732eSbhollerL(SSE14Q2):	movdqa %xmm0,-0x2e(%rdi)
792d0b3732eSbhollerL(SSE14Q1):	movdqa %xmm0,-0x1e(%rdi)
793d0b3732eSbhollerL(SSE14Q0):	mov    %rdx,-0xe(%rdi)
794d0b3732eSbholler		mov    %edx,-0x6(%rdi)
795d0b3732eSbholler		mov    %dx,-0x2(%rdi)
796d0b3732eSbholler		ret
797d0b3732eSbholler
798d0b3732eSbholler		.balign 16
		/*
		 * Tail ladder for a length of 16*n + 15 bytes (n = 0..0xB), reached
		 * through the L(SSExDx) offset table.  All stores land in the bytes
		 * immediately below %rdi.  Entering at L(SSE15Qn) performs n movdqa
		 * stores of %xmm0 and falls through to L(SSE15Q0), which writes the
		 * last 15 bytes as 8 (%rdx) + 4 (%edx) + 2 (%dx) + 1 (%dl).
		 * movdqa needs (%rdi - 15) to be 16-byte aligned when n > 0;
		 * presumably ensured before the dispatch -- TODO confirm.
		 */
799d0b3732eSbhollerL(SSE15QB):	movdqa %xmm0,-0xbf(%rdi)
800d0b3732eSbhollerL(SSE15QA):	movdqa %xmm0,-0xaf(%rdi)
801d0b3732eSbhollerL(SSE15Q9):	movdqa %xmm0,-0x9f(%rdi)
802d0b3732eSbhollerL(SSE15Q8):	movdqa %xmm0,-0x8f(%rdi)
803d0b3732eSbhollerL(SSE15Q7):	movdqa %xmm0,-0x7f(%rdi)
804d0b3732eSbhollerL(SSE15Q6):	movdqa %xmm0,-0x6f(%rdi)
805d0b3732eSbhollerL(SSE15Q5):	movdqa %xmm0,-0x5f(%rdi)
806d0b3732eSbhollerL(SSE15Q4):	movdqa %xmm0,-0x4f(%rdi)
807d0b3732eSbhollerL(SSE15Q3):	movdqa %xmm0,-0x3f(%rdi)
808d0b3732eSbhollerL(SSE15Q2):	movdqa %xmm0,-0x2f(%rdi)
809d0b3732eSbhollerL(SSE15Q1):	movdqa %xmm0,-0x1f(%rdi)
810d0b3732eSbhollerL(SSE15Q0):	mov    %rdx,-0xf(%rdi)
811d0b3732eSbholler		mov    %edx,-0x7(%rdi)
812d0b3732eSbholler		mov    %dx,-0x3(%rdi)
813d0b3732eSbholler		mov    %dl,-0x1(%rdi)
814d0b3732eSbholler		ret
815d0b3732eSbholler
816d0b3732eSbholler		.balign 16
		/*
		 * Entry to the SSE2 bulk fill: pick the cached or the non-temporal
		 * store loop.  The 32-bit .largest_level_cache_size is loaded into
		 * %r9d (which zero-extends into %r9) and compared with the byte
		 * count in %r8.  A larger count goes to L(sse2_nt_move); anything
		 * else falls through into L(byte32sse2).
		 * NOTE(review): jg is a signed comparison, so a count with bit 63
		 * set would be treated as "not larger" -- presumably such counts
		 * never occur; confirm against the callers.
		 */
817d0b3732eSbhollerL(byte32sse2_pre):
818d0b3732eSbholler		mov    .largest_level_cache_size(%rip),%r9d
819d0b3732eSbholler		cmp    %r9,%r8
820d0b3732eSbholler		jg     L(sse2_nt_move)
821d0b3732eSbholler		#jmp    L(byte32sse2)		# Fall thru...
822d0b3732eSbholler
823d0b3732eSbholler		.balign 16
		/*
		 * Cached SSE2 fill loop.  Each pass subtracts 128 from the byte
		 * count in %r8, stores 128 bytes at %rdi with eight movdqa stores
		 * of %xmm0, and advances %rdi by 128.  The cmp is issued ahead of
		 * the stores; movdqa and lea leave the flags untouched, so the
		 * jge at the bottom still tests that compare and loops while at
		 * least 128 bytes remain (signed compare).
		 * The loop body always runs once, so this presumably expects
		 * %r8 >= 128 and %rdi 16-byte aligned on entry -- TODO confirm.
		 *
		 * After the loop the remaining count in %r8 selects a 32-bit
		 * entry in L(SSExDx); the entry is an offset relative to the
		 * table, so it is added to the table address to form the jump
		 * target.  %rdi is first moved to the end of the region because
		 * the tail code stores at negative displacements.
		 * Uses %rcx and %r11 as scratch.
		 */
824d0b3732eSbhollerL(byte32sse2):
825d0b3732eSbholler		lea    -0x80(%r8),%r8		# 128
826d0b3732eSbholler		cmp    $0x80,%r8
827d0b3732eSbholler		movdqa %xmm0,(%rdi)
828d0b3732eSbholler		movdqa %xmm0,0x10(%rdi)
829d0b3732eSbholler		movdqa %xmm0,0x20(%rdi)
830d0b3732eSbholler		movdqa %xmm0,0x30(%rdi)
831d0b3732eSbholler		movdqa %xmm0,0x40(%rdi)
832d0b3732eSbholler		movdqa %xmm0,0x50(%rdi)
833d0b3732eSbholler		movdqa %xmm0,0x60(%rdi)
834d0b3732eSbholler		movdqa %xmm0,0x70(%rdi)
835d0b3732eSbholler
836d0b3732eSbholler		lea    0x80(%rdi),%rdi
837d0b3732eSbholler		jge    L(byte32sse2)
838d0b3732eSbholler
839d0b3732eSbholler		lea    L(SSExDx)(%rip),%r11
840d0b3732eSbholler		add    %r8,%rdi
841d0b3732eSbholler		movslq (%r11,%r8,4),%rcx
842d0b3732eSbholler		lea    (%rcx,%r11,1),%r11
843d0b3732eSbholler		jmpq   *%r11
844d0b3732eSbholler
845d0b3732eSbholler		.balign	16
		/*
		 * Non-temporal SSE2 fill loop, taken from L(byte32sse2_pre) when
		 * the count exceeds .largest_level_cache_size.  Each pass
		 * subtracts 128 from %r8, stores 128 bytes at %rdi with eight
		 * movntdq stores of %xmm0, advances %rdi by 128 and repeats while
		 * at least 128 bytes remain (signed compare).
		 * movntdq requires 16-byte-aligned addresses; presumably %rdi is
		 * already aligned on entry -- TODO confirm.
		 *
		 * The sfence after the loop orders the weakly-ordered
		 * non-temporal stores ahead of the ordinary tail stores.  The
		 * remaining count in %r8 then selects the tail routine through
		 * the L(SSExDx) offset table, with %rdi moved to the end of the
		 * region first.  Uses %rcx and %r11 as scratch.
		 */
846d0b3732eSbhollerL(sse2_nt_move):
847d0b3732eSbholler		sub    $0x80,%r8		# 128
848d0b3732eSbholler		movntdq %xmm0,(%rdi)
849d0b3732eSbholler		movntdq %xmm0,0x10(%rdi)
850d0b3732eSbholler		movntdq %xmm0,0x20(%rdi)
851d0b3732eSbholler		movntdq %xmm0,0x30(%rdi)
852d0b3732eSbholler		movntdq %xmm0,0x40(%rdi)
853d0b3732eSbholler		movntdq %xmm0,0x50(%rdi)
854d0b3732eSbholler		movntdq %xmm0,0x60(%rdi)
855d0b3732eSbholler		movntdq %xmm0,0x70(%rdi)
856d0b3732eSbholler		add    $0x80,%rdi
857d0b3732eSbholler		cmp    $0x80,%r8
858d0b3732eSbholler		jge    L(sse2_nt_move)
859d0b3732eSbholler
860d0b3732eSbholler		sfence
861d0b3732eSbholler		lea    L(SSExDx)(%rip),%r11
862d0b3732eSbholler		add    %r8,%rdi
863d0b3732eSbholler		movslq (%r11,%r8,4),%rcx
864d0b3732eSbholler		lea    (%rcx,%r11,1),%r11
865d0b3732eSbholler		jmpq   *%r11
866d0b3732eSbholler
867d0b3732eSbholler		/*
868d0b3732eSbholler		 * Don't use SSE
		 *
		 * Entry to the integer-register bulk fill.  Selects one of three
		 * strategies from the byte count in %r8:
		 *   - greater than .largest_level_cache_size (32-bit value,
		 *     zero-extended into %r9): L(Loop8byte_nt_move);
		 *   - otherwise, at least 0x800 (2048) bytes: L(use_rep);
		 *   - otherwise: fall through into L(Loop8byte).
		 * Both comparisons are signed (jg / jge).
869d0b3732eSbholler		 */
870d0b3732eSbholler		.balign 16
871d0b3732eSbhollerL(Loop8byte_pre):
872d0b3732eSbholler		mov    .largest_level_cache_size(%rip),%r9d
873d0b3732eSbholler		cmp    %r9,%r8
874d0b3732eSbholler		jg     L(Loop8byte_nt_move)
875d0b3732eSbholler		cmp    $0x800,%r8		# Use rep sstoq
876d0b3732eSbholler		jge    L(use_rep)
877d0b3732eSbholler
878d0b3732eSbholler		.balign 16
		/*
		 * Cached integer fill loop.  Each pass subtracts 128 from the
		 * byte count in %r8, stores 128 bytes at %rdi with sixteen 8-byte
		 * stores of %rdx, and advances %rdi by 128.  The cmp sits in the
		 * middle of the stores; mov and lea do not modify flags, so the
		 * jge at the bottom tests that compare and loops while at least
		 * 128 bytes remain (signed compare).
		 * The loop body always runs once, so this presumably expects
		 * %r8 >= 128 on entry -- TODO confirm.
		 *
		 * Local label 1 is the shared tail dispatch (also entered from
		 * L(use_rep)): it moves %rdi to the end of the region and uses the
		 * remaining count in %r8 to pick a 32-bit table-relative offset
		 * from L(setPxQx), a table defined outside this part of the file.
		 * Uses %rcx and %r11 as scratch.
		 */
879d0b3732eSbhollerL(Loop8byte):
880d0b3732eSbholler		lea    -0x80(%r8),%r8		# 128
881d0b3732eSbholler		mov    %rdx,(%rdi)
882d0b3732eSbholler		mov    %rdx,0x8(%rdi)
883d0b3732eSbholler		mov    %rdx,0x10(%rdi)
884d0b3732eSbholler		mov    %rdx,0x18(%rdi)
885d0b3732eSbholler		mov    %rdx,0x20(%rdi)
886d0b3732eSbholler		mov    %rdx,0x28(%rdi)
887d0b3732eSbholler		mov    %rdx,0x30(%rdi)
888d0b3732eSbholler		mov    %rdx,0x38(%rdi)
889d0b3732eSbholler		cmp    $0x80,%r8
890d0b3732eSbholler		mov    %rdx,0x40(%rdi)
891d0b3732eSbholler		mov    %rdx,0x48(%rdi)
892d0b3732eSbholler		mov    %rdx,0x50(%rdi)
893d0b3732eSbholler		mov    %rdx,0x58(%rdi)
894d0b3732eSbholler		mov    %rdx,0x60(%rdi)
895d0b3732eSbholler		mov    %rdx,0x68(%rdi)
896d0b3732eSbholler		mov    %rdx,0x70(%rdi)
897d0b3732eSbholler		mov    %rdx,0x78(%rdi)
898d0b3732eSbholler		lea    0x80(%rdi),%rdi
899d0b3732eSbholler		jge    L(Loop8byte)
900d0b3732eSbholler
901d0b3732eSbholler1:
902d0b3732eSbholler		lea    L(setPxQx)(%rip),%r11
903d0b3732eSbholler		lea    (%rdi,%r8,1),%rdi
904d0b3732eSbholler
905d0b3732eSbholler		movslq (%r11,%r8,4),%rcx
906d0b3732eSbholler		lea    (%rcx,%r11,1),%r11
907d0b3732eSbholler		jmpq   *%r11
908d0b3732eSbholler
909d0b3732eSbholler		/*
910d0b3732eSbholler		 * Use rep sstoq for sizes >= 2K (0x800 bytes)
		 *
		 * Reached from L(Loop8byte_pre) when the count in %r8 is at least
		 * 0x800 and does not exceed .largest_level_cache_size.
		 * The string store takes its data from %rax and its repeat count
		 * from %rcx, so %rax and %rdx are swapped around it: the 8-byte
		 * fill pattern moves from %rdx into %rax while the previous %rax
		 * is parked in %rdx, and the second xchg puts both back.
		 * %rcx is the number of whole quadwords (%r8 >> 3).  The string
		 * store advances %rdi past the filled area (assuming the
		 * direction flag is clear, as the ABI requires).
		 * xchg leaves the flags alone, so jnz tests the result of the
		 * andq: with 1-7 leftover bytes control goes back to local label
		 * 1 (the L(setPxQx) tail dispatch); with none, return.
		 * NOTE(review): sstoq is presumably the Solaris assembler's
		 * spelling of stosq -- confirm in the assembler reference.
911d0b3732eSbholler		 */
912d0b3732eSbholler		.balign 16
913d0b3732eSbhollerL(use_rep):
914d0b3732eSbholler		movq   %r8,%rcx			# get size in bytes
915d0b3732eSbholler		xchg   %rax,%rdx
916d0b3732eSbholler		shrq   $3,%rcx
917d0b3732eSbholler		rep
918d0b3732eSbholler		  sstoq
919d0b3732eSbholler		xchg   %rax,%rdx
920d0b3732eSbholler		andq   $7,%r8			# remaining bytes
921d0b3732eSbholler		jnz    1b
922d0b3732eSbholler		ret
923d0b3732eSbholler
924d0b3732eSbholler		.balign 16
		/*
		 * Non-temporal integer fill loop, taken from L(Loop8byte_pre)
		 * when the count exceeds .largest_level_cache_size.  It has the
		 * same shape as L(Loop8byte), but uses movnti: each pass subtracts
		 * 128 from %r8, issues sixteen 8-byte movnti stores of %rdx at
		 * %rdi, and advances %rdi by 128.  lea and movnti leave the flags
		 * untouched, so the jge at the bottom tests the cmp placed in the
		 * middle of the stores and loops while at least 128 bytes remain
		 * (signed compare).
		 *
		 * The sfence after the loop orders the weakly-ordered
		 * non-temporal stores ahead of the ordinary tail stores.  The
		 * remaining count in %r8 then selects the tail routine through
		 * L(setPxQx) (a table defined outside this part of the file),
		 * with %rdi moved to the end of the region first.
		 * Uses %rcx and %r11 as scratch.
		 */
925d0b3732eSbhollerL(Loop8byte_nt_move):
926*fad5204eSbostrovs		lea    -0x80(%r8),%r8		# 128
927d0b3732eSbholler		movnti %rdx,(%rdi)
928d0b3732eSbholler		movnti %rdx,0x8(%rdi)
929d0b3732eSbholler		movnti %rdx,0x10(%rdi)
930d0b3732eSbholler		movnti %rdx,0x18(%rdi)
931d0b3732eSbholler		movnti %rdx,0x20(%rdi)
932d0b3732eSbholler		movnti %rdx,0x28(%rdi)
933d0b3732eSbholler		movnti %rdx,0x30(%rdi)
934d0b3732eSbholler		movnti %rdx,0x38(%rdi)
935*fad5204eSbostrovs		cmp    $0x80,%r8
936*fad5204eSbostrovs		movnti %rdx,0x40(%rdi)
937*fad5204eSbostrovs		movnti %rdx,0x48(%rdi)
938*fad5204eSbostrovs		movnti %rdx,0x50(%rdi)
939*fad5204eSbostrovs		movnti %rdx,0x58(%rdi)
940*fad5204eSbostrovs		movnti %rdx,0x60(%rdi)
941*fad5204eSbostrovs		movnti %rdx,0x68(%rdi)
942*fad5204eSbostrovs		movnti %rdx,0x70(%rdi)
943*fad5204eSbostrovs		movnti %rdx,0x78(%rdi)
944*fad5204eSbostrovs		lea    0x80(%rdi),%rdi
945d0b3732eSbholler		jge    L(Loop8byte_nt_move)
946d0b3732eSbholler
947d0b3732eSbholler		sfence
948d0b3732eSbholler		lea    L(setPxQx)(%rip),%r11
949d0b3732eSbholler		lea    (%rdi,%r8,1),%rdi
950d0b3732eSbholler
951d0b3732eSbholler		movslq    (%r11,%r8,4),%rcx
952d0b3732eSbholler		lea    (%rcx,%r11,1),%r11
953d0b3732eSbholler		jmpq   *%r11
954d0b3732eSbholler
955d0b3732eSbholler		.balign 16
		/*
		 * Dispatch table for the SSE tail routines: 192 32-bit entries,
		 * each the distance from L(SSExDx) to one L(SSE<x>Q<q>) label.
		 * Entries come in 12 groups of 16; the group is the Q digit
		 * (0..B, number of 16-byte movdqa stores) and the position within
		 * the group is x (0..15, leftover bytes), so entry number
		 * 16*q + x names L(SSE<x>Q<q>).  The dispatch code indexes the
		 * table with the remaining byte count scaled by 4, sign-extends
		 * the entry with movslq, adds the table address and jumps there.
		 * Storing table-relative offsets rather than absolute addresses
		 * keeps the table free of load-time relocations.
		 */
956d0b3732eSbhollerL(SSExDx):	.int       L(SSE0Q0) -L(SSExDx)
957d0b3732eSbholler		.int       L(SSE1Q0) -L(SSExDx)
958d0b3732eSbholler		.int       L(SSE2Q0) -L(SSExDx)
959d0b3732eSbholler		.int       L(SSE3Q0) -L(SSExDx)
960d0b3732eSbholler		.int       L(SSE4Q0) -L(SSExDx)
961d0b3732eSbholler		.int       L(SSE5Q0) -L(SSExDx)
962d0b3732eSbholler		.int       L(SSE6Q0) -L(SSExDx)
963d0b3732eSbholler		.int       L(SSE7Q0) -L(SSExDx)
964d0b3732eSbholler
965d0b3732eSbholler		.int       L(SSE8Q0) -L(SSExDx)
966d0b3732eSbholler		.int       L(SSE9Q0) -L(SSExDx)
967d0b3732eSbholler		.int       L(SSE10Q0)-L(SSExDx)
968d0b3732eSbholler		.int       L(SSE11Q0)-L(SSExDx)
969d0b3732eSbholler		.int       L(SSE12Q0)-L(SSExDx)
970d0b3732eSbholler		.int       L(SSE13Q0)-L(SSExDx)
971d0b3732eSbholler		.int       L(SSE14Q0)-L(SSExDx)
972d0b3732eSbholler		.int       L(SSE15Q0)-L(SSExDx)
973d0b3732eSbholler
974d0b3732eSbholler		.int       L(SSE0Q1) -L(SSExDx)
975d0b3732eSbholler		.int       L(SSE1Q1) -L(SSExDx)
976d0b3732eSbholler		.int       L(SSE2Q1) -L(SSExDx)
977d0b3732eSbholler		.int       L(SSE3Q1) -L(SSExDx)
978d0b3732eSbholler		.int       L(SSE4Q1) -L(SSExDx)
979d0b3732eSbholler		.int       L(SSE5Q1) -L(SSExDx)
980d0b3732eSbholler		.int       L(SSE6Q1) -L(SSExDx)
981d0b3732eSbholler		.int       L(SSE7Q1) -L(SSExDx)
982d0b3732eSbholler
983d0b3732eSbholler		.int       L(SSE8Q1) -L(SSExDx)
984d0b3732eSbholler		.int       L(SSE9Q1) -L(SSExDx)
985d0b3732eSbholler		.int       L(SSE10Q1)-L(SSExDx)
986d0b3732eSbholler		.int       L(SSE11Q1)-L(SSExDx)
987d0b3732eSbholler		.int       L(SSE12Q1)-L(SSExDx)
988d0b3732eSbholler		.int       L(SSE13Q1)-L(SSExDx)
989d0b3732eSbholler		.int       L(SSE14Q1)-L(SSExDx)
990d0b3732eSbholler		.int       L(SSE15Q1)-L(SSExDx)
991d0b3732eSbholler
992d0b3732eSbholler		.int       L(SSE0Q2) -L(SSExDx)
993d0b3732eSbholler		.int       L(SSE1Q2) -L(SSExDx)
994d0b3732eSbholler		.int       L(SSE2Q2) -L(SSExDx)
995d0b3732eSbholler		.int       L(SSE3Q2) -L(SSExDx)
996d0b3732eSbholler		.int       L(SSE4Q2) -L(SSExDx)
997d0b3732eSbholler		.int       L(SSE5Q2) -L(SSExDx)
998d0b3732eSbholler		.int       L(SSE6Q2) -L(SSExDx)
999d0b3732eSbholler		.int       L(SSE7Q2) -L(SSExDx)
1000d0b3732eSbholler
1001d0b3732eSbholler		.int       L(SSE8Q2) -L(SSExDx)
1002d0b3732eSbholler		.int       L(SSE9Q2) -L(SSExDx)
1003d0b3732eSbholler		.int       L(SSE10Q2)-L(SSExDx)
1004d0b3732eSbholler		.int       L(SSE11Q2)-L(SSExDx)
1005d0b3732eSbholler		.int       L(SSE12Q2)-L(SSExDx)
1006d0b3732eSbholler		.int       L(SSE13Q2)-L(SSExDx)
1007d0b3732eSbholler		.int       L(SSE14Q2)-L(SSExDx)
1008d0b3732eSbholler		.int       L(SSE15Q2)-L(SSExDx)
1009d0b3732eSbholler
1010d0b3732eSbholler		.int       L(SSE0Q3) -L(SSExDx)
1011d0b3732eSbholler		.int       L(SSE1Q3) -L(SSExDx)
1012d0b3732eSbholler		.int       L(SSE2Q3) -L(SSExDx)
1013d0b3732eSbholler		.int       L(SSE3Q3) -L(SSExDx)
1014d0b3732eSbholler		.int       L(SSE4Q3) -L(SSExDx)
1015d0b3732eSbholler		.int       L(SSE5Q3) -L(SSExDx)
1016d0b3732eSbholler		.int       L(SSE6Q3) -L(SSExDx)
1017d0b3732eSbholler		.int       L(SSE7Q3) -L(SSExDx)
1018d0b3732eSbholler
1019d0b3732eSbholler		.int       L(SSE8Q3) -L(SSExDx)
1020d0b3732eSbholler		.int       L(SSE9Q3) -L(SSExDx)
1021d0b3732eSbholler		.int       L(SSE10Q3)-L(SSExDx)
1022d0b3732eSbholler		.int       L(SSE11Q3)-L(SSExDx)
1023d0b3732eSbholler		.int       L(SSE12Q3)-L(SSExDx)
1024d0b3732eSbholler		.int       L(SSE13Q3)-L(SSExDx)
1025d0b3732eSbholler		.int       L(SSE14Q3)-L(SSExDx)
1026d0b3732eSbholler		.int       L(SSE15Q3)-L(SSExDx)
1027d0b3732eSbholler
1028d0b3732eSbholler		.int       L(SSE0Q4) -L(SSExDx)
1029d0b3732eSbholler		.int       L(SSE1Q4) -L(SSExDx)
1030d0b3732eSbholler		.int       L(SSE2Q4) -L(SSExDx)
1031d0b3732eSbholler		.int       L(SSE3Q4) -L(SSExDx)
1032d0b3732eSbholler		.int       L(SSE4Q4) -L(SSExDx)
1033d0b3732eSbholler		.int       L(SSE5Q4) -L(SSExDx)
1034d0b3732eSbholler		.int       L(SSE6Q4) -L(SSExDx)
1035d0b3732eSbholler		.int       L(SSE7Q4) -L(SSExDx)
1036d0b3732eSbholler
1037d0b3732eSbholler		.int       L(SSE8Q4) -L(SSExDx)
1038d0b3732eSbholler		.int       L(SSE9Q4) -L(SSExDx)
1039d0b3732eSbholler		.int       L(SSE10Q4)-L(SSExDx)
1040d0b3732eSbholler		.int       L(SSE11Q4)-L(SSExDx)
1041d0b3732eSbholler		.int       L(SSE12Q4)-L(SSExDx)
1042d0b3732eSbholler		.int       L(SSE13Q4)-L(SSExDx)
1043d0b3732eSbholler		.int       L(SSE14Q4)-L(SSExDx)
1044d0b3732eSbholler		.int       L(SSE15Q4)-L(SSExDx)
1045d0b3732eSbholler
1046d0b3732eSbholler		.int       L(SSE0Q5) -L(SSExDx)
1047d0b3732eSbholler		.int       L(SSE1Q5) -L(SSExDx)
1048d0b3732eSbholler		.int       L(SSE2Q5) -L(SSExDx)
1049d0b3732eSbholler		.int       L(SSE3Q5) -L(SSExDx)
1050d0b3732eSbholler		.int       L(SSE4Q5) -L(SSExDx)
1051d0b3732eSbholler		.int       L(SSE5Q5) -L(SSExDx)
1052d0b3732eSbholler		.int       L(SSE6Q5) -L(SSExDx)
1053d0b3732eSbholler		.int       L(SSE7Q5) -L(SSExDx)
1054d0b3732eSbholler
1055d0b3732eSbholler		.int       L(SSE8Q5) -L(SSExDx)
1056d0b3732eSbholler		.int       L(SSE9Q5) -L(SSExDx)
1057d0b3732eSbholler		.int       L(SSE10Q5)-L(SSExDx)
1058d0b3732eSbholler		.int       L(SSE11Q5)-L(SSExDx)
1059d0b3732eSbholler		.int       L(SSE12Q5)-L(SSExDx)
1060d0b3732eSbholler		.int       L(SSE13Q5)-L(SSExDx)
1061d0b3732eSbholler		.int       L(SSE14Q5)-L(SSExDx)
1062d0b3732eSbholler		.int       L(SSE15Q5)-L(SSExDx)
1063d0b3732eSbholler
1064d0b3732eSbholler		.int       L(SSE0Q6) -L(SSExDx)
1065d0b3732eSbholler		.int       L(SSE1Q6) -L(SSExDx)
1066d0b3732eSbholler		.int       L(SSE2Q6) -L(SSExDx)
1067d0b3732eSbholler		.int       L(SSE3Q6) -L(SSExDx)
1068d0b3732eSbholler		.int       L(SSE4Q6) -L(SSExDx)
1069d0b3732eSbholler		.int       L(SSE5Q6) -L(SSExDx)
1070d0b3732eSbholler		.int       L(SSE6Q6) -L(SSExDx)
1071d0b3732eSbholler		.int       L(SSE7Q6) -L(SSExDx)
1072d0b3732eSbholler
1073d0b3732eSbholler		.int       L(SSE8Q6) -L(SSExDx)
1074d0b3732eSbholler		.int       L(SSE9Q6) -L(SSExDx)
1075d0b3732eSbholler		.int       L(SSE10Q6)-L(SSExDx)
1076d0b3732eSbholler		.int       L(SSE11Q6)-L(SSExDx)
1077d0b3732eSbholler		.int       L(SSE12Q6)-L(SSExDx)
1078d0b3732eSbholler		.int       L(SSE13Q6)-L(SSExDx)
1079d0b3732eSbholler		.int       L(SSE14Q6)-L(SSExDx)
1080d0b3732eSbholler		.int       L(SSE15Q6)-L(SSExDx)
1081d0b3732eSbholler
1082d0b3732eSbholler		.int       L(SSE0Q7) -L(SSExDx)
1083d0b3732eSbholler		.int       L(SSE1Q7) -L(SSExDx)
1084d0b3732eSbholler		.int       L(SSE2Q7) -L(SSExDx)
1085d0b3732eSbholler		.int       L(SSE3Q7) -L(SSExDx)
1086d0b3732eSbholler		.int       L(SSE4Q7) -L(SSExDx)
1087d0b3732eSbholler		.int       L(SSE5Q7) -L(SSExDx)
1088d0b3732eSbholler		.int       L(SSE6Q7) -L(SSExDx)
1089d0b3732eSbholler		.int       L(SSE7Q7) -L(SSExDx)
1090d0b3732eSbholler
1091d0b3732eSbholler		.int       L(SSE8Q7) -L(SSExDx)
1092d0b3732eSbholler		.int       L(SSE9Q7) -L(SSExDx)
1093d0b3732eSbholler		.int       L(SSE10Q7)-L(SSExDx)
1094d0b3732eSbholler		.int       L(SSE11Q7)-L(SSExDx)
1095d0b3732eSbholler		.int       L(SSE12Q7)-L(SSExDx)
1096d0b3732eSbholler		.int       L(SSE13Q7)-L(SSExDx)
1097d0b3732eSbholler		.int       L(SSE14Q7)-L(SSExDx)
1098d0b3732eSbholler		.int       L(SSE15Q7)-L(SSExDx)
1099d0b3732eSbholler
1100d0b3732eSbholler		.int       L(SSE0Q8) -L(SSExDx)
1101d0b3732eSbholler		.int       L(SSE1Q8) -L(SSExDx)
1102d0b3732eSbholler		.int       L(SSE2Q8) -L(SSExDx)
1103d0b3732eSbholler		.int       L(SSE3Q8) -L(SSExDx)
1104d0b3732eSbholler		.int       L(SSE4Q8) -L(SSExDx)
1105d0b3732eSbholler		.int       L(SSE5Q8) -L(SSExDx)
1106d0b3732eSbholler		.int       L(SSE6Q8) -L(SSExDx)
1107d0b3732eSbholler		.int       L(SSE7Q8) -L(SSExDx)
1108d0b3732eSbholler
1109d0b3732eSbholler		.int       L(SSE8Q8) -L(SSExDx)
1110d0b3732eSbholler		.int       L(SSE9Q8) -L(SSExDx)
1111d0b3732eSbholler		.int       L(SSE10Q8)-L(SSExDx)
1112d0b3732eSbholler		.int       L(SSE11Q8)-L(SSExDx)
1113d0b3732eSbholler		.int       L(SSE12Q8)-L(SSExDx)
1114d0b3732eSbholler		.int       L(SSE13Q8)-L(SSExDx)
1115d0b3732eSbholler		.int       L(SSE14Q8)-L(SSExDx)
1116d0b3732eSbholler		.int       L(SSE15Q8)-L(SSExDx)
1117d0b3732eSbholler
1118d0b3732eSbholler		.int       L(SSE0Q9) -L(SSExDx)
1119d0b3732eSbholler		.int       L(SSE1Q9) -L(SSExDx)
1120d0b3732eSbholler		.int       L(SSE2Q9) -L(SSExDx)
1121d0b3732eSbholler		.int       L(SSE3Q9) -L(SSExDx)
1122d0b3732eSbholler		.int       L(SSE4Q9) -L(SSExDx)
1123d0b3732eSbholler		.int       L(SSE5Q9) -L(SSExDx)
1124d0b3732eSbholler		.int       L(SSE6Q9) -L(SSExDx)
1125d0b3732eSbholler		.int       L(SSE7Q9) -L(SSExDx)
1126d0b3732eSbholler
1127d0b3732eSbholler		.int       L(SSE8Q9) -L(SSExDx)
1128d0b3732eSbholler		.int       L(SSE9Q9) -L(SSExDx)
1129d0b3732eSbholler		.int       L(SSE10Q9)-L(SSExDx)
1130d0b3732eSbholler		.int       L(SSE11Q9)-L(SSExDx)
1131d0b3732eSbholler		.int       L(SSE12Q9)-L(SSExDx)
1132d0b3732eSbholler		.int       L(SSE13Q9)-L(SSExDx)
1133d0b3732eSbholler		.int       L(SSE14Q9)-L(SSExDx)
1134d0b3732eSbholler		.int       L(SSE15Q9)-L(SSExDx)
1135d0b3732eSbholler
1136d0b3732eSbholler		.int       L(SSE0QA) -L(SSExDx)
1137d0b3732eSbholler		.int       L(SSE1QA) -L(SSExDx)
1138d0b3732eSbholler		.int       L(SSE2QA) -L(SSExDx)
1139d0b3732eSbholler		.int       L(SSE3QA) -L(SSExDx)
1140d0b3732eSbholler		.int       L(SSE4QA) -L(SSExDx)
1141d0b3732eSbholler		.int       L(SSE5QA) -L(SSExDx)
1142d0b3732eSbholler		.int       L(SSE6QA) -L(SSExDx)
1143d0b3732eSbholler		.int       L(SSE7QA) -L(SSExDx)
1144d0b3732eSbholler
1145d0b3732eSbholler		.int       L(SSE8QA) -L(SSExDx)
1146d0b3732eSbholler		.int       L(SSE9QA) -L(SSExDx)
1147d0b3732eSbholler		.int       L(SSE10QA)-L(SSExDx)
1148d0b3732eSbholler		.int       L(SSE11QA)-L(SSExDx)
1149d0b3732eSbholler		.int       L(SSE12QA)-L(SSExDx)
1150d0b3732eSbholler		.int       L(SSE13QA)-L(SSExDx)
1151d0b3732eSbholler		.int       L(SSE14QA)-L(SSExDx)
1152d0b3732eSbholler		.int       L(SSE15QA)-L(SSExDx)
1153d0b3732eSbholler
1154d0b3732eSbholler		.int       L(SSE0QB) -L(SSExDx)
1155d0b3732eSbholler		.int       L(SSE1QB) -L(SSExDx)
1156d0b3732eSbholler		.int       L(SSE2QB) -L(SSExDx)
1157d0b3732eSbholler		.int       L(SSE3QB) -L(SSExDx)
1158d0b3732eSbholler		.int       L(SSE4QB) -L(SSExDx)
1159d0b3732eSbholler		.int       L(SSE5QB) -L(SSExDx)
1160d0b3732eSbholler		.int       L(SSE6QB) -L(SSExDx)
1161d0b3732eSbholler		.int       L(SSE7QB) -L(SSExDx)
1162d0b3732eSbholler
1163d0b3732eSbholler		.int       L(SSE8QB) -L(SSExDx)
1164d0b3732eSbholler		.int       L(SSE9QB) -L(SSExDx)
1165d0b3732eSbholler		.int       L(SSE10QB)-L(SSExDx)
1166d0b3732eSbholler		.int       L(SSE11QB)-L(SSExDx)
1167d0b3732eSbholler		.int       L(SSE12QB)-L(SSExDx)
1168d0b3732eSbholler		.int       L(SSE13QB)-L(SSExDx)
1169d0b3732eSbholler		.int       L(SSE14QB)-L(SSExDx)
1170d0b3732eSbholler		.int       L(SSE15QB)-L(SSExDx)
1171d0b3732eSbholler
11727c478bd9Sstevel@tonic-gate		SET_SIZE(memset)
1173