xref: /titanic_44/usr/src/uts/intel/ia32/ml/copy.s (revision 49b225e1cfa7bbf7738d4df0a03f18e3283426eb)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright (c) 2009, Intel Corporation
28 * All rights reserved.
29 */
30
31/*       Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
32/*       Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
33/*         All Rights Reserved						*/
34
35/*       Copyright (c) 1987, 1988 Microsoft Corporation			*/
36/*         All Rights Reserved						*/
37
38#include <sys/errno.h>
39#include <sys/asm_linkage.h>
40
41#if defined(__lint)
42#include <sys/types.h>
43#include <sys/systm.h>
44#else	/* __lint */
45#include "assym.h"
46#endif	/* __lint */
47
/*
 * Size and alignment limits for the non-temporal copy path.  kcopy_nta
 * compares the byte count against KCOPY_MIN_SIZE and falls back to the
 * ordinary copy when the count is smaller.  XCOPY_MIN_SIZE is not used in
 * the portion of the file visible here -- presumably the equivalent limit
 * for the user-copy routines; confirm against its users.
 */
48#define	KCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
49#define	XCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
50/*
51 * Non-temporal access (NTA) alignment requirement
52 */
53#define	NTA_ALIGN_SIZE	4	/* Must be at least 4-byte aligned */
54#define	NTA_ALIGN_MASK	_CONST(NTA_ALIGN_SIZE-1)
/* The byte count must be a multiple of COUNT_ALIGN_SIZE for the NTA loop. */
55#define	COUNT_ALIGN_SIZE	16	/* Must be at least 16-byte aligned */
56#define	COUNT_ALIGN_MASK	_CONST(COUNT_ALIGN_SIZE-1)
57
58/*
59 * The optimal 64-bit bcopy and kcopy for modern x86 processors use
60 * "rep smovq" for large sizes. Performance data shows that many calls to
61 * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance for
62 * these small sizes unrolled code is used. For medium sizes loops writing
63 * 64-bytes per loop are used. Transition points were determined experimentally.
64 */
/*
 * BCOPY_DFLT_REP is the byte count compared at bcopy_ck_size to choose
 * "rep smovq"; BCOPY_NHM_REP is the immediate of the template compare between
 * bcopy_patch_start and bcopy_patch_end.  BZERO_USE_REP is presumably the
 * matching threshold for bzero -- its use lies outside the code seen here.
 */
65#define BZERO_USE_REP	(1024)
66#define BCOPY_DFLT_REP	(128)
67#define	BCOPY_NHM_REP	(768)
68
69/*
70 * Copy a block of storage, returning an error code if `from' or
71 * `to' takes a kernel pagefault which cannot be resolved.
72 * Returns errno value on pagefault error, 0 if all ok
73 */
74
75#if defined(__lint)
76
/*
 * Lint-only stand-in for kcopy(); the real implementation is the assembly
 * that follows.  It touches none of its arguments and reports success.
 */
/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{
	return (0);
}
81
82#else	/* __lint */
83
84	.globl	kernelbase
85	.globl	postbootkernelbase
86
87#if defined(__amd64)
88
/*
 * kcopy(from, to, count) -- amd64
 *	%rdi = from, %rsi = to, %rdx = count
 *
 * Installs _kcopy_copyerr as the thread's lofault handler, runs the copy
 * through bcopy_altentry and puts the previous handler back.  Returns 0 on
 * success; after a fault the trap handler resumes at _kcopy_copyerr with the
 * errno value already in %rax.
 *
 * %r9 (thread pointer) and %r11 (saved lofault) stay live across the call,
 * so bcopy_altentry must leave both untouched.
 */
	ENTRY(kcopy)
	pushq	%rbp
	movq	%rsp, %rbp		/* frame is torn down by `leave' */
#ifdef DEBUG
	/* Panic unless both addresses are at or above postbootkernelbase. */
	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
	jb	8f
	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
	jnb	9f
8:	leaq	.kcopy_panic_msg(%rip), %rdi
	xorl	%eax, %eax		/* variadic call: no vector args */
	call	panic
9:
#endif
	movq	%gs:CPU_THREAD, %r9	/* %r9 = current thread */
	leaq	_kcopy_copyerr(%rip), %rcx	/* %rcx = lofault to install */

	/*
	 * Shared entry: expects %r9 = thread, %rcx = lofault handler and an
	 * %rbp frame already pushed.
	 */
do_copy_fault:
	movq	T_LOFAULT(%r9), %r11	/* remember the caller's lofault */
	movq	%rcx, T_LOFAULT(%r9)	/* arm ours */
	call	bcopy_altentry
	xorl	%eax, %eax		/* copy completed: return 0 */

	/*
	 * Fault landing pad; also the fall-through exit.  `leave' discards
	 * whatever the interrupted copy left on the stack.
	 */
_kcopy_copyerr:
	movq	%r11, T_LOFAULT(%r9)	/* put the caller's lofault back */
	leave
	ret
	SET_SIZE(kcopy)
123
124#elif defined(__i386)
125
/*
 * kcopy(from, to, count) -- i386
 *
 * Argument offsets are relative to %ebp once the frame has been pushed.
 * Returns 0 in %eax on success; on a fault the trap handler resumes at
 * _kcopy_copyerr with the errno value in %eax.
 */
#define	ARG_FROM	8
#define	ARG_TO		12
#define	ARG_COUNT	16

	ENTRY(kcopy)
#ifdef DEBUG
	/* Temporary frame, only to reach the arguments for the sanity check. */
	pushl	%ebp
	movl	%esp, %ebp
	movl	postbootkernelbase, %eax
	cmpl	%eax, ARG_FROM(%ebp)
	jb	8f
	cmpl	%eax, ARG_TO(%ebp)
	jnb	9f
8:	pushl	$.kcopy_panic_msg
	call	panic
9:	popl	%ebp
#endif
	movl	%gs:CPU_THREAD, %edx	/* %edx = current thread */
	lea	_kcopy_copyerr, %eax	/* %eax = lofault to install */

	/*
	 * Shared entry: expects %edx = thread and %eax = lofault handler.
	 */
do_copy_fault:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi			/* callee-saved; used by the string ops */
	pushl	%edi

	movl	T_LOFAULT(%edx), %edi
	pushl	%edi			/* old lofault lives on the stack */
	movl	%eax, T_LOFAULT(%edx)	/* arm ours */

	movl	ARG_FROM(%ebp), %esi
	movl	ARG_TO(%ebp), %edi
	movl	ARG_COUNT(%ebp), %ecx
	shrl	$2, %ecx		/* whole 32-bit words first */
	rep
	  smovl
	movl	ARG_COUNT(%ebp), %ecx
	andl	$3, %ecx		/* then the 0-3 trailing bytes */
	rep
	  smovb
	xorl	%eax, %eax		/* success */

	/*
	 * Fault landing pad and normal exit: unwind in the reverse order of
	 * the pushes above.
	 */
_kcopy_copyerr:
	popl	%ecx			/* saved lofault */
	movl	%ecx, T_LOFAULT(%edx)
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
	SET_SIZE(kcopy)

#undef	ARG_FROM
#undef	ARG_TO
#undef	ARG_COUNT
184
185#endif	/* __i386 */
186#endif	/* __lint */
187
188#if defined(__lint)
189
190/*
191 * Copy a block of storage.  Similar to kcopy but uses non-temporal
192 * instructions.
193 */
194
/*
 * Lint-only stand-in for kcopy_nta(); the real implementation is in
 * assembly.  Ignores every argument and reports success.
 */
/* ARGSUSED */
int
kcopy_nta(const void *from, void *to, size_t count, int copy_cached)
{
	return (0);
}
199
200#else	/* __lint */
201
202#if defined(__amd64)
203
/*
 * kcopy_nta(from, to, count, copy_cached) -- amd64
 *	%rdi = from, %rsi = to, %rdx = count, %ecx = copy_cached (an int)
 *
 * Like kcopy, but copies with non-temporal stores when copy_cached is zero,
 * count >= KCOPY_MIN_SIZE, both pointers are NTA_ALIGN_SIZE aligned and
 * count is a multiple of COUNT_ALIGN_SIZE.  Otherwise the request is handed
 * to do_copy_fault for an ordinary copy.  Returns 0, or the errno value left
 * in %rax when a fault resumes at _kcopy_nta_copyerr.
 */

/*
 * Point src/dst one past the end of their buffers and turn cnt into a
 * negative quadword index that counts up towards zero.
 */
#define	COPY_LOOP_INIT(src, dst, cnt)	\
	addq	cnt, src;			\
	addq	cnt, dst;			\
	shrq	$3, cnt;			\
	neg	cnt

	/*
	 * Copy 16 bytes per loop.  Uses %rax and %r8.  The final addq leaves
	 * ZF set once the index reaches zero.
	 */
#define	COPY_LOOP_BODY(src, dst, cnt)	\
	prefetchnta	0x100(src, cnt, 8);	\
	movq	(src, cnt, 8), %rax;		\
	movq	0x8(src, cnt, 8), %r8;		\
	movnti	%rax, (dst, cnt, 8);		\
	movnti	%r8, 0x8(dst, cnt, 8);		\
	addq	$2, cnt

	ENTRY(kcopy_nta)
	pushq	%rbp
	movq	%rsp, %rbp
#ifdef DEBUG
	cmpq	postbootkernelbase(%rip), %rdi 		/* %rdi = from */
	jb	0f
	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
	jnb	1f
0:	leaq	.kcopy_panic_msg(%rip), %rdi
	xorl	%eax, %eax
	call	panic
1:
#endif

	movq	%gs:CPU_THREAD, %r9
	/*
	 * copy_cached is a 32-bit int, so only %ecx is defined by the calling
	 * convention; the upper half of %rcx may hold anything.  Test the low
	 * 32 bits only.
	 */
	testl	%ecx, %ecx		/* No non-temporal access? */
	/*
	 * pass lofault value as 4th argument to do_copy_fault
	 */
	leaq	_kcopy_nta_copyerr(%rip), %rcx	/* doesn't set rflags */
	jnz	do_copy_fault		/* use regular access */
	/*
	 * Make sure cnt is >= KCOPY_MIN_SIZE
	 */
	cmpq	$KCOPY_MIN_SIZE, %rdx
	jb	do_copy_fault

	/*
	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
	 * count is COUNT_ALIGN_SIZE aligned.
	 */
	movq	%rdi, %r10
	orq	%rsi, %r10
	andq	$NTA_ALIGN_MASK, %r10
	orq	%rdx, %r10
	andq	$COUNT_ALIGN_MASK, %r10
	jnz	do_copy_fault

	/*
	 * Alternate entry: expects %rcx = lofault handler and an %rbp frame
	 * already established, because the exit path below executes `leave'.
	 */
	ALTENTRY(do_copy_fault_nta)
	movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
	movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
	movq    %rcx, T_LOFAULT(%r9)    /* new lofault */

	/*
	 * COPY_LOOP_BODY uses %rax and %r8
	 */
	COPY_LOOP_INIT(%rdi, %rsi, %rdx)
2:	COPY_LOOP_BODY(%rdi, %rsi, %rdx)
	jnz	2b

	mfence				/* order the non-temporal stores */
	xorl	%eax, %eax		/* return 0 (success) */

	/* Fault landing pad (errno already in %rax) and normal exit. */
_kcopy_nta_copyerr:
	movq	%r11, T_LOFAULT(%r9)    /* restore original lofault */
	leave
	ret
	SET_SIZE(do_copy_fault_nta)
	SET_SIZE(kcopy_nta)
278
279#elif defined(__i386)
280
281#define	ARG_FROM	8
282#define	ARG_TO		12
283#define	ARG_COUNT	16
284
/*
 * Point src/dst one past the end of their buffers and turn cnt into a
 * negative index (in 8-byte units) that counts up towards zero.
 */
285#define	COPY_LOOP_INIT(src, dst, cnt)	\
286	addl	cnt, src;			\
287	addl	cnt, dst;			\
288	shrl	$3, cnt;			\
289	neg	cnt
290
/*
 * Copy 16 bytes per iteration with non-temporal stores, using %esi as the
 * scratch register.  The final addl leaves ZF set once the index is zero.
 */
291#define	COPY_LOOP_BODY(src, dst, cnt)	\
292	prefetchnta	0x100(src, cnt, 8);	\
293	movl	(src, cnt, 8), %esi;		\
294	movnti	%esi, (dst, cnt, 8);		\
295	movl	0x4(src, cnt, 8), %esi;		\
296	movnti	%esi, 0x4(dst, cnt, 8);		\
297	movl	0x8(src, cnt, 8), %esi;		\
298	movnti	%esi, 0x8(dst, cnt, 8);		\
299	movl	0xc(src, cnt, 8), %esi;		\
300	movnti	%esi, 0xc(dst, cnt, 8);		\
301	addl	$2, cnt
302
303	/*
304	 * kcopy_nta is not implemented for 32-bit as no performance
305	 * improvement was shown.  We simply jump directly to kcopy,
306	 * which reads only from, to and count; the 4th argument is ignored.
307	 */
308	ENTRY(kcopy_nta)
309	jmp	kcopy
310
	/*
	 * NOTE(review): the lea below follows an unconditional jmp and sits
	 * before the ALTENTRY label, so nothing visible here reaches it.
	 * Code entering at do_copy_fault_nta presumably supplies its own
	 * lofault value in %eax -- confirm against the callers.
	 */
311	lea	_kcopy_nta_copyerr, %eax	/* lofault value */
312	ALTENTRY(do_copy_fault_nta)
313	pushl	%ebp
314	movl	%esp, %ebp		/* setup stack frame */
315	pushl	%esi
316	pushl	%edi
317
318	movl	%gs:CPU_THREAD, %edx
319	movl	T_LOFAULT(%edx), %edi
320	pushl	%edi			/* save the current lofault */
321	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
322
323	/* COPY_LOOP_BODY needs to use %esi */
324	movl	ARG_COUNT(%ebp), %ecx
325	movl	ARG_FROM(%ebp), %edi
326	movl	ARG_TO(%ebp), %eax
327	COPY_LOOP_INIT(%edi, %eax, %ecx)
3281:	COPY_LOOP_BODY(%edi, %eax, %ecx)
329	jnz	1b
330	mfence
331
332	xorl	%eax, %eax
	/* Fault landing pad and normal exit: pop the saved lofault, then regs. */
333_kcopy_nta_copyerr:
334	popl	%ecx
335	popl	%edi
336	movl	%ecx, T_LOFAULT(%edx)	/* restore the original lofault */
337	popl	%esi
338	leave
339	ret
340	SET_SIZE(do_copy_fault_nta)
341	SET_SIZE(kcopy_nta)
342
343#undef	ARG_FROM
344#undef	ARG_TO
345#undef	ARG_COUNT
346
347#endif	/* __i386 */
348#endif	/* __lint */
349
350#if defined(__lint)
351
/*
 * Lint-only stand-in for bcopy(); the real implementation is in assembly.
 * Deliberately does nothing.
 */
/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{
}
356
357#else	/* __lint */
358
359#if defined(__amd64)
360
/*
 * bcopy(from, to, count) -- amd64
 *	%rdi = from, %rsi = to, %rdx = count
 *
 * Entry and small-count dispatch.  Counts below 80 are handled by jumping
 * through the L(fwdPxQx) table into unrolled code; everything else goes to
 * bcopy_ck_size.
 */
	ENTRY(bcopy)
#ifdef DEBUG
	testq	%rdx, %rdx		/* %rdx = count */
	jz	1f			/* zero-length copies are not checked */
	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
	jb	0f
	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
	jnb	1f
0:	leaq	.bcopy_panic_msg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
1:
#endif
	/*
	 * bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
	 * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
	 * uses these registers in future they must be saved and restored.
	 */
	ALTENTRY(bcopy_altentry)
do_copy:
#define	L(s) .bcopy/**/s
	/*
	 * count is a size_t, so the comparison must be unsigned.  A signed
	 * branch would treat a count with the top bit set as "small" and
	 * index the jump table with it.
	 */
	cmpq	$0x50, %rdx		/* 80 */
	jae	bcopy_ck_size

	/*
	 * Performance data shows many caller's copy small buffers. So for
	 * best perf for these sizes unrolled code is used. Store data without
	 * worrying about alignment.
	 *
	 * The unrolled code addresses backwards from the end of each buffer,
	 * so advance both pointers by count before dispatching.
	 */
	leaq	L(fwdPxQx)(%rip), %r10
	addq	%rdx, %rdi
	addq	%rdx, %rsi
	movslq	(%r10,%rdx,4), %rcx	/* table entry = offset from table */
	leaq	(%rcx,%r10,1), %r10
	jmpq	*%r10
395
396	.p2align 4
	/*
	 * Dispatch table indexed by a byte count of 0-79.  Each entry is the
	 * signed 32-bit offset, from the start of the table, of the label
	 * PxQy that copies y quadwords followed by x trailing bytes; entry k
	 * names the label with x = k mod 8 and y = k div 8.
	 */
397L(fwdPxQx):
398	.int       L(P0Q0)-L(fwdPxQx)	/* 0 */
399	.int       L(P1Q0)-L(fwdPxQx)
400	.int       L(P2Q0)-L(fwdPxQx)
401	.int       L(P3Q0)-L(fwdPxQx)
402	.int       L(P4Q0)-L(fwdPxQx)
403	.int       L(P5Q0)-L(fwdPxQx)
404	.int       L(P6Q0)-L(fwdPxQx)
405	.int       L(P7Q0)-L(fwdPxQx)
406
407	.int       L(P0Q1)-L(fwdPxQx)	/* 8 */
408	.int       L(P1Q1)-L(fwdPxQx)
409	.int       L(P2Q1)-L(fwdPxQx)
410	.int       L(P3Q1)-L(fwdPxQx)
411	.int       L(P4Q1)-L(fwdPxQx)
412	.int       L(P5Q1)-L(fwdPxQx)
413	.int       L(P6Q1)-L(fwdPxQx)
414	.int       L(P7Q1)-L(fwdPxQx)
415
416	.int       L(P0Q2)-L(fwdPxQx)	/* 16 */
417	.int       L(P1Q2)-L(fwdPxQx)
418	.int       L(P2Q2)-L(fwdPxQx)
419	.int       L(P3Q2)-L(fwdPxQx)
420	.int       L(P4Q2)-L(fwdPxQx)
421	.int       L(P5Q2)-L(fwdPxQx)
422	.int       L(P6Q2)-L(fwdPxQx)
423	.int       L(P7Q2)-L(fwdPxQx)
424
425	.int       L(P0Q3)-L(fwdPxQx)	/* 24 */
426	.int       L(P1Q3)-L(fwdPxQx)
427	.int       L(P2Q3)-L(fwdPxQx)
428	.int       L(P3Q3)-L(fwdPxQx)
429	.int       L(P4Q3)-L(fwdPxQx)
430	.int       L(P5Q3)-L(fwdPxQx)
431	.int       L(P6Q3)-L(fwdPxQx)
432	.int       L(P7Q3)-L(fwdPxQx)
433
434	.int       L(P0Q4)-L(fwdPxQx)	/* 32 */
435	.int       L(P1Q4)-L(fwdPxQx)
436	.int       L(P2Q4)-L(fwdPxQx)
437	.int       L(P3Q4)-L(fwdPxQx)
438	.int       L(P4Q4)-L(fwdPxQx)
439	.int       L(P5Q4)-L(fwdPxQx)
440	.int       L(P6Q4)-L(fwdPxQx)
441	.int       L(P7Q4)-L(fwdPxQx)
442
443	.int       L(P0Q5)-L(fwdPxQx)	/* 40 */
444	.int       L(P1Q5)-L(fwdPxQx)
445	.int       L(P2Q5)-L(fwdPxQx)
446	.int       L(P3Q5)-L(fwdPxQx)
447	.int       L(P4Q5)-L(fwdPxQx)
448	.int       L(P5Q5)-L(fwdPxQx)
449	.int       L(P6Q5)-L(fwdPxQx)
450	.int       L(P7Q5)-L(fwdPxQx)
451
452	.int       L(P0Q6)-L(fwdPxQx)	/* 48 */
453	.int       L(P1Q6)-L(fwdPxQx)
454	.int       L(P2Q6)-L(fwdPxQx)
455	.int       L(P3Q6)-L(fwdPxQx)
456	.int       L(P4Q6)-L(fwdPxQx)
457	.int       L(P5Q6)-L(fwdPxQx)
458	.int       L(P6Q6)-L(fwdPxQx)
459	.int       L(P7Q6)-L(fwdPxQx)
460
461	.int       L(P0Q7)-L(fwdPxQx)	/* 56 */
462	.int       L(P1Q7)-L(fwdPxQx)
463	.int       L(P2Q7)-L(fwdPxQx)
464	.int       L(P3Q7)-L(fwdPxQx)
465	.int       L(P4Q7)-L(fwdPxQx)
466	.int       L(P5Q7)-L(fwdPxQx)
467	.int       L(P6Q7)-L(fwdPxQx)
468	.int       L(P7Q7)-L(fwdPxQx)
469
470	.int       L(P0Q8)-L(fwdPxQx)	/* 64 */
471	.int       L(P1Q8)-L(fwdPxQx)
472	.int       L(P2Q8)-L(fwdPxQx)
473	.int       L(P3Q8)-L(fwdPxQx)
474	.int       L(P4Q8)-L(fwdPxQx)
475	.int       L(P5Q8)-L(fwdPxQx)
476	.int       L(P6Q8)-L(fwdPxQx)
477	.int       L(P7Q8)-L(fwdPxQx)
478
479	.int       L(P0Q9)-L(fwdPxQx)	/* 72 */
480	.int       L(P1Q9)-L(fwdPxQx)
481	.int       L(P2Q9)-L(fwdPxQx)
482	.int       L(P3Q9)-L(fwdPxQx)
483	.int       L(P4Q9)-L(fwdPxQx)
484	.int       L(P5Q9)-L(fwdPxQx)
485	.int       L(P6Q9)-L(fwdPxQx)
486	.int       L(P7Q9)-L(fwdPxQx)	/* 79 */
487
488	.p2align 4
	/*
	 * Tails for counts that are a whole number of quadwords.  %rdi and
	 * %rsi point one byte past the end of the source and destination.
	 * Entering at L(P0Qn) copies the last n quadwords, falling through
	 * from label to label; L(P0Q0) simply returns.
	 */
489L(P0Q9):
490	mov    -0x48(%rdi), %rcx
491	mov    %rcx, -0x48(%rsi)
492L(P0Q8):
493	mov    -0x40(%rdi), %r10
494	mov    %r10, -0x40(%rsi)
495L(P0Q7):
496	mov    -0x38(%rdi), %r8
497	mov    %r8, -0x38(%rsi)
498L(P0Q6):
499	mov    -0x30(%rdi), %rcx
500	mov    %rcx, -0x30(%rsi)
501L(P0Q5):
502	mov    -0x28(%rdi), %r10
503	mov    %r10, -0x28(%rsi)
504L(P0Q4):
505	mov    -0x20(%rdi), %r8
506	mov    %r8, -0x20(%rsi)
507L(P0Q3):
508	mov    -0x18(%rdi), %rcx
509	mov    %rcx, -0x18(%rsi)
510L(P0Q2):
511	mov    -0x10(%rdi), %r10
512	mov    %r10, -0x10(%rsi)
513L(P0Q1):
514	mov    -0x8(%rdi), %r8
515	mov    %r8, -0x8(%rsi)
516L(P0Q0):
517	ret
518
519	.p2align 4
	/*
	 * Tails for counts of the form 8n + 1.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P1Qn)
	 * copies n quadwords by fall-through, then L(P1Q0) copies the final
	 * byte.
	 */
520L(P1Q9):
521	mov    -0x49(%rdi), %r8
522	mov    %r8, -0x49(%rsi)
523L(P1Q8):
524	mov    -0x41(%rdi), %rcx
525	mov    %rcx, -0x41(%rsi)
526L(P1Q7):
527	mov    -0x39(%rdi), %r10
528	mov    %r10, -0x39(%rsi)
529L(P1Q6):
530	mov    -0x31(%rdi), %r8
531	mov    %r8, -0x31(%rsi)
532L(P1Q5):
533	mov    -0x29(%rdi), %rcx
534	mov    %rcx, -0x29(%rsi)
535L(P1Q4):
536	mov    -0x21(%rdi), %r10
537	mov    %r10, -0x21(%rsi)
538L(P1Q3):
539	mov    -0x19(%rdi), %r8
540	mov    %r8, -0x19(%rsi)
541L(P1Q2):
542	mov    -0x11(%rdi), %rcx
543	mov    %rcx, -0x11(%rsi)
544L(P1Q1):
545	mov    -0x9(%rdi), %r10
546	mov    %r10, -0x9(%rsi)
547L(P1Q0):
548	movzbq -0x1(%rdi), %r8
549	mov    %r8b, -0x1(%rsi)
550	ret
551
552	.p2align 4
	/*
	 * Tails for counts of the form 8n + 2.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P2Qn)
	 * copies n quadwords by fall-through, then L(P2Q0) copies the final
	 * 16-bit word.
	 */
553L(P2Q9):
554	mov    -0x4a(%rdi), %r8
555	mov    %r8, -0x4a(%rsi)
556L(P2Q8):
557	mov    -0x42(%rdi), %rcx
558	mov    %rcx, -0x42(%rsi)
559L(P2Q7):
560	mov    -0x3a(%rdi), %r10
561	mov    %r10, -0x3a(%rsi)
562L(P2Q6):
563	mov    -0x32(%rdi), %r8
564	mov    %r8, -0x32(%rsi)
565L(P2Q5):
566	mov    -0x2a(%rdi), %rcx
567	mov    %rcx, -0x2a(%rsi)
568L(P2Q4):
569	mov    -0x22(%rdi), %r10
570	mov    %r10, -0x22(%rsi)
571L(P2Q3):
572	mov    -0x1a(%rdi), %r8
573	mov    %r8, -0x1a(%rsi)
574L(P2Q2):
575	mov    -0x12(%rdi), %rcx
576	mov    %rcx, -0x12(%rsi)
577L(P2Q1):
578	mov    -0xa(%rdi), %r10
579	mov    %r10, -0xa(%rsi)
580L(P2Q0):
581	movzwq -0x2(%rdi), %r8
582	mov    %r8w, -0x2(%rsi)
583	ret
584
585	.p2align 4
	/*
	 * Tails for counts of the form 8n + 3.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P3Qn)
	 * copies n quadwords by fall-through, then L(P3Q0) copies the final
	 * three bytes as a 16-bit word plus a byte.
	 */
586L(P3Q9):
587	mov    -0x4b(%rdi), %r8
588	mov    %r8, -0x4b(%rsi)
589L(P3Q8):
590	mov    -0x43(%rdi), %rcx
591	mov    %rcx, -0x43(%rsi)
592L(P3Q7):
593	mov    -0x3b(%rdi), %r10
594	mov    %r10, -0x3b(%rsi)
595L(P3Q6):
596	mov    -0x33(%rdi), %r8
597	mov    %r8, -0x33(%rsi)
598L(P3Q5):
599	mov    -0x2b(%rdi), %rcx
600	mov    %rcx, -0x2b(%rsi)
601L(P3Q4):
602	mov    -0x23(%rdi), %r10
603	mov    %r10, -0x23(%rsi)
604L(P3Q3):
605	mov    -0x1b(%rdi), %r8
606	mov    %r8, -0x1b(%rsi)
607L(P3Q2):
608	mov    -0x13(%rdi), %rcx
609	mov    %rcx, -0x13(%rsi)
610L(P3Q1):
611	mov    -0xb(%rdi), %r10
612	mov    %r10, -0xb(%rsi)
613	/*
614	 * These trailing loads/stores have to do all their loads 1st,
615	 * then do the stores.
616	 */
617L(P3Q0):
618	movzwq -0x3(%rdi), %r8
619	movzbq -0x1(%rdi), %r10
620	mov    %r8w, -0x3(%rsi)
621	mov    %r10b, -0x1(%rsi)
622	ret
623
624	.p2align 4
	/*
	 * Tails for counts of the form 8n + 4.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P4Qn)
	 * copies n quadwords by fall-through, then L(P4Q0) copies the final
	 * 32-bit word.
	 */
625L(P4Q9):
626	mov    -0x4c(%rdi), %r8
627	mov    %r8, -0x4c(%rsi)
628L(P4Q8):
629	mov    -0x44(%rdi), %rcx
630	mov    %rcx, -0x44(%rsi)
631L(P4Q7):
632	mov    -0x3c(%rdi), %r10
633	mov    %r10, -0x3c(%rsi)
634L(P4Q6):
635	mov    -0x34(%rdi), %r8
636	mov    %r8, -0x34(%rsi)
637L(P4Q5):
638	mov    -0x2c(%rdi), %rcx
639	mov    %rcx, -0x2c(%rsi)
640L(P4Q4):
641	mov    -0x24(%rdi), %r10
642	mov    %r10, -0x24(%rsi)
643L(P4Q3):
644	mov    -0x1c(%rdi), %r8
645	mov    %r8, -0x1c(%rsi)
646L(P4Q2):
647	mov    -0x14(%rdi), %rcx
648	mov    %rcx, -0x14(%rsi)
649L(P4Q1):
650	mov    -0xc(%rdi), %r10
651	mov    %r10, -0xc(%rsi)
652L(P4Q0):
653	mov    -0x4(%rdi), %r8d
654	mov    %r8d, -0x4(%rsi)
655	ret
656
657	.p2align 4
	/*
	 * Tails for counts of the form 8n + 5.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P5Qn)
	 * copies n quadwords by fall-through, then L(P5Q0) copies the final
	 * five bytes as a 32-bit word plus a byte, with both loads issued
	 * before either store.
	 */
658L(P5Q9):
659	mov    -0x4d(%rdi), %r8
660	mov    %r8, -0x4d(%rsi)
661L(P5Q8):
662	mov    -0x45(%rdi), %rcx
663	mov    %rcx, -0x45(%rsi)
664L(P5Q7):
665	mov    -0x3d(%rdi), %r10
666	mov    %r10, -0x3d(%rsi)
667L(P5Q6):
668	mov    -0x35(%rdi), %r8
669	mov    %r8, -0x35(%rsi)
670L(P5Q5):
671	mov    -0x2d(%rdi), %rcx
672	mov    %rcx, -0x2d(%rsi)
673L(P5Q4):
674	mov    -0x25(%rdi), %r10
675	mov    %r10, -0x25(%rsi)
676L(P5Q3):
677	mov    -0x1d(%rdi), %r8
678	mov    %r8, -0x1d(%rsi)
679L(P5Q2):
680	mov    -0x15(%rdi), %rcx
681	mov    %rcx, -0x15(%rsi)
682L(P5Q1):
683	mov    -0xd(%rdi), %r10
684	mov    %r10, -0xd(%rsi)
685L(P5Q0):
686	mov    -0x5(%rdi), %r8d
687	movzbq -0x1(%rdi), %r10
688	mov    %r8d, -0x5(%rsi)
689	mov    %r10b, -0x1(%rsi)
690	ret
691
692	.p2align 4
	/*
	 * Tails for counts of the form 8n + 6.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P6Qn)
	 * copies n quadwords by fall-through, then L(P6Q0) copies the final
	 * six bytes as a 32-bit word plus a 16-bit word, with both loads
	 * issued before either store.
	 */
693L(P6Q9):
694	mov    -0x4e(%rdi), %r8
695	mov    %r8, -0x4e(%rsi)
696L(P6Q8):
697	mov    -0x46(%rdi), %rcx
698	mov    %rcx, -0x46(%rsi)
699L(P6Q7):
700	mov    -0x3e(%rdi), %r10
701	mov    %r10, -0x3e(%rsi)
702L(P6Q6):
703	mov    -0x36(%rdi), %r8
704	mov    %r8, -0x36(%rsi)
705L(P6Q5):
706	mov    -0x2e(%rdi), %rcx
707	mov    %rcx, -0x2e(%rsi)
708L(P6Q4):
709	mov    -0x26(%rdi), %r10
710	mov    %r10, -0x26(%rsi)
711L(P6Q3):
712	mov    -0x1e(%rdi), %r8
713	mov    %r8, -0x1e(%rsi)
714L(P6Q2):
715	mov    -0x16(%rdi), %rcx
716	mov    %rcx, -0x16(%rsi)
717L(P6Q1):
718	mov    -0xe(%rdi), %r10
719	mov    %r10, -0xe(%rsi)
720L(P6Q0):
721	mov    -0x6(%rdi), %r8d
722	movzwq -0x2(%rdi), %r10
723	mov    %r8d, -0x6(%rsi)
724	mov    %r10w, -0x2(%rsi)
725	ret
726
727	.p2align 4
	/*
	 * Tails for counts of the form 8n + 7.  %rdi and %rsi point one byte
	 * past the end of the source and destination.  Entering at L(P7Qn)
	 * copies n quadwords by fall-through, then L(P7Q0) copies the final
	 * seven bytes as a 32-bit word, a 16-bit word and a byte, with all
	 * three loads issued before any store.
	 */
728L(P7Q9):
729	mov    -0x4f(%rdi), %r8
730	mov    %r8, -0x4f(%rsi)
731L(P7Q8):
732	mov    -0x47(%rdi), %rcx
733	mov    %rcx, -0x47(%rsi)
734L(P7Q7):
735	mov    -0x3f(%rdi), %r10
736	mov    %r10, -0x3f(%rsi)
737L(P7Q6):
738	mov    -0x37(%rdi), %r8
739	mov    %r8, -0x37(%rsi)
740L(P7Q5):
741	mov    -0x2f(%rdi), %rcx
742	mov    %rcx, -0x2f(%rsi)
743L(P7Q4):
744	mov    -0x27(%rdi), %r10
745	mov    %r10, -0x27(%rsi)
746L(P7Q3):
747	mov    -0x1f(%rdi), %r8
748	mov    %r8, -0x1f(%rsi)
749L(P7Q2):
750	mov    -0x17(%rdi), %rcx
751	mov    %rcx, -0x17(%rsi)
752L(P7Q1):
753	mov    -0xf(%rdi), %r10
754	mov    %r10, -0xf(%rsi)
755L(P7Q0):
756	mov    -0x7(%rdi), %r8d
757	movzwq -0x3(%rdi), %r10
758	movzbq -0x1(%rdi), %rcx
759	mov    %r8d, -0x7(%rsi)
760	mov    %r10w, -0x3(%rsi)
761	mov    %cl, -0x1(%rsi)
762	ret
763
764	/*
765	 * For large sizes rep smovq is fastest.
766	 * Transition point determined experimentally as measured on
767	 * Intel Xeon processors (incl. Nehalem and previous generations) and
768	 * AMD Opteron. The transition value is patched at boot time to avoid
769	 * memory reference hit.
770	 */
	/*
	 * The compare below is bracketed by two global labels and is preceded
	 * by a ret, so no code visible here branches to it.
	 * NOTE(review): presumably the bytes between bcopy_patch_start and
	 * bcopy_patch_end are a template that boot code copies over the
	 * compare at bcopy_ck_size -- confirm against the patching code.
	 */
771	.globl bcopy_patch_start
772bcopy_patch_start:
773	cmpq	$BCOPY_NHM_REP, %rdx
774	.globl bcopy_patch_end
775bcopy_patch_end:
776
/*
 * bcopy medium-size path.  On entry %rdi = from, %rsi = to, %rdx = count.
 * Counts at or above the rep threshold go to L(use_rep).  Otherwise the
 * destination is brought to an 8-byte boundary, 64 bytes are copied per
 * loop iteration, and the remainder is finished through the L(fwdPxQx)
 * table.
 *
 * count is a size_t, so both size branches below are unsigned (jae).
 */
	.p2align 4
	.globl bcopy_ck_size
bcopy_ck_size:
	cmpq	$BCOPY_DFLT_REP, %rdx
	jae	L(use_rep)

	/*
	 * Align to a 8-byte boundary. Avoids penalties from unaligned stores
	 * as well as from stores spanning cachelines.
	 */
	test	$0x7, %rsi
	jz	L(aligned_loop)
	test	$0x1, %rsi
	jz	2f
	movzbq	(%rdi), %r8		/* one byte to reach 2-byte alignment */
	dec	%rdx
	inc	%rdi
	mov	%r8b, (%rsi)
	inc	%rsi
2:
	test	$0x2, %rsi
	jz	4f
	movzwq	(%rdi), %r8		/* two bytes to reach 4-byte alignment */
	sub	$0x2, %rdx
	add	$0x2, %rdi
	mov	%r8w, (%rsi)
	add	$0x2, %rsi
4:
	test	$0x4, %rsi
	jz	L(aligned_loop)
	mov	(%rdi), %r8d		/* four bytes to reach 8-byte alignment */
	sub	$0x4, %rdx
	add	$0x4, %rdi
	mov	%r8d, (%rsi)
	add	$0x4, %rsi

	/*
	 * Copy 64-bytes per loop
	 */
	.p2align 4
L(aligned_loop):
	mov	(%rdi), %r8
	mov	0x8(%rdi), %r10
	lea	-0x40(%rdx), %rdx	/* count -= 64; lea leaves flags alone */
	mov	%r8, (%rsi)
	mov	%r10, 0x8(%rsi)
	mov	0x10(%rdi), %rcx
	mov	0x18(%rdi), %r8
	mov	%rcx, 0x10(%rsi)
	mov	%r8, 0x18(%rsi)

	/*
	 * The flags set here are consumed by the jae at the bottom; every
	 * instruction in between is a mov or lea, neither of which changes
	 * them.
	 */
	cmp	$0x40, %rdx
	mov	0x20(%rdi), %r10
	mov	0x28(%rdi), %rcx
	mov	%r10, 0x20(%rsi)
	mov	%rcx, 0x28(%rsi)
	mov	0x30(%rdi), %r8
	mov	0x38(%rdi), %r10
	lea	0x40(%rdi), %rdi
	mov	%r8, 0x30(%rsi)
	mov	%r10, 0x38(%rsi)
	lea	0x40(%rsi), %rsi
	jae	L(aligned_loop)		/* another full 64 bytes remain */

	/*
	 * Copy remaining bytes (0-63)
	 */
L(do_remainder):
	leaq	L(fwdPxQx)(%rip), %r10
	addq	%rdx, %rdi		/* unrolled code indexes from the end */
	addq	%rdx, %rsi
	movslq	(%r10,%rdx,4), %rcx
	leaq	(%rcx,%r10,1), %r10
	jmpq	*%r10
851
/*
 * bcopy large-size path: move whole quadwords with "rep smovq", then hand
 * the 0-7 leftover bytes to L(do_remainder).  On entry %rdi = from,
 * %rsi = to, %rdx = count.
 */
	.p2align 4
L(use_rep):
	movq	%rdx, %rcx
	shrq	$3, %rcx		/* %rcx = number of quadwords */
	xchgq	%rdi, %rsi		/* string op wants %rsi = src, %rdi = dst */
	rep
	  smovq

	xchgq	%rsi, %rdi		/* back to %rdi = src, %rsi = dst */
	andq	$7, %rdx		/* bytes still to copy */
	jnz	L(do_remainder)
	ret
#undef	L
868
#ifdef DEBUG
	/*
	 * Shared DEBUG tail: callers jump here with the panic format string
	 * already in %rdi.  Pushing %rbp both records a frame and realigns
	 * the stack to the 16-byte boundary required at a call.  panic() is
	 * variadic, so %al must hold the number of vector registers carrying
	 * arguments -- none here.
	 */
call_panic:
	xorl	%eax, %eax		/* no vector-register arguments */
	pushq	%rbp			/* frame + 16-byte stack alignment */
	movq	%rsp, %rbp
	call	panic			/* %rdi = format string */
#endif
	SET_SIZE(bcopy_altentry)
	SET_SIZE(bcopy)
887
888#elif defined(__i386)
889
/*
 * bcopy(from, to, count) -- i386
 *
 * Frameless; the argument offsets are relative to %esp at entry.  %esi and
 * %edi are parked in %eax and %edx for the duration of the copy instead of
 * being pushed.
 */
#define	ARG_FROM	4
#define	ARG_TO		8
#define	ARG_COUNT	12

	ENTRY(bcopy)
#ifdef DEBUG
	movl	ARG_COUNT(%esp), %eax
	testl	%eax, %eax
	jz	9f			/* zero-length copies are not checked */
	movl	postbootkernelbase, %eax
	cmpl	%eax, ARG_FROM(%esp)
	jb	8f
	cmpl	%eax, ARG_TO(%esp)
	jnb	9f
8:	pushl	%ebp
	movl	%esp, %ebp
	pushl	$.bcopy_panic_msg
	call	panic
9:
#endif
do_copy:
	movl	%edi, %edx		/* park callee-saved %edi ... */
	movl	%esi, %eax		/* ... and %esi */
	movl	ARG_FROM(%esp), %esi
	movl	ARG_TO(%esp), %edi
	movl	ARG_COUNT(%esp), %ecx

	shrl	$2, %ecx		/* whole 32-bit words first */
	rep
	  smovl
	movl	ARG_COUNT(%esp), %ecx
	andl	$3, %ecx		/* then the 0-3 trailing bytes */
	rep
	  smovb
	movl	%edx, %edi		/* hand the parked registers back */
	movl	%eax, %esi
	ret
	SET_SIZE(bcopy)

#undef	ARG_COUNT
#undef	ARG_FROM
#undef	ARG_TO
932
933#endif	/* __i386 */
934#endif	/* __lint */
935
936
937/*
938 * Zero a block of storage, returning an error code if we
939 * take a kernel pagefault which cannot be resolved.
940 * Returns errno value on pagefault error, 0 if all ok
941 */
942
943#if defined(__lint)
944
/*
 * Lint-only stand-in for kzero(); the real implementation is in assembly.
 * Leaves the buffer alone and reports success.
 */
/* ARGSUSED */
int
kzero(void *addr, size_t count)
{
	return (0);
}
949
950#else	/* __lint */
951
952#if defined(__amd64)
953
/*
 * kzero(addr, count) -- amd64
 *	%rdi = addr, %rsi = count
 *
 * Zeroes the buffer through bzero_altentry with _kzeroerr installed as the
 * thread's lofault handler.  Returns 0 on success; after a fault execution
 * resumes at _kzeroerr with the errno value in %rax.
 *
 * %r9 (thread pointer) and %r11 (saved lofault) are used after the call, so
 * this routine relies on bzero_altentry leaving both untouched.
 */
	ENTRY(kzero)
#ifdef DEBUG
	cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
	jnb	9f
	leaq	.kzero_panic_msg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
9:
#endif
	movq	%gs:CPU_THREAD, %r9	/* %r9 = current thread */
	leaq	_kzeroerr(%rip), %rdx	/* %rdx = lofault to install */

	movq	T_LOFAULT(%r9), %r11	/* remember the caller's lofault */
	movq	%rdx, T_LOFAULT(%r9)	/* arm ours */
	call	bzero_altentry
	movq	%r11, T_LOFAULT(%r9)	/* put the caller's lofault back */
	xorl	%eax, %eax		/* success */
	ret
	/*
	 * Fault landing pad.  No frame pointer is kept here, so the return
	 * address pushed by the call to bzero_altentry is still on the stack
	 * and has to be dropped by hand.
	 */
_kzeroerr:
	movq	%r11, T_LOFAULT(%r9)	/* put the caller's lofault back */
	addq	$8, %rsp		/* discard bzero_altentry return addr */
	ret
	SET_SIZE(kzero)
983
984#elif defined(__i386)
985
/*
 * kzero(addr, count) -- i386
 *
 * Argument offsets are relative to %ebp once the frame has been pushed.
 * Returns 0 in %eax on success (the zero that served as the store value);
 * after a fault execution resumes at _kzeroerr with the errno value in %eax.
 */
#define	ARG_ADDR	8
#define	ARG_COUNT	12

	ENTRY(kzero)
#ifdef DEBUG
	/* Temporary frame, only to reach the argument for the sanity check. */
	pushl	%ebp
	movl	%esp, %ebp
	movl	postbootkernelbase, %eax
	cmpl	%eax, ARG_ADDR(%ebp)
	jnb	9f
	pushl	$.kzero_panic_msg
	call	panic
9:	popl	%ebp
#endif
	movl	%gs:CPU_THREAD, %edx	/* %edx = current thread */
	lea	_kzeroerr, %eax		/* %eax = lofault to install */

	pushl	%ebp
	movl	%esp, %ebp
	pushl	%edi			/* callee-saved; used by the string ops */

	movl	T_LOFAULT(%edx), %edi
	pushl	%edi			/* old lofault lives on the stack */
	movl	%eax, T_LOFAULT(%edx)	/* arm ours */

	xorl	%eax, %eax		/* store value, and the success result */
	movl	ARG_ADDR(%ebp), %edi	/* %edi = first byte to clear */
	movl	ARG_COUNT(%ebp), %ecx
	shrl	$2, %ecx		/* whole 32-bit words first */
	rep
	  sstol

	movl	ARG_COUNT(%ebp), %ecx
	andl	$3, %ecx		/* then the 0-3 trailing bytes */
	rep
	  sstob

	/*
	 * Fault landing pad and normal exit: the first pop retrieves the
	 * saved lofault, the second restores the caller's %edi.
	 */
_kzeroerr:
	popl	%edi
	movl	%edi, T_LOFAULT(%edx)	/* put the caller's lofault back */
	popl	%edi
	popl	%ebp
	ret
	SET_SIZE(kzero)

#undef	ARG_ADDR
#undef	ARG_COUNT
1037
1038#endif	/* __i386 */
1039#endif	/* __lint */
1040
1041/*
1042 * Zero a block of storage.
1043 */
1044
1045#if defined(__lint)
1046
/*
 * Lint-only stand-in for bzero(); the real implementation is in assembly.
 * Deliberately does nothing.
 */
/* ARGSUSED */
void
bzero(void *addr, size_t count)
{
}
1051
1052#else	/* __lint */
1053
1054#if defined(__amd64)
1055
/*
 * bzero(addr, count) -- amd64
 *	%rdi = addr, %rsi = count
 *
 * Entry and small-count dispatch.  Counts below 80 are handled by jumping
 * through the L(setPxQx) table into unrolled stores; everything else goes
 * to L(ck_align).  %rax is zeroed here and serves as the store value.
 */
	ENTRY(bzero)
#ifdef DEBUG
	cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
	jnb	0f
	leaq	.bzero_panic_msg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
0:
#endif
	ALTENTRY(bzero_altentry)
do_zero:
#define	L(s) .bzero/**/s
	xorl	%eax, %eax

	/*
	 * count is a size_t, so the comparison must be unsigned.  A signed
	 * branch would treat a count with the top bit set as "small" and
	 * index the jump table with it.
	 */
	cmpq	$0x50, %rsi		/* 80 */
	jae	L(ck_align)

	/*
	 * Performance data shows many caller's are zeroing small buffers. So
	 * for best perf for these sizes unrolled code is used. Store zeros
	 * without worrying about alignment.
	 *
	 * The unrolled code addresses backwards from the end of the buffer,
	 * so advance the pointer by count before dispatching.
	 */
	leaq	L(setPxQx)(%rip), %r10
	addq	%rsi, %rdi
	movslq	(%r10,%rsi,4), %rcx	/* table entry = offset from table */
	leaq	(%rcx,%r10,1), %r10
	jmpq	*%r10
1082
1083	.p2align 4
	/*
	 * Dispatch table indexed by a byte count of 0-79.  Each entry is the
	 * signed 32-bit offset, from the start of the table, of the label
	 * PxQy that clears y quadwords followed by x trailing bytes; entry k
	 * names the label with x = k mod 8 and y = k div 8.
	 */
1084L(setPxQx):
1085	.int       L(P0Q0)-L(setPxQx)	/* 0 */
1086	.int       L(P1Q0)-L(setPxQx)
1087	.int       L(P2Q0)-L(setPxQx)
1088	.int       L(P3Q0)-L(setPxQx)
1089	.int       L(P4Q0)-L(setPxQx)
1090	.int       L(P5Q0)-L(setPxQx)
1091	.int       L(P6Q0)-L(setPxQx)
1092	.int       L(P7Q0)-L(setPxQx)
1093
1094	.int       L(P0Q1)-L(setPxQx)	/* 8 */
1095	.int       L(P1Q1)-L(setPxQx)
1096	.int       L(P2Q1)-L(setPxQx)
1097	.int       L(P3Q1)-L(setPxQx)
1098	.int       L(P4Q1)-L(setPxQx)
1099	.int       L(P5Q1)-L(setPxQx)
1100	.int       L(P6Q1)-L(setPxQx)
1101	.int       L(P7Q1)-L(setPxQx)
1102
1103	.int       L(P0Q2)-L(setPxQx)	/* 16 */
1104	.int       L(P1Q2)-L(setPxQx)
1105	.int       L(P2Q2)-L(setPxQx)
1106	.int       L(P3Q2)-L(setPxQx)
1107	.int       L(P4Q2)-L(setPxQx)
1108	.int       L(P5Q2)-L(setPxQx)
1109	.int       L(P6Q2)-L(setPxQx)
1110	.int       L(P7Q2)-L(setPxQx)
1111
1112	.int       L(P0Q3)-L(setPxQx)	/* 24 */
1113	.int       L(P1Q3)-L(setPxQx)
1114	.int       L(P2Q3)-L(setPxQx)
1115	.int       L(P3Q3)-L(setPxQx)
1116	.int       L(P4Q3)-L(setPxQx)
1117	.int       L(P5Q3)-L(setPxQx)
1118	.int       L(P6Q3)-L(setPxQx)
1119	.int       L(P7Q3)-L(setPxQx)
1120
1121	.int       L(P0Q4)-L(setPxQx)	/* 32 */
1122	.int       L(P1Q4)-L(setPxQx)
1123	.int       L(P2Q4)-L(setPxQx)
1124	.int       L(P3Q4)-L(setPxQx)
1125	.int       L(P4Q4)-L(setPxQx)
1126	.int       L(P5Q4)-L(setPxQx)
1127	.int       L(P6Q4)-L(setPxQx)
1128	.int       L(P7Q4)-L(setPxQx)
1129
1130	.int       L(P0Q5)-L(setPxQx)	/* 40 */
1131	.int       L(P1Q5)-L(setPxQx)
1132	.int       L(P2Q5)-L(setPxQx)
1133	.int       L(P3Q5)-L(setPxQx)
1134	.int       L(P4Q5)-L(setPxQx)
1135	.int       L(P5Q5)-L(setPxQx)
1136	.int       L(P6Q5)-L(setPxQx)
1137	.int       L(P7Q5)-L(setPxQx)
1138
1139	.int       L(P0Q6)-L(setPxQx)	/* 48 */
1140	.int       L(P1Q6)-L(setPxQx)
1141	.int       L(P2Q6)-L(setPxQx)
1142	.int       L(P3Q6)-L(setPxQx)
1143	.int       L(P4Q6)-L(setPxQx)
1144	.int       L(P5Q6)-L(setPxQx)
1145	.int       L(P6Q6)-L(setPxQx)
1146	.int       L(P7Q6)-L(setPxQx)
1147
1148	.int       L(P0Q7)-L(setPxQx)	/* 56 */
1149	.int       L(P1Q7)-L(setPxQx)
1150	.int       L(P2Q7)-L(setPxQx)
1151	.int       L(P3Q7)-L(setPxQx)
1152	.int       L(P4Q7)-L(setPxQx)
1153	.int       L(P5Q7)-L(setPxQx)
1154	.int       L(P6Q7)-L(setPxQx)
1155	.int       L(P7Q7)-L(setPxQx)
1156
1157	.int       L(P0Q8)-L(setPxQx)	/* 64 */
1158	.int       L(P1Q8)-L(setPxQx)
1159	.int       L(P2Q8)-L(setPxQx)
1160	.int       L(P3Q8)-L(setPxQx)
1161	.int       L(P4Q8)-L(setPxQx)
1162	.int       L(P5Q8)-L(setPxQx)
1163	.int       L(P6Q8)-L(setPxQx)
1164	.int       L(P7Q8)-L(setPxQx)
1165
1166	.int       L(P0Q9)-L(setPxQx)	/* 72 */
1167	.int       L(P1Q9)-L(setPxQx)
1168	.int       L(P2Q9)-L(setPxQx)
1169	.int       L(P3Q9)-L(setPxQx)
1170	.int       L(P4Q9)-L(setPxQx)
1171	.int       L(P5Q9)-L(setPxQx)
1172	.int       L(P6Q9)-L(setPxQx)
1173	.int       L(P7Q9)-L(setPxQx)	/* 79 */
1174
1175	.p2align 4
	/*
	 * Tails for counts that are a whole number of quadwords.  %rdi points
	 * one byte past the end of the buffer and %rax is zero.  Entering at
	 * L(P0Qn) clears the last n quadwords by fall-through; L(P0Q0)
	 * simply returns.
	 */
1176L(P0Q9): mov    %rax, -0x48(%rdi)
1177L(P0Q8): mov    %rax, -0x40(%rdi)
1178L(P0Q7): mov    %rax, -0x38(%rdi)
1179L(P0Q6): mov    %rax, -0x30(%rdi)
1180L(P0Q5): mov    %rax, -0x28(%rdi)
1181L(P0Q4): mov    %rax, -0x20(%rdi)
1182L(P0Q3): mov    %rax, -0x18(%rdi)
1183L(P0Q2): mov    %rax, -0x10(%rdi)
1184L(P0Q1): mov    %rax, -0x8(%rdi)
1185L(P0Q0):
1186	 ret
1187
1188	.p2align 4
	/*
	 * Tails for counts of the form 8n + 1.  %rdi points one byte past the
	 * end of the buffer and %rax is zero.  Entering at L(P1Qn) clears n
	 * quadwords by fall-through, then L(P1Q0) clears the final byte.
	 */
1189L(P1Q9): mov    %rax, -0x49(%rdi)
1190L(P1Q8): mov    %rax, -0x41(%rdi)
1191L(P1Q7): mov    %rax, -0x39(%rdi)
1192L(P1Q6): mov    %rax, -0x31(%rdi)
1193L(P1Q5): mov    %rax, -0x29(%rdi)
1194L(P1Q4): mov    %rax, -0x21(%rdi)
1195L(P1Q3): mov    %rax, -0x19(%rdi)
1196L(P1Q2): mov    %rax, -0x11(%rdi)
1197L(P1Q1): mov    %rax, -0x9(%rdi)
1198L(P1Q0): mov    %al, -0x1(%rdi)
1199	 ret
1200
1201	.p2align 4
	/*
	 * Tails for counts of the form 8n + 2.  %rdi points one byte past the
	 * end of the buffer and %rax is zero.  Entering at L(P2Qn) clears n
	 * quadwords by fall-through, then L(P2Q0) clears the final 16-bit
	 * word.
	 */
1202L(P2Q9): mov    %rax, -0x4a(%rdi)
1203L(P2Q8): mov    %rax, -0x42(%rdi)
1204L(P2Q7): mov    %rax, -0x3a(%rdi)
1205L(P2Q6): mov    %rax, -0x32(%rdi)
1206L(P2Q5): mov    %rax, -0x2a(%rdi)
1207L(P2Q4): mov    %rax, -0x22(%rdi)
1208L(P2Q3): mov    %rax, -0x1a(%rdi)
1209L(P2Q2): mov    %rax, -0x12(%rdi)
1210L(P2Q1): mov    %rax, -0xa(%rdi)
1211L(P2Q0): mov    %ax, -0x2(%rdi)
1212	 ret
1213
1214	.p2align 4
	/*
	 * Tails for counts of the form 8n + 3.  %rdi points one byte past the
	 * end of the buffer and %rax is zero.  Entering at L(P3Qn) clears n
	 * quadwords by fall-through, then L(P3Q0) clears the final three
	 * bytes as a 16-bit word plus a byte.
	 */
1215L(P3Q9): mov    %rax, -0x4b(%rdi)
1216L(P3Q8): mov    %rax, -0x43(%rdi)
1217L(P3Q7): mov    %rax, -0x3b(%rdi)
1218L(P3Q6): mov    %rax, -0x33(%rdi)
1219L(P3Q5): mov    %rax, -0x2b(%rdi)
1220L(P3Q4): mov    %rax, -0x23(%rdi)
1221L(P3Q3): mov    %rax, -0x1b(%rdi)
1222L(P3Q2): mov    %rax, -0x13(%rdi)
1223L(P3Q1): mov    %rax, -0xb(%rdi)
1224L(P3Q0): mov    %ax, -0x3(%rdi)
1225	 mov    %al, -0x1(%rdi)
1226	 ret
1227
1228	.p2align 4
	/*
	 * Tails for counts of the form 8n + 4.  %rdi points one byte past the
	 * end of the buffer and %rax is zero.  Entering at L(P4Qn) clears n
	 * quadwords by fall-through, then L(P4Q0) clears the final 32-bit
	 * word.
	 */
1229L(P4Q9): mov    %rax, -0x4c(%rdi)
1230L(P4Q8): mov    %rax, -0x44(%rdi)
1231L(P4Q7): mov    %rax, -0x3c(%rdi)
1232L(P4Q6): mov    %rax, -0x34(%rdi)
1233L(P4Q5): mov    %rax, -0x2c(%rdi)
1234L(P4Q4): mov    %rax, -0x24(%rdi)
1235L(P4Q3): mov    %rax, -0x1c(%rdi)
1236L(P4Q2): mov    %rax, -0x14(%rdi)
1237L(P4Q1): mov    %rax, -0xc(%rdi)
1238L(P4Q0): mov    %eax, -0x4(%rdi)
1239	 ret
1240
1241	.p2align 4
1242L(P5Q9): mov    %rax, -0x4d(%rdi)
1243L(P5Q8): mov    %rax, -0x45(%rdi)
1244L(P5Q7): mov    %rax, -0x3d(%rdi)
1245L(P5Q6): mov    %rax, -0x35(%rdi)
1246L(P5Q5): mov    %rax, -0x2d(%rdi)
1247L(P5Q4): mov    %rax, -0x25(%rdi)
1248L(P5Q3): mov    %rax, -0x1d(%rdi)
1249L(P5Q2): mov    %rax, -0x15(%rdi)
1250L(P5Q1): mov    %rax, -0xd(%rdi)
1251L(P5Q0): mov    %eax, -0x5(%rdi)
1252	 mov    %al, -0x1(%rdi)
1253	 ret
1254
1255	.p2align 4
1256L(P6Q9): mov    %rax, -0x4e(%rdi)
1257L(P6Q8): mov    %rax, -0x46(%rdi)
1258L(P6Q7): mov    %rax, -0x3e(%rdi)
1259L(P6Q6): mov    %rax, -0x36(%rdi)
1260L(P6Q5): mov    %rax, -0x2e(%rdi)
1261L(P6Q4): mov    %rax, -0x26(%rdi)
1262L(P6Q3): mov    %rax, -0x1e(%rdi)
1263L(P6Q2): mov    %rax, -0x16(%rdi)
1264L(P6Q1): mov    %rax, -0xe(%rdi)
1265L(P6Q0): mov    %eax, -0x6(%rdi)
1266	 mov    %ax, -0x2(%rdi)
1267	 ret
1268
1269	.p2align 4
1270L(P7Q9): mov    %rax, -0x4f(%rdi)
1271L(P7Q8): mov    %rax, -0x47(%rdi)
1272L(P7Q7): mov    %rax, -0x3f(%rdi)
1273L(P7Q6): mov    %rax, -0x37(%rdi)
1274L(P7Q5): mov    %rax, -0x2f(%rdi)
1275L(P7Q4): mov    %rax, -0x27(%rdi)
1276L(P7Q3): mov    %rax, -0x1f(%rdi)
1277L(P7Q2): mov    %rax, -0x17(%rdi)
1278L(P7Q1): mov    %rax, -0xf(%rdi)
1279L(P7Q0): mov    %eax, -0x7(%rdi)
1280	 mov    %ax, -0x3(%rdi)
1281	 mov    %al, -0x1(%rdi)
1282	 ret
1283
1284	/*
1285	 * Align to a 16-byte boundary. Avoids penalties from unaligned stores
1286	 * as well as from stores spanning cachelines. Note 16-byte alignment
1287	 * is better in case where rep sstoq is used.
1288	 */
1289	.p2align 4
1290L(ck_align):
1291	test	$0xf, %rdi
1292	jz	L(aligned_now)
1293	test	$1, %rdi
1294	jz	2f
1295	mov	%al, (%rdi)
1296	dec	%rsi
1297	lea	1(%rdi),%rdi
12982:
1299	test	$2, %rdi
1300	jz	4f
1301	mov	%ax, (%rdi)
1302	sub	$2, %rsi
1303	lea	2(%rdi),%rdi
13044:
1305	test	$4, %rdi
1306	jz	8f
1307	mov	%eax, (%rdi)
1308	sub	$4, %rsi
1309	lea	4(%rdi),%rdi
13108:
1311	test	$8, %rdi
1312	jz	L(aligned_now)
1313	mov	%rax, (%rdi)
1314	sub	$8, %rsi
1315	lea	8(%rdi),%rdi
1316
1317	/*
1318	 * For large sizes rep sstoq is fastest.
1319	 * Transition point determined experimentally as measured on
1320	 * Intel Xeon processors (incl. Nehalem) and AMD Opteron.
1321	 */
1322L(aligned_now):
1323	cmp	$BZERO_USE_REP, %rsi
1324	jg	L(use_rep)
1325
1326	/*
1327	 * zero 64-bytes per loop
1328	 */
1329	.p2align 4
1330L(bzero_loop):
1331	leaq	-0x40(%rsi), %rsi
1332	cmpq	$0x40, %rsi
1333	movq	%rax, (%rdi)
1334	movq	%rax, 0x8(%rdi)
1335	movq	%rax, 0x10(%rdi)
1336	movq	%rax, 0x18(%rdi)
1337	movq	%rax, 0x20(%rdi)
1338	movq	%rax, 0x28(%rdi)
1339	movq	%rax, 0x30(%rdi)
1340	movq	%rax, 0x38(%rdi)
1341	leaq	0x40(%rdi), %rdi
1342	jge	L(bzero_loop)
1343
1344	/*
1345	 * Clear any remaining bytes..
1346	 */
13479:
1348	leaq	L(setPxQx)(%rip), %r10
1349	addq	%rsi, %rdi
1350	movslq	(%r10,%rsi,4), %rcx
1351	leaq	(%rcx,%r10,1), %r10
1352	jmpq	*%r10
1353
1354	/*
1355	 * Use rep sstoq. Clear any remainder via unrolled code
1356	 */
1357	.p2align 4
1358L(use_rep):
1359	movq	%rsi, %rcx		/* get size in bytes */
1360	shrq	$3, %rcx		/* count of 8-byte words to zero */
1361	rep
1362	  sstoq				/* %rcx = words to clear (%rax=0) */
1363	andq	$7, %rsi		/* remaining bytes */
1364	jnz	9b
1365	ret
1366#undef	L
1367	SET_SIZE(bzero_altentry)
1368	SET_SIZE(bzero)
1369
1370#elif defined(__i386)
1371
/*
 * bzero(addr, count) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_ADDR(%esp) = addr, ARG_COUNT(%esp) = count.
 * Zero is stored to count / 4 longwords with "rep sstol" and then to the
 * remaining count & 3 bytes with "rep sstob".  %edi is parked in %edx
 * while it serves as the store pointer and is put back before returning.
 * On DEBUG kernels an addr below postbootkernelbase leads to panic().
 */
1372#define	ARG_ADDR	4
1373#define	ARG_COUNT	8
1374
1375	ENTRY(bzero)
1376#ifdef DEBUG
1377	movl	postbootkernelbase, %eax
1378	cmpl	%eax, ARG_ADDR(%esp)
1379	jnb	0f
1380	pushl	%ebp
1381	movl	%esp, %ebp
1382	pushl	$.bzero_panic_msg
1383	call	panic
13840:
1385#endif
1386do_zero:
1387	movl	%edi, %edx		/* keep the caller's %edi in %edx */
1388	movl	ARG_COUNT(%esp), %ecx
1389	movl	ARG_ADDR(%esp), %edi
1390	shrl	$2, %ecx		/* number of whole longwords */
1391	xorl	%eax, %eax		/* fill value is zero */
1392	rep
1393	  sstol
1394	movl	ARG_COUNT(%esp), %ecx
1395	andl	$3, %ecx		/* leftover bytes (0-3) */
1396	rep
1397	  sstob
1398	movl	%edx, %edi		/* put %edi back */
1399	ret
1400	SET_SIZE(bzero)
1401
1402#undef	ARG_ADDR
1403#undef	ARG_COUNT
1404
1405#endif	/* __i386 */
1406#endif	/* __lint */
1407
1408/*
1409 * Transfer data to and from user space -
1410 * Note that these routines can cause faults
1411 * It is assumed that the kernel has nothing at
1412 * less than KERNELBASE in the virtual address space.
1413 *
1414 * Note that copyin(9F) and copyout(9F) are part of the
1415 * DDI/DKI which specifies that they return '-1' on "errors."
1416 *
1417 * Sigh.
1418 *
1419 * So there's two extremely similar routines - xcopyin_nta() and
1420 * xcopyout_nta() which return the errno that we've faithfully computed.
1421 * This allows other callers (e.g. uiomove(9F)) to work correctly.
1422 * Given that these are used pretty heavily, we expand the calling
1423 * sequences inline for all flavours (rather than making wrappers).
1424 */
1425
1426/*
1427 * Copy user data to kernel space.
1428 */
1429
1430#if defined(__lint)
1431
/* C stub compiled only when __lint is defined; it supplies the prototype. */
1432/* ARGSUSED */
1433int
1434copyin(const void *uaddr, void *kaddr, size_t count)
1435{ return (0); }
1436
1437#else	/* lint */
1438
1439#if defined(__amd64)
1440
/*
 * copyin(uaddr, kaddr, count) -- 64-bit implementation.
 *
 * In: %rdi = uaddr, %rsi = kaddr, %rdx = count.
 *
 * When uaddr is below kernelbase the copy is handed to do_copy_fault
 * with _copyin_err in %rcx as the lofault handler and the thread pointer
 * in %r9.  When uaddr is not below kernelbase, or control arrives at
 * _copyin_err, the thread's copyops vector (T_COPYOPS) is consulted: if
 * it is non-NULL the three saved arguments are reloaded and CP_COPYIN is
 * tail-called, otherwise -1 is returned.  On DEBUG kernels a kaddr below
 * kernelbase leads to panic().
 */
1441	ENTRY(copyin)
1442	pushq	%rbp
1443	movq	%rsp, %rbp
1444	subq	$24, %rsp		/* three 8-byte slots for the args */
1445
1446	/*
1447	 * save args in case we trap and need to rerun as a copyop
1448	 */
1449	movq	%rdi, (%rsp)
1450	movq	%rsi, 0x8(%rsp)
1451	movq	%rdx, 0x10(%rsp)
1452
1453	movq	kernelbase(%rip), %rax
1454#ifdef DEBUG
1455	cmpq	%rax, %rsi		/* %rsi = kaddr */
1456	jnb	1f
1457	leaq	.copyin_panic_msg(%rip), %rdi
1458	xorl	%eax, %eax
1459	call	panic
14601:
1461#endif
1462	/*
1463	 * pass lofault value as 4th argument to do_copy_fault
1464	 */
1465	leaq	_copyin_err(%rip), %rcx
1466
1467	movq	%gs:CPU_THREAD, %r9
1468	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1469	jb	do_copy_fault
1470	jmp	3f			/* not a user address: try copyops */
1471
1472_copyin_err:
1473	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
1474	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
14753:
1476	movq	T_COPYOPS(%r9), %rax
1477	cmpq	$0, %rax
1478	jz	2f
1479	/*
1480	 * reload args for the copyop
1481	 */
1482	movq	(%rsp), %rdi
1483	movq	0x8(%rsp), %rsi
1484	movq	0x10(%rsp), %rdx
1485	leave
1486	jmp	*CP_COPYIN(%rax)
1487
14882:	movl	$-1, %eax		/* no copyops installed: fail with -1 */
1489	leave
1490	ret
1491	SET_SIZE(copyin)
1492
1493#elif defined(__i386)
1494
/*
 * copyin(uaddr, kaddr, count) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_UADDR(%esp) = uaddr, ARG_KADDR(%esp) =
 * kaddr.  When uaddr is below kernelbase the copy is handed to
 * do_copy_fault with the lofault handler (_copyin_err) in %eax and the
 * thread pointer in %edx.  Otherwise, or once _copyin_err has run, the
 * thread's copyops vector is consulted: jump through CP_COPYIN if it is
 * non-NULL, else return -1.  On DEBUG kernels a kaddr below kernelbase
 * leads to panic().
 *
 * _copyin_err pops four words: the saved lofault value (into %ecx) and
 * then %edi, %esi and %ebp -- presumably the words pushed by
 * do_copy_fault, which is defined outside this section (TODO confirm).
 */
1495#define	ARG_UADDR	4
1496#define	ARG_KADDR	8
1497
1498	ENTRY(copyin)
1499	movl	kernelbase, %ecx
1500#ifdef DEBUG
1501	cmpl	%ecx, ARG_KADDR(%esp)
1502	jnb	1f
1503	pushl	%ebp
1504	movl	%esp, %ebp
1505	pushl	$.copyin_panic_msg
1506	call	panic
15071:
1508#endif
1509	lea	_copyin_err, %eax
1510
1511	movl	%gs:CPU_THREAD, %edx
1512	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1513	jb	do_copy_fault
1514	jmp	3f
1515
1516_copyin_err:
1517	popl	%ecx
1518	popl	%edi
1519	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1520	popl	%esi
1521	popl	%ebp
15223:
1523	movl	T_COPYOPS(%edx), %eax
1524	cmpl	$0, %eax
1525	jz	2f
1526	jmp	*CP_COPYIN(%eax)
1527
15282:	movl	$-1, %eax		/* no copyops installed: fail with -1 */
1529	ret
1530	SET_SIZE(copyin)
1531
1532#undef	ARG_UADDR
1533#undef	ARG_KADDR
1534
1535#endif	/* __i386 */
1536#endif	/* __lint */
1537
1538#if defined(__lint)
1539
/* C stub compiled only when __lint is defined; it supplies the prototype. */
1540/* ARGSUSED */
1541int
1542xcopyin_nta(const void *uaddr, void *kaddr, size_t count, int copy_cached)
1543{ return (0); }
1544
1545#else	/* __lint */
1546
1547#if defined(__amd64)
1548
/*
 * xcopyin_nta(uaddr, kaddr, count, copy_cached) -- 64-bit implementation.
 *
 * In: %rdi = uaddr, %rsi = kaddr, %rdx = count, %rcx = copy_cached.
 *
 * A uaddr at or above kernelbase yields EFAULT.  Otherwise the copy is
 * handed to do_copy_fault unless all of the following hold, in which
 * case do_copy_fault_nta is used instead:
 *	- copy_cached is zero,
 *	- count is at least XCOPY_MIN_SIZE,
 *	- uaddr and kaddr are NTA_ALIGN_SIZE aligned,
 *	- count is COUNT_ALIGN_SIZE aligned.
 *
 * On the error paths the thread's copyops vector is consulted; if it is
 * non-NULL the three saved arguments are reloaded and CP_XCOPYIN is
 * tail-called, otherwise the error value already in %rax is returned.
 * %r8 is used for the copyops pointer so that %rax is left untouched.
 *
 * NOTE(review): %rcx is not restored before the copyop tail call; on the
 * fault paths it no longer holds copy_cached.  Confirm the copyop does
 * not depend on its 4th argument.
 */
1549	ENTRY(xcopyin_nta)
1550	pushq	%rbp
1551	movq	%rsp, %rbp
1552	subq	$24, %rsp
1553
1554	/*
1555	 * save args in case we trap and need to rerun as a copyop
1556	 * %rcx is consumed in this routine so we don't need to save
1557	 * it.
1558	 */
1559	movq	%rdi, (%rsp)
1560	movq	%rsi, 0x8(%rsp)
1561	movq	%rdx, 0x10(%rsp)
1562
1563	movq	kernelbase(%rip), %rax
1564#ifdef DEBUG
1565	cmpq	%rax, %rsi		/* %rsi = kaddr */
1566	jnb	1f
1567	leaq	.xcopyin_panic_msg(%rip), %rdi
1568	xorl	%eax, %eax
1569	call	panic
15701:
1571#endif
1572	movq	%gs:CPU_THREAD, %r9
1573	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1574	jae	4f
1575	cmpq	$0, %rcx		/* No non-temporal access? */
1576	/*
1577	 * pass lofault value as 4th argument to do_copy_fault
1578	 */
1579	leaq	_xcopyin_err(%rip), %rcx	/* doesn't set rflags */
1580	jnz	do_copy_fault		/* use regular access */
1581	/*
1582	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1583	 */
1584	cmpq	$XCOPY_MIN_SIZE, %rdx
1585	jb	do_copy_fault
1586
1587	/*
1588	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1589	 * count is COUNT_ALIGN_SIZE aligned.
1590	 */
1591	movq	%rdi, %r10
1592	orq	%rsi, %r10
1593	andq	$NTA_ALIGN_MASK, %r10
1594	orq	%rdx, %r10
1595	andq	$COUNT_ALIGN_MASK, %r10
1596	jnz	do_copy_fault
1597	leaq	_xcopyin_nta_err(%rip), %rcx	/* doesn't set rflags */
1598	jmp	do_copy_fault_nta	/* use non-temporal access */
1599
16004:
1601	movl	$EFAULT, %eax
1602	jmp	3f
1603
1604	/*
1605	 * A fault during do_copy_fault or do_copy_fault_nta is
1606	 * indicated through an errno value in %rax and we iret from the
1607	 * trap handler to here.
1608	 */
1609_xcopyin_err:
1610	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
1611_xcopyin_nta_err:			/* nta entry point: skips the pop above */
1612	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
16133:
1614	movq	T_COPYOPS(%r9), %r8
1615	cmpq	$0, %r8
1616	jz	2f
1617
1618	/*
1619	 * reload args for the copyop
1620	 */
1621	movq	(%rsp), %rdi
1622	movq	0x8(%rsp), %rsi
1623	movq	0x10(%rsp), %rdx
1624	leave
1625	jmp	*CP_XCOPYIN(%r8)
1626
16272:	leave				/* return the error left in %rax */
1628	ret
1629	SET_SIZE(xcopyin_nta)
1630
1631#elif defined(__i386)
1632
/*
 * xcopyin_nta(uaddr, kaddr, count, copy_cached) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_UADDR, ARG_KADDR, ARG_COUNT and
 * ARG_CACHED, all relative to %esp.  %eax carries the lofault handler
 * (_xcopyin_err) and %edx the thread pointer into do_copy_fault /
 * do_copy_fault_nta.
 *
 * A uaddr at or above kernelbase yields EFAULT.  Otherwise the copy is
 * handed to do_copy_fault unless all of the following hold, in which
 * case do_copy_fault_nta is used instead:
 *	- use_sse_copy is non-zero,
 *	- copy_cached is zero,
 *	- count is at least XCOPY_MIN_SIZE,
 *	- uaddr and kaddr are NTA_ALIGN_SIZE aligned,
 *	- count is COUNT_ALIGN_SIZE aligned.
 *
 * On the error paths the thread's copyops vector is consulted; if it is
 * non-NULL CP_XCOPYIN is tail-called, otherwise the error value already
 * in %eax is returned.
 */
1633#define	ARG_UADDR	4
1634#define	ARG_KADDR	8
1635#define	ARG_COUNT	12
1636#define	ARG_CACHED	16
1637
1638	.globl	use_sse_copy
1639
1640	ENTRY(xcopyin_nta)
1641	movl	kernelbase, %ecx
1642	lea	_xcopyin_err, %eax
1643	movl	%gs:CPU_THREAD, %edx
1644	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1645	jae	4f
1646
1647	cmpl	$0, use_sse_copy	/* no sse support */
1648	jz	do_copy_fault
1649
1650	cmpl	$0, ARG_CACHED(%esp)	/* copy_cached hint set? */
1651	jnz	do_copy_fault
1652
1653	/*
1654	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1655	 */
1656	cmpl	$XCOPY_MIN_SIZE, ARG_COUNT(%esp)
1657	jb	do_copy_fault
1658
1659	/*
1660	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1661	 * count is COUNT_ALIGN_SIZE aligned.
1662	 */
1663	movl	ARG_UADDR(%esp), %ecx
1664	orl	ARG_KADDR(%esp), %ecx
1665	andl	$NTA_ALIGN_MASK, %ecx
1666	orl	ARG_COUNT(%esp), %ecx
1667	andl	$COUNT_ALIGN_MASK, %ecx
1668	jnz	do_copy_fault
1669
1670	jmp	do_copy_fault_nta	/* use non-temporal access */
1671
16724:
1673	movl	$EFAULT, %eax
1674	jmp	3f
1675
1676	/*
1677	 * A fault during do_copy_fault or do_copy_fault_nta is
1678	 * indicated through an errno value in %eax and we iret from the
1679	 * trap handler to here.
1680	 */
1681_xcopyin_err:
1682	popl	%ecx
1683	popl	%edi
1684	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1685	popl	%esi
1686	popl	%ebp
16873:
1688	cmpl	$0, T_COPYOPS(%edx)
1689	jz	2f
1690	movl	T_COPYOPS(%edx), %eax
1691	jmp	*CP_XCOPYIN(%eax)
1692
16932:	rep; 	ret	/* use 2 byte return instruction when branch target */
1694			/* AMD Software Optimization Guide - Section 6.2 */
1695	SET_SIZE(xcopyin_nta)
1696
1697#undef	ARG_UADDR
1698#undef	ARG_KADDR
1699#undef	ARG_COUNT
1700#undef	ARG_CACHED
1701
1702#endif	/* __i386 */
1703#endif	/* __lint */
1704
1705/*
1706 * Copy kernel data to user space.
1707 */
1708
1709#if defined(__lint)
1710
/* C stub compiled only when __lint is defined; it supplies the prototype. */
1711/* ARGSUSED */
1712int
1713copyout(const void *kaddr, void *uaddr, size_t count)
1714{ return (0); }
1715
1716#else	/* __lint */
1717
1718#if defined(__amd64)
1719
/*
 * copyout(kaddr, uaddr, count) -- 64-bit implementation.
 *
 * In: %rdi = kaddr, %rsi = uaddr, %rdx = count.
 *
 * When uaddr is below kernelbase the copy is handed to do_copy_fault
 * with _copyout_err in %rcx as the lofault handler and the thread
 * pointer in %r9.  When uaddr is not below kernelbase, or control
 * arrives at _copyout_err, the thread's copyops vector is consulted: if
 * it is non-NULL the three saved arguments are reloaded and CP_COPYOUT
 * is tail-called, otherwise -1 is returned.  On DEBUG kernels a kaddr
 * below kernelbase leads to panic().
 */
1720	ENTRY(copyout)
1721	pushq	%rbp
1722	movq	%rsp, %rbp
1723	subq	$24, %rsp		/* three 8-byte slots for the args */
1724
1725	/*
1726	 * save args in case we trap and need to rerun as a copyop
1727	 */
1728	movq	%rdi, (%rsp)
1729	movq	%rsi, 0x8(%rsp)
1730	movq	%rdx, 0x10(%rsp)
1731
1732	movq	kernelbase(%rip), %rax
1733#ifdef DEBUG
1734	cmpq	%rax, %rdi		/* %rdi = kaddr */
1735	jnb	1f
1736	leaq	.copyout_panic_msg(%rip), %rdi
1737	xorl	%eax, %eax
1738	call	panic
17391:
1740#endif
1741	/*
1742	 * pass lofault value as 4th argument to do_copy_fault
1743	 */
1744	leaq	_copyout_err(%rip), %rcx
1745
1746	movq	%gs:CPU_THREAD, %r9
1747	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1748	jb	do_copy_fault
1749	jmp	3f			/* not a user address: try copyops */
1750
1751_copyout_err:
1752	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
1753	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
17543:
1755	movq	T_COPYOPS(%r9), %rax
1756	cmpq	$0, %rax
1757	jz	2f
1758
1759	/*
1760	 * reload args for the copyop
1761	 */
1762	movq	(%rsp), %rdi
1763	movq	0x8(%rsp), %rsi
1764	movq	0x10(%rsp), %rdx
1765	leave
1766	jmp	*CP_COPYOUT(%rax)
1767
17682:	movl	$-1, %eax		/* no copyops installed: fail with -1 */
1769	leave
1770	ret
1771	SET_SIZE(copyout)
1772
1773#elif defined(__i386)
1774
/*
 * copyout(kaddr, uaddr, count) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_KADDR(%esp) = kaddr, ARG_UADDR(%esp) =
 * uaddr.  When uaddr is below kernelbase the copy is handed to
 * do_copy_fault with the lofault handler (_copyout_err) in %eax and the
 * thread pointer in %edx.  Otherwise, or once _copyout_err has run, the
 * thread's copyops vector is consulted: jump through CP_COPYOUT if it is
 * non-NULL, else return -1.  On DEBUG kernels a kaddr below kernelbase
 * leads to panic().
 *
 * _copyout_err pops four words: the saved lofault value (into %ecx) and
 * then %edi, %esi and %ebp -- presumably the words pushed by
 * do_copy_fault, which is defined outside this section (TODO confirm).
 */
1775#define	ARG_KADDR	4
1776#define	ARG_UADDR	8
1777
1778	ENTRY(copyout)
1779	movl	kernelbase, %ecx
1780#ifdef DEBUG
1781	cmpl	%ecx, ARG_KADDR(%esp)
1782	jnb	1f
1783	pushl	%ebp
1784	movl	%esp, %ebp
1785	pushl	$.copyout_panic_msg
1786	call	panic
17871:
1788#endif
1789	lea	_copyout_err, %eax
1790	movl	%gs:CPU_THREAD, %edx
1791	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1792	jb	do_copy_fault
1793	jmp	3f
1794
1795_copyout_err:
1796	popl	%ecx
1797	popl	%edi
1798	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1799	popl	%esi
1800	popl	%ebp
18013:
1802	movl	T_COPYOPS(%edx), %eax
1803	cmpl	$0, %eax
1804	jz	2f
1805	jmp	*CP_COPYOUT(%eax)
1806
18072:	movl	$-1, %eax		/* no copyops installed: fail with -1 */
1808	ret
1809	SET_SIZE(copyout)
1810
1811#undef	ARG_UADDR
1812#undef	ARG_KADDR
1813
1814#endif	/* __i386 */
1815#endif	/* __lint */
1816
1817#if defined(__lint)
1818
/* C stub compiled only when __lint is defined; it supplies the prototype. */
1819/* ARGSUSED */
1820int
1821xcopyout_nta(const void *kaddr, void *uaddr, size_t count, int copy_cached)
1822{ return (0); }
1823
1824#else	/* __lint */
1825
1826#if defined(__amd64)
1827
/*
 * xcopyout_nta(kaddr, uaddr, count, copy_cached) -- 64-bit implementation.
 *
 * In: %rdi = kaddr, %rsi = uaddr, %rdx = count, %rcx = copy_cached.
 *
 * A uaddr at or above kernelbase yields EFAULT.  Otherwise the copy is
 * handed to do_copy_fault unless all of the following hold, in which
 * case do_copy_fault_nta is used instead:
 *	- copy_cached is zero,
 *	- count is at least XCOPY_MIN_SIZE,
 *	- kaddr and uaddr are NTA_ALIGN_SIZE aligned,
 *	- count is COUNT_ALIGN_SIZE aligned.
 *
 * On the error paths the thread's copyops vector is consulted; if it is
 * non-NULL the three saved arguments are reloaded and CP_XCOPYOUT is
 * tail-called, otherwise the error value already in %rax is returned.
 * %r8 is used for the copyops pointer so that %rax is left untouched.
 *
 * NOTE(review): %rcx is not restored before the copyop tail call; on the
 * fault paths it no longer holds copy_cached.  Confirm the copyop does
 * not depend on its 4th argument.
 */
1828	ENTRY(xcopyout_nta)
1829	pushq	%rbp
1830	movq	%rsp, %rbp
1831	subq	$24, %rsp
1832
1833	/*
1834	 * save args in case we trap and need to rerun as a copyop
1835	 */
1836	movq	%rdi, (%rsp)
1837	movq	%rsi, 0x8(%rsp)
1838	movq	%rdx, 0x10(%rsp)
1839
1840	movq	kernelbase(%rip), %rax
1841#ifdef DEBUG
1842	cmpq	%rax, %rdi		/* %rdi = kaddr */
1843	jnb	1f
1844	leaq	.xcopyout_panic_msg(%rip), %rdi
1845	xorl	%eax, %eax
1846	call	panic
18471:
1848#endif
1849	movq	%gs:CPU_THREAD, %r9
1850	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1851	jae	4f
1852
1853	cmpq	$0, %rcx		/* No non-temporal access? */
1854	/*
1855	 * pass lofault value as 4th argument to do_copy_fault
1856	 */
1857	leaq	_xcopyout_err(%rip), %rcx	/* leaq leaves rflags alone */
1858	jnz	do_copy_fault		/* copy_cached set: regular access */
1859	/*
1860	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1861	 */
1862	cmpq	$XCOPY_MIN_SIZE, %rdx
1863	jb	do_copy_fault
1864
1865	/*
1866	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1867	 * count is COUNT_ALIGN_SIZE aligned.
1868	 */
1869	movq	%rdi, %r10
1870	orq	%rsi, %r10
1871	andq	$NTA_ALIGN_MASK, %r10
1872	orq	%rdx, %r10
1873	andq	$COUNT_ALIGN_MASK, %r10
1874	jnz	do_copy_fault
1875	leaq	_xcopyout_nta_err(%rip), %rcx
1876	jmp	do_copy_fault_nta	/* use non-temporal access */
1877
18784:
1879	movl	$EFAULT, %eax
1880	jmp	3f
1881
1882	/*
1883	 * A fault during do_copy_fault or do_copy_fault_nta is
1884	 * indicated through an errno value in %rax and we iret from the
1885	 * trap handler to here.
1886	 */
1887_xcopyout_err:
1888	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
1889_xcopyout_nta_err:			/* nta entry point: skips the pop above */
1890	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
18913:
1892	movq	T_COPYOPS(%r9), %r8
1893	cmpq	$0, %r8
1894	jz	2f
1895
1896	/*
1897	 * reload args for the copyop
1898	 */
1899	movq	(%rsp), %rdi
1900	movq	0x8(%rsp), %rsi
1901	movq	0x10(%rsp), %rdx
1902	leave
1903	jmp	*CP_XCOPYOUT(%r8)
1904
19052:	leave				/* return the error left in %rax */
1906	ret
1907	SET_SIZE(xcopyout_nta)
1908
1909#elif defined(__i386)
1910
/*
 * xcopyout_nta(kaddr, uaddr, count, copy_cached) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_KADDR, ARG_UADDR, ARG_COUNT and
 * ARG_CACHED, all relative to %esp.  %eax carries the lofault handler
 * (_xcopyout_err) and %edx the thread pointer into do_copy_fault /
 * do_copy_fault_nta.
 *
 * A uaddr at or above kernelbase yields EFAULT.  Otherwise the copy is
 * handed to do_copy_fault unless all of the following hold, in which
 * case do_copy_fault_nta is used instead:
 *	- use_sse_copy is non-zero,
 *	- copy_cached is zero,
 *	- count is at least XCOPY_MIN_SIZE,
 *	- kaddr and uaddr are NTA_ALIGN_SIZE aligned,
 *	- count is COUNT_ALIGN_SIZE aligned.
 *
 * On the error paths the thread's copyops vector is consulted; if it is
 * non-NULL CP_XCOPYOUT is tail-called, otherwise the error value already
 * in %eax is returned.
 */
1911#define	ARG_KADDR	4
1912#define	ARG_UADDR	8
1913#define	ARG_COUNT	12
1914#define	ARG_CACHED	16
1915
1916	ENTRY(xcopyout_nta)
1917	movl	kernelbase, %ecx
1918	lea	_xcopyout_err, %eax
1919	movl	%gs:CPU_THREAD, %edx
1920	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1921	jae	4f
1922
1923	cmpl	$0, use_sse_copy	/* no sse support */
1924	jz	do_copy_fault
1925
1926	cmpl	$0, ARG_CACHED(%esp)	/* copy_cached hint set? */
1927	jnz	do_copy_fault
1928
1929	/*
1930	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes.  The count lives on the
1931	 * stack; %edx holds the thread pointer at this point and must not
	 * be used for the comparison.
	 */
1932	cmpl	$XCOPY_MIN_SIZE, ARG_COUNT(%esp)
1933	jb	do_copy_fault
1934
1935	/*
1936	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1937	 * count is COUNT_ALIGN_SIZE aligned.
1938	 */
1939	movl	ARG_UADDR(%esp), %ecx
1940	orl	ARG_KADDR(%esp), %ecx
1941	andl	$NTA_ALIGN_MASK, %ecx
1942	orl	ARG_COUNT(%esp), %ecx
1943	andl	$COUNT_ALIGN_MASK, %ecx
1944	jnz	do_copy_fault
1945	jmp	do_copy_fault_nta	/* use non-temporal access */
1946
19474:
1948	movl	$EFAULT, %eax
1949	jmp	3f
1950
1951	/*
1952	 * A fault during do_copy_fault or do_copy_fault_nta is
1953	 * indicated through an errno value in %eax and we iret from the
1954	 * trap handler to here.
1955	 */
1956_xcopyout_err:
1957	/* restore the original lofault */
1958	popl	%ecx
1959	popl	%edi
1960	movl	%ecx, T_LOFAULT(%edx)	/* original lofault */
1961	popl	%esi
1962	popl	%ebp
19633:
1964	cmpl	$0, T_COPYOPS(%edx)
1965	jz	2f
1966	movl	T_COPYOPS(%edx), %eax
1967	jmp	*CP_XCOPYOUT(%eax)
1968
19692:	rep;	ret	/* use 2 byte return instruction when branch target */
1970			/* AMD Software Optimization Guide - Section 6.2 */
1971	SET_SIZE(xcopyout_nta)
1972
1973#undef	ARG_UADDR
1974#undef	ARG_KADDR
1975#undef	ARG_COUNT
1976#undef	ARG_CACHED
1977
1978#endif	/* __i386 */
1979#endif	/* __lint */
1980
1981/*
1982 * Copy a null terminated string from one point to another in
1983 * the kernel address space.
1984 */
1985
1986#if defined(__lint)
1987
/* C stub compiled only when __lint is defined; it supplies the prototype. */
1988/* ARGSUSED */
1989int
1990copystr(const char *from, char *to, size_t maxlength, size_t *lencopied)
1991{ return (0); }
1992
1993#else	/* __lint */
1994
1995#if defined(__amd64)
1996
/*
 * copystr(from, to, maxlength, lencopied) -- 64-bit implementation.
 *
 * In: %rdi = from, %rsi = to, %rdx = maxlength, %rcx = lencopied.
 *
 * Copies bytes from "from" to "to" one at a time until a NUL byte has
 * been copied or maxlength bytes have been copied.  Returns 0 when the
 * NUL was copied and ENAMETOOLONG when maxlength is zero or is exhausted
 * first.  If lencopied is non-NULL, the number of bytes copied (counting
 * the NUL when one was copied) is stored through it.  On DEBUG kernels
 * a "from" or "to" below kernelbase leads to panic().
 *
 * do_copystr is a second entry point, jumped to by copyinstr() and
 * copyoutstr() after they have built their own frame.  It expects the
 * lofault handler to install in %r8; it keeps the thread pointer in %r9
 * and the previous lofault value in %r11 for the whole copy.  copystr()
 * itself passes the current lofault value, so lofault is unchanged.
 */
1997	ENTRY(copystr)
1998	pushq	%rbp
1999	movq	%rsp, %rbp
2000#ifdef DEBUG
2001	movq	kernelbase(%rip), %rax
2002	cmpq	%rax, %rdi		/* %rdi = from */
2003	jb	0f
2004	cmpq	%rax, %rsi		/* %rsi = to */
2005	jnb	1f
20060:	leaq	.copystr_panic_msg(%rip), %rdi
2007	xorl	%eax, %eax
2008	call	panic
20091:
2010#endif
2011	movq	%gs:CPU_THREAD, %r9
2012	movq	T_LOFAULT(%r9), %r8	/* pass current lofault value as */
2013					/* 5th argument to do_copystr */
2014do_copystr:
2015	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
2016	movq    T_LOFAULT(%r9), %r11	/* save the current lofault */
2017	movq	%r8, T_LOFAULT(%r9)	/* new lofault */
2018
2019	movq	%rdx, %r8		/* save maxlength */
2020
2021	cmpq	$0, %rdx		/* %rdx = maxlength */
2022	je	copystr_enametoolong	/* maxlength == 0 */
2023
2024copystr_loop:
2025	decq	%r8			/* %r8 = bytes still allowed */
2026	movb	(%rdi), %al
2027	incq	%rdi
2028	movb	%al, (%rsi)
2029	incq	%rsi
2030	cmpb	$0, %al
2031	je	copystr_null		/* null char */
2032	cmpq	$0, %r8
2033	jne	copystr_loop
2034
2035copystr_enametoolong:
2036	movl	$ENAMETOOLONG, %eax
2037	jmp	copystr_out
2038
2039copystr_null:
2040	xorl	%eax, %eax		/* no error */
2041
2042copystr_out:
2043	cmpq	$0, %rcx		/* want length? */
2044	je	copystr_done		/* no */
2045	subq	%r8, %rdx		/* compute length and store it */
2046	movq	%rdx, (%rcx)
2047
2048copystr_done:
2049	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
2050	leave
2051	ret
2052	SET_SIZE(copystr)
2053
2054#elif defined(__i386)
2055
/*
 * copystr(from, to, maxlength, lencopied) -- 32-bit implementation.
 *
 * The ARG_* offsets are relative to %ebp once the frame built at
 * do_copystr is in place (the DEBUG block uses them relative to %esp
 * right after its own temporary push of %ebp, which is the same place).
 *
 * Copies bytes from "from" to "to" one at a time until a NUL byte has
 * been copied or maxlength bytes have been copied.  Returns 0 when the
 * NUL was copied and ENAMETOOLONG when maxlength is zero or is exhausted
 * first.  If lencopied is non-NULL, the number of bytes copied (counting
 * the NUL when one was copied) is stored through it.  On DEBUG kernels
 * a "from" or "to" below kernelbase leads to panic().
 *
 * do_copystr is a second entry point, jumped to by copyinstr() and
 * copyoutstr().  It expects the lofault handler to install in %eax and
 * pushes, in order, %ebp, %ebx, %edi and the previous lofault value.
 */
2056#define	ARG_FROM	8
2057#define	ARG_TO		12
2058#define	ARG_MAXLEN	16
2059#define	ARG_LENCOPIED	20
2060
2061	ENTRY(copystr)
2062#ifdef DEBUG
2063	pushl	%ebp
2064	movl	%esp, %ebp
2065	movl	kernelbase, %eax
2066	cmpl	%eax, ARG_FROM(%esp)
2067	jb	0f
2068	cmpl	%eax, ARG_TO(%esp)
2069	jnb	1f
20700:	pushl	$.copystr_panic_msg
2071	call	panic
20721:	popl	%ebp
2073#endif
2074	/* get the current lofault address */
2075	movl	%gs:CPU_THREAD, %eax
2076	movl	T_LOFAULT(%eax), %eax
2077do_copystr:
2078	pushl	%ebp			/* setup stack frame */
2079	movl	%esp, %ebp
2080	pushl	%ebx			/* save registers */
2081	pushl	%edi
2082
2083	movl	%gs:CPU_THREAD, %ebx
2084	movl	T_LOFAULT(%ebx), %edi
2085	pushl	%edi			/* save the current lofault */
2086	movl	%eax, T_LOFAULT(%ebx)	/* new lofault */
2087
2088	movl	ARG_MAXLEN(%ebp), %ecx
2089	cmpl	$0, %ecx
2090	je	copystr_enametoolong	/* maxlength == 0 */
2091
2092	movl	ARG_FROM(%ebp), %ebx	/* source address */
2093	movl	ARG_TO(%ebp), %edx	/* destination address */
2094
2095copystr_loop:
2096	decl	%ecx			/* %ecx = bytes still allowed */
2097	movb	(%ebx), %al
2098	incl	%ebx
2099	movb	%al, (%edx)
2100	incl	%edx
2101	cmpb	$0, %al
2102	je	copystr_null		/* null char */
2103	cmpl	$0, %ecx
2104	jne	copystr_loop
2105
2106copystr_enametoolong:
2107	movl	$ENAMETOOLONG, %eax
2108	jmp	copystr_out
2109
2110copystr_null:
2111	xorl	%eax, %eax		/* no error */
2112
2113copystr_out:
2114	cmpl	$0, ARG_LENCOPIED(%ebp)	/* want length? */
2115	je	copystr_done		/* no */
2116	movl	ARG_MAXLEN(%ebp), %edx
2117	subl	%ecx, %edx		/* compute length and store it */
2118	movl	ARG_LENCOPIED(%ebp), %ecx
2119	movl	%edx, (%ecx)
2120
2121copystr_done:
2122	popl	%edi			/* %edi = lofault saved at do_copystr */
2123	movl	%gs:CPU_THREAD, %ebx
2124	movl	%edi, T_LOFAULT(%ebx)	/* restore the original lofault */
2125
2126	popl	%edi
2127	popl	%ebx
2128	popl	%ebp
2129	ret
2130	SET_SIZE(copystr)
2131
2132#undef	ARG_FROM
2133#undef	ARG_TO
2134#undef	ARG_MAXLEN
2135#undef	ARG_LENCOPIED
2136
2137#endif	/* __i386 */
2138#endif	/* __lint */
2139
2140/*
2141 * Copy a null terminated string from the user address space into
2142 * the kernel address space.
2143 */
2144
2145#if defined(__lint)
2146
/* C stub compiled only when __lint is defined; it supplies the prototype. */
2147/* ARGSUSED */
2148int
2149copyinstr(const char *uaddr, char *kaddr, size_t maxlength,
2150    size_t *lencopied)
2151{ return (0); }
2152
2153#else	/* __lint */
2154
2155#if defined(__amd64)
2156
/*
 * copyinstr(uaddr, kaddr, maxlength, lencopied) -- 64-bit implementation.
 *
 * In: %rdi = uaddr, %rsi = kaddr, %rdx = maxlength, %rcx = lencopied.
 *
 * When uaddr is below kernelbase the copy is performed by do_copystr
 * with _copyinstr_error in %r8 as the lofault handler to install.  When
 * uaddr is not below kernelbase, or control arrives at
 * _copyinstr_error, the thread's copyops vector is consulted: if it is
 * non-NULL the four saved arguments are reloaded and CP_COPYINSTR is
 * tail-called, otherwise EFAULT is returned.  On DEBUG kernels a kaddr
 * below kernelbase leads to panic().
 */
2157	ENTRY(copyinstr)
2158	pushq	%rbp
2159	movq	%rsp, %rbp
2160	subq	$32, %rsp		/* four 8-byte slots for the args */
2161
2162	/*
2163	 * save args in case we trap and need to rerun as a copyop
2164	 */
2165	movq	%rdi, (%rsp)
2166	movq	%rsi, 0x8(%rsp)
2167	movq	%rdx, 0x10(%rsp)
2168	movq	%rcx, 0x18(%rsp)
2169
2170	movq	kernelbase(%rip), %rax
2171#ifdef DEBUG
2172	cmpq	%rax, %rsi		/* %rsi = kaddr */
2173	jnb	1f
2174	leaq	.copyinstr_panic_msg(%rip), %rdi
2175	xorl	%eax, %eax
2176	call	panic
21771:
2178#endif
2179	/*
2180	 * pass lofault value as 5th argument to do_copystr
2181	 */
2182	leaq	_copyinstr_error(%rip), %r8
2183
2184	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
2185	jb	do_copystr
2186	movq	%gs:CPU_THREAD, %r9	/* label 3 below expects thread in %r9 */
2187	jmp	3f
2188
2189_copyinstr_error:
2190	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
21913:
2192	movq	T_COPYOPS(%r9), %rax
2193	cmpq	$0, %rax
2194	jz	2f
2195
2196	/*
2197	 * reload args for the copyop
2198	 */
2199	movq	(%rsp), %rdi
2200	movq	0x8(%rsp), %rsi
2201	movq	0x10(%rsp), %rdx
2202	movq	0x18(%rsp), %rcx
2203	leave
2204	jmp	*CP_COPYINSTR(%rax)
2205
22062:	movl	$EFAULT, %eax		/* return EFAULT */
2207	leave
2208	ret
2209	SET_SIZE(copyinstr)
2210
2211#elif defined(__i386)
2212
/*
 * copyinstr(uaddr, kaddr, maxlength, lencopied) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_UADDR(%esp) = uaddr, ARG_KADDR(%esp) =
 * kaddr.  When uaddr is below kernelbase the copy is performed by
 * do_copystr with the lofault handler (_copyinstr_error) in %eax.
 * Otherwise, or once _copyinstr_error has run, the thread's copyops
 * vector is consulted: jump through CP_COPYINSTR if it is non-NULL, else
 * return EFAULT.  On DEBUG kernels a kaddr below kernelbase leads to
 * panic().
 *
 * _copyinstr_error undoes the four pushes made at do_copystr: it pops
 * the saved lofault value and restores it, then pops %edi, %ebx, %ebp.
 */
2213#define	ARG_UADDR	4
2214#define	ARG_KADDR	8
2215
2216	ENTRY(copyinstr)
2217	movl	kernelbase, %ecx
2218#ifdef DEBUG
2219	cmpl	%ecx, ARG_KADDR(%esp)
2220	jnb	1f
2221	pushl	%ebp
2222	movl	%esp, %ebp
2223	pushl	$.copyinstr_panic_msg
2224	call	panic
22251:
2226#endif
2227	lea	_copyinstr_error, %eax
2228	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
2229	jb	do_copystr
2230	movl	%gs:CPU_THREAD, %edx	/* label 3 below expects thread in %edx */
2231	jmp	3f
2232
2233_copyinstr_error:
2234	popl	%edi
2235	movl	%gs:CPU_THREAD, %edx
2236	movl	%edi, T_LOFAULT(%edx)	/* original lofault */
2237
2238	popl	%edi
2239	popl	%ebx
2240	popl	%ebp
22413:
2242	movl	T_COPYOPS(%edx), %eax
2243	cmpl	$0, %eax
2244	jz	2f
2245	jmp	*CP_COPYINSTR(%eax)
2246
22472:	movl	$EFAULT, %eax		/* return EFAULT */
2248	ret
2249	SET_SIZE(copyinstr)
2250
2251#undef	ARG_UADDR
2252#undef	ARG_KADDR
2253
2254#endif	/* __i386 */
2255#endif	/* __lint */
2256
2257/*
2258 * Copy a null terminated string from the kernel
2259 * address space to the user address space.
2260 */
2261
2262#if defined(__lint)
2263
/* C stub compiled only when __lint is defined; it supplies the prototype. */
2264/* ARGSUSED */
2265int
2266copyoutstr(const char *kaddr, char *uaddr, size_t maxlength,
2267    size_t *lencopied)
2268{ return (0); }
2269
2270#else	/* __lint */
2271
2272#if defined(__amd64)
2273
/*
 * copyoutstr(kaddr, uaddr, maxlength, lencopied) -- 64-bit implementation.
 *
 * In: %rdi = kaddr, %rsi = uaddr, %rdx = maxlength, %rcx = lencopied.
 *
 * When uaddr is below kernelbase the copy is performed by do_copystr
 * with _copyoutstr_error in %r8 as the lofault handler to install.  When
 * uaddr is not below kernelbase, or control arrives at
 * _copyoutstr_error, the thread's copyops vector is consulted: if it is
 * non-NULL the four saved arguments are reloaded and CP_COPYOUTSTR is
 * tail-called, otherwise EFAULT is returned.  On DEBUG kernels a kaddr
 * below kernelbase jumps to call_panic with the message in %rdi.
 */
2274	ENTRY(copyoutstr)
2275	pushq	%rbp
2276	movq	%rsp, %rbp
2277	subq	$32, %rsp		/* four 8-byte slots for the args */
2278
2279	/*
2280	 * save args in case we trap and need to rerun as a copyop
2281	 */
2282	movq	%rdi, (%rsp)
2283	movq	%rsi, 0x8(%rsp)
2284	movq	%rdx, 0x10(%rsp)
2285	movq	%rcx, 0x18(%rsp)
2286
2287	movq	kernelbase(%rip), %rax
2288#ifdef DEBUG
2289	cmpq	%rax, %rdi		/* %rdi = kaddr */
2290	jnb	1f
2291	leaq	.copyoutstr_panic_msg(%rip), %rdi
2292	jmp	call_panic		/* setup stack and call panic */
22931:
2294#endif
2295	/*
2296	 * pass lofault value as 5th argument to do_copystr
2297	 */
2298	leaq	_copyoutstr_error(%rip), %r8
2299
2300	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
2301	jb	do_copystr
2302	movq	%gs:CPU_THREAD, %r9	/* label 3 below expects thread in %r9 */
2303	jmp	3f
2304
2305_copyoutstr_error:
2306	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
23073:
2308	movq	T_COPYOPS(%r9), %rax
2309	cmpq	$0, %rax
2310	jz	2f
2311
2312	/*
2313	 * reload args for the copyop
2314	 */
2315	movq	(%rsp), %rdi
2316	movq	0x8(%rsp), %rsi
2317	movq	0x10(%rsp), %rdx
2318	movq	0x18(%rsp), %rcx
2319	leave
2320	jmp	*CP_COPYOUTSTR(%rax)
2321
23222:	movl	$EFAULT, %eax		/* return EFAULT */
2323	leave
2324	ret
2325	SET_SIZE(copyoutstr)
2326
2327#elif defined(__i386)
2328
/*
 * copyoutstr(kaddr, uaddr, maxlength, lencopied) -- 32-bit implementation.
 *
 * Stack arguments at entry: ARG_KADDR(%esp) = kaddr, ARG_UADDR(%esp) =
 * uaddr.  When uaddr is below kernelbase the copy is performed by
 * do_copystr with the lofault handler (_copyoutstr_error) in %eax.
 * Otherwise, or once _copyoutstr_error has run, the thread's copyops
 * vector is consulted: jump through CP_COPYOUTSTR if it is non-NULL,
 * else return EFAULT.  On DEBUG kernels a kaddr below kernelbase leads
 * to panic().
 *
 * _copyoutstr_error undoes the four pushes made at do_copystr: it pops
 * the saved lofault value and restores it, then pops %edi, %ebx, %ebp.
 */
2329#define	ARG_KADDR	4
2330#define	ARG_UADDR	8
2331
2332	ENTRY(copyoutstr)
2333	movl	kernelbase, %ecx
2334#ifdef DEBUG
2335	cmpl	%ecx, ARG_KADDR(%esp)
2336	jnb	1f
2337	pushl	%ebp
2338	movl	%esp, %ebp
2339	pushl	$.copyoutstr_panic_msg
2340	call	panic
23411:
2342#endif
2343	lea	_copyoutstr_error, %eax
2344	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
2345	jb	do_copystr
2346	movl	%gs:CPU_THREAD, %edx	/* label 3 below expects thread in %edx */
2347	jmp	3f
2348
2349_copyoutstr_error:
2350	popl	%edi
2351	movl	%gs:CPU_THREAD, %edx
2352	movl	%edi, T_LOFAULT(%edx)	/* restore the original lofault */
2353
2354	popl	%edi
2355	popl	%ebx
2356	popl	%ebp
23573:
2358	movl	T_COPYOPS(%edx), %eax
2359	cmpl	$0, %eax
2360	jz	2f
2361	jmp	*CP_COPYOUTSTR(%eax)
2362
23632:	movl	$EFAULT, %eax		/* return EFAULT */
2364	ret
2365	SET_SIZE(copyoutstr)
2366
2367#undef	ARG_KADDR
2368#undef	ARG_UADDR
2369
2370#endif	/* __i386 */
2371#endif	/* __lint */
2372
2373/*
2374 * Since all of the fuword() variants are so similar, we have a macro to spit
2375 * them out.  This allows us to create DTrace-unobservable functions easily.
2376 */
2377
2378#if defined(__lint)
2379
/*
 * C stubs for the fuword*() family, compiled only when __lint is
 * defined; they supply the prototypes.  fuword64() is declared only for
 * __amd64.
 */
2380#if defined(__amd64)
2381
2382/* ARGSUSED */
2383int
2384fuword64(const void *addr, uint64_t *dst)
2385{ return (0); }
2386
2387#endif
2388
2389/* ARGSUSED */
2390int
2391fuword32(const void *addr, uint32_t *dst)
2392{ return (0); }
2393
2394/* ARGSUSED */
2395int
2396fuword16(const void *addr, uint16_t *dst)
2397{ return (0); }
2398
2399/* ARGSUSED */
2400int
2401fuword8(const void *addr, uint8_t *dst)
2402{ return (0); }
2403
2404#else	/* __lint */
2405
2406#if defined(__amd64)
2407
2408/*
2409 * (Note that we don't save and reload the arguments here
2410 * because their values are not altered in the copy path)
2411 */
2412
/*
 * FUWORD(NAME, INSTR, REG, COPYOP) -- emit one 64-bit fuword*() routine.
 *
 * In: %rdi = address to read, %rsi = where to store the value.
 *
 * If the address is below kernelbase, _flt_NAME is installed as the
 * thread's lofault handler, the value is loaded with INSTR into REG,
 * lofault is cleared, the value is stored through %rsi and 0 is
 * returned.  If the address is not below kernelbase, or control reaches
 * _flt_NAME (which also clears lofault), the thread's copyops vector is
 * consulted: jump through COPYOP if it is non-NULL, else return -1.
 * Note that lofault is set to 0 afterwards, not to its previous value.
 */
2413#define	FUWORD(NAME, INSTR, REG, COPYOP)	\
2414	ENTRY(NAME)				\
2415	movq	%gs:CPU_THREAD, %r9;		\
2416	cmpq	kernelbase(%rip), %rdi;		\
2417	jae	1f;				\
2418	leaq	_flt_/**/NAME, %rdx;		\
2419	movq	%rdx, T_LOFAULT(%r9);		\
2420	INSTR	(%rdi), REG;			\
2421	movq	$0, T_LOFAULT(%r9);		\
2422	INSTR	REG, (%rsi);			\
2423	xorl	%eax, %eax;			\
2424	ret;					\
2425_flt_/**/NAME:					\
2426	movq	$0, T_LOFAULT(%r9);		\
24271:						\
2428	movq	T_COPYOPS(%r9), %rax;		\
2429	cmpq	$0, %rax;			\
2430	jz	2f;				\
2431	jmp	*COPYOP(%rax);			\
24322:						\
2433	movl	$-1, %eax;			\
2434	ret;					\
2435	SET_SIZE(NAME)
2436
2437	FUWORD(fuword64, movq, %rax, CP_FUWORD64)
2438	FUWORD(fuword32, movl, %eax, CP_FUWORD32)
2439	FUWORD(fuword16, movw, %ax, CP_FUWORD16)
2440	FUWORD(fuword8, movb, %al, CP_FUWORD8)
2441
2442#elif defined(__i386)
2443
/*
 * FUWORD(NAME, INSTR, REG, COPYOP) -- emit one 32-bit fuword*() routine.
 *
 * Stack arguments at entry: 4(%esp) = address to read, 8(%esp) = where
 * to store the value.
 *
 * If the address is below kernelbase, _flt_NAME is installed as the
 * thread's lofault handler, the value is loaded with INSTR into REG,
 * lofault is cleared, the value is stored through the second argument
 * and 0 is returned.  If the address is not below kernelbase, or control
 * reaches _flt_NAME (which also clears lofault), the thread's copyops
 * vector is consulted: jump through COPYOP if it is non-NULL, else
 * return -1.  Note that lofault is set to 0 afterwards, not to its
 * previous value.
 */
2444#define	FUWORD(NAME, INSTR, REG, COPYOP)	\
2445	ENTRY(NAME)				\
2446	movl	%gs:CPU_THREAD, %ecx;		\
2447	movl	kernelbase, %eax;		\
2448	cmpl	%eax, 4(%esp);			\
2449	jae	1f;				\
2450	lea	_flt_/**/NAME, %edx;		\
2451	movl	%edx, T_LOFAULT(%ecx);		\
2452	movl	4(%esp), %eax;			\
2453	movl	8(%esp), %edx;			\
2454	INSTR	(%eax), REG;			\
2455	movl	$0, T_LOFAULT(%ecx);		\
2456	INSTR	REG, (%edx);			\
2457	xorl	%eax, %eax;			\
2458	ret;					\
2459_flt_/**/NAME:					\
2460	movl	$0, T_LOFAULT(%ecx);		\
24611:						\
2462	movl	T_COPYOPS(%ecx), %eax;		\
2463	cmpl	$0, %eax;			\
2464	jz	2f;				\
2465	jmp	*COPYOP(%eax);			\
24662:						\
2467	movl	$-1, %eax;			\
2468	ret;					\
2469	SET_SIZE(NAME)
2470
2471	FUWORD(fuword32, movl, %eax, CP_FUWORD32)
2472	FUWORD(fuword16, movw, %ax, CP_FUWORD16)
2473	FUWORD(fuword8, movb, %al, CP_FUWORD8)
2474
2475#endif	/* __i386 */
2476
2477#undef	FUWORD
2478
2479#endif	/* __lint */
2480
2481/*
2482 * Set user word.
2483 */
2484
2485#if defined(__lint)
2486
/*
 * C stubs for the suword*() family, compiled only when __lint is
 * defined; they supply the prototypes.  suword64() is declared only for
 * __amd64.
 */
2487#if defined(__amd64)
2488
2489/* ARGSUSED */
2490int
2491suword64(void *addr, uint64_t value)
2492{ return (0); }
2493
2494#endif
2495
2496/* ARGSUSED */
2497int
2498suword32(void *addr, uint32_t value)
2499{ return (0); }
2500
2501/* ARGSUSED */
2502int
2503suword16(void *addr, uint16_t value)
2504{ return (0); }
2505
2506/* ARGSUSED */
2507int
2508suword8(void *addr, uint8_t value)
2509{ return (0); }
2510
2511#else	/* lint */
2512
2513#if defined(__amd64)
2514
2515/*
2516 * (Note that we don't save and reload the arguments here
2517 * because their values are not altered in the copy path)
2518 */
2519
/*
 * SUWORD(NAME, INSTR, REG, COPYOP) -- emit one 64-bit suword*() routine.
 *
 * In: %rdi = address to write, %rsi = value (REG is the matching-width
 * view of %rsi).
 *
 * If the address is below kernelbase, _flt_NAME is installed as the
 * thread's lofault handler, REG is stored to (%rdi) with INSTR, lofault
 * is cleared and 0 is returned.  If the address is not below kernelbase,
 * or control reaches _flt_NAME (which also clears lofault), the thread's
 * copyops vector is consulted: jump through COPYOP if it is non-NULL,
 * else return -1.  Note that lofault is set to 0 afterwards, not to its
 * previous value.
 */
2520#define	SUWORD(NAME, INSTR, REG, COPYOP)	\
2521	ENTRY(NAME)				\
2522	movq	%gs:CPU_THREAD, %r9;		\
2523	cmpq	kernelbase(%rip), %rdi;		\
2524	jae	1f;				\
2525	leaq	_flt_/**/NAME, %rdx;		\
2526	movq	%rdx, T_LOFAULT(%r9);		\
2527	INSTR	REG, (%rdi);			\
2528	movq	$0, T_LOFAULT(%r9);		\
2529	xorl	%eax, %eax;			\
2530	ret;					\
2531_flt_/**/NAME:					\
2532	movq	$0, T_LOFAULT(%r9);		\
25331:						\
2534	movq	T_COPYOPS(%r9), %rax;		\
2535	cmpq	$0, %rax;			\
2536	jz	3f;				\
2537	jmp	*COPYOP(%rax);			\
25383:						\
2539	movl	$-1, %eax;			\
2540	ret;					\
2541	SET_SIZE(NAME)
2542
2543	SUWORD(suword64, movq, %rsi, CP_SUWORD64)
2544	SUWORD(suword32, movl, %esi, CP_SUWORD32)
2545	SUWORD(suword16, movw, %si, CP_SUWORD16)
2546	SUWORD(suword8, movb, %sil, CP_SUWORD8)
2547
2548#elif defined(__i386)
2549
/*
 * SUWORD(NAME, INSTR, REG, COPYOP) -- emit one 32-bit suword*() routine.
 *
 * Stack arguments at entry: 4(%esp) = address to write, 8(%esp) = value.
 * The value is loaded into %edx, so REG must be the matching-width view
 * of %edx.
 *
 * If the address is below kernelbase, _flt_NAME is installed as the
 * thread's lofault handler, REG is stored through the address with
 * INSTR, lofault is cleared and 0 is returned.  If the address is not
 * below kernelbase, or control reaches _flt_NAME (which also clears
 * lofault), the thread's copyops vector is consulted: jump to the COPYOP
 * entry (via %ecx) if it is non-NULL, else return -1.  Note that lofault
 * is set to 0 afterwards, not to its previous value.
 */
2550#define	SUWORD(NAME, INSTR, REG, COPYOP)	\
2551	ENTRY(NAME)				\
2552	movl	%gs:CPU_THREAD, %ecx;		\
2553	movl	kernelbase, %eax;		\
2554	cmpl	%eax, 4(%esp);			\
2555	jae	1f;				\
2556	lea	_flt_/**/NAME, %edx;		\
2557	movl	%edx, T_LOFAULT(%ecx);		\
2558	movl	4(%esp), %eax;			\
2559	movl	8(%esp), %edx;			\
2560	INSTR	REG, (%eax);			\
2561	movl	$0, T_LOFAULT(%ecx);		\
2562	xorl	%eax, %eax;			\
2563	ret;					\
2564_flt_/**/NAME:					\
2565	movl	$0, T_LOFAULT(%ecx);		\
25661:						\
2567	movl	T_COPYOPS(%ecx), %eax;		\
2568	cmpl	$0, %eax;			\
2569	jz	3f;				\
2570	movl	COPYOP(%eax), %ecx;		\
2571	jmp	*%ecx;				\
25723:						\
2573	movl	$-1, %eax;			\
2574	ret;					\
2575	SET_SIZE(NAME)
2576
2577	SUWORD(suword32, movl, %edx, CP_SUWORD32)
2578	SUWORD(suword16, movw, %dx, CP_SUWORD16)
2579	SUWORD(suword8, movb, %dl, CP_SUWORD8)
2580
2581#endif	/* __i386 */
2582
2583#undef	SUWORD
2584
2585#endif	/* __lint */
2586
2587#if defined(__lint)
2588
/*
 * C stubs for the fuword*_noerr() family, compiled only when __lint is
 * defined; they supply the prototypes.  fuword64_noerr() is declared
 * only for __amd64.
 */
2589#if defined(__amd64)
2590
2591/*ARGSUSED*/
2592void
2593fuword64_noerr(const void *addr, uint64_t *dst)
2594{}
2595
2596#endif
2597
2598/*ARGSUSED*/
2599void
2600fuword32_noerr(const void *addr, uint32_t *dst)
2601{}
2602
2603/*ARGSUSED*/
2604void
2605fuword8_noerr(const void *addr, uint8_t *dst)
2606{}
2607
2608/*ARGSUSED*/
2609void
2610fuword16_noerr(const void *addr, uint16_t *dst)
2611{}
2612
2613#else   /* __lint */
2614
2615#if defined(__amd64)
2616
/*
 * FUWORD_NOERR(NAME, INSTR, REG) -- emit one 64-bit fuword*_noerr()
 * routine.
 *
 * In: %rdi = address to read, %rsi = where to store the value.
 *
 * An address that is not below kernelbase is replaced by kernelbase
 * itself (cmovnb) before the load.  The routine installs no lofault
 * handler of its own and returns no status.
 */
2617#define	FUWORD_NOERR(NAME, INSTR, REG)		\
2618	ENTRY(NAME)				\
2619	cmpq	kernelbase(%rip), %rdi;		\
2620	cmovnbq	kernelbase(%rip), %rdi;		\
2621	INSTR	(%rdi), REG;			\
2622	INSTR	REG, (%rsi);			\
2623	ret;					\
2624	SET_SIZE(NAME)
2625
2626	FUWORD_NOERR(fuword64_noerr, movq, %rax)
2627	FUWORD_NOERR(fuword32_noerr, movl, %eax)
2628	FUWORD_NOERR(fuword16_noerr, movw, %ax)
2629	FUWORD_NOERR(fuword8_noerr, movb, %al)
2630
2631#elif defined(__i386)
2632
/*
 * FUWORD_NOERR(NAME, INSTR, REG) -- emit one 32-bit fuword*_noerr()
 * routine.
 *
 * Stack arguments at entry: 4(%esp) = address to read, 8(%esp) = where
 * to store the value.
 *
 * An address that is not below kernelbase is replaced by kernelbase
 * itself before the load.  The routine installs no lofault handler of
 * its own and returns no status.
 */
2633#define	FUWORD_NOERR(NAME, INSTR, REG)		\
2634	ENTRY(NAME)				\
2635	movl	4(%esp), %eax;			\
2636	cmpl	kernelbase, %eax;		\
2637	jb	1f;				\
2638	movl	kernelbase, %eax;		\
26391:	movl	8(%esp), %edx;			\
2640	INSTR	(%eax), REG;			\
2641	INSTR	REG, (%edx);			\
2642	ret;					\
2643	SET_SIZE(NAME)
2644
2645	FUWORD_NOERR(fuword32_noerr, movl, %ecx)
2646	FUWORD_NOERR(fuword16_noerr, movw, %cx)
2647	FUWORD_NOERR(fuword8_noerr, movb, %cl)
2648
2649#endif	/* __i386 */
2650
2651#undef	FUWORD_NOERR
2652
2653#endif	/* __lint */
2654
2655#if defined(__lint)
2656
/*
 * Lint-only stubs for the suwordN_noerr() family.  They are compiled only
 * when __lint is defined and have empty bodies; the real routines are
 * generated by the SUWORD_NOERR macro in the non-lint branch.
 * suword64_noerr is provided for amd64 only.
 */
2657#if defined(__amd64)
2658
2659/*ARGSUSED*/
2660void
2661suword64_noerr(void *addr, uint64_t value)
2662{}
2663
2664#endif	/* __amd64 */
2665
2666/*ARGSUSED*/
2667void
2668suword32_noerr(void *addr, uint32_t value)
2669{}
2670
2671/*ARGSUSED*/
2672void
2673suword16_noerr(void *addr, uint16_t value)
2674{}
2675
2676/*ARGSUSED*/
2677void
2678suword8_noerr(void *addr, uint8_t value)
2679{}
2680
2681#else	/* lint */
2682
2683#if defined(__amd64)
2684
/*
 * SUWORD_NOERR(NAME, INSTR, REG) -- amd64 variant
 *
 * Expands to a routine NAME that stores REG at the address in %rdi.  In
 * the instantiations below REG is always %rsi or one of its narrower
 * views, i.e. the second register argument.
 *
 * If %rdi is not below kernelbase (unsigned compare), cmovnbq replaces it
 * with kernelbase before the store, so the write is never issued at an
 * address above kernelbase.
 *
 * The expansion installs no fault handler of its own and has no error
 * return; callers are presumably expected to have arranged fault handling
 * beforehand (e.g. on_fault()) -- confirm against the callers.
 */
2685#define	SUWORD_NOERR(NAME, INSTR, REG)		\
2686	ENTRY(NAME)				\
2687	cmpq	kernelbase(%rip), %rdi;		\
2688	cmovnbq	kernelbase(%rip), %rdi;		\
2689	INSTR	REG, (%rdi);			\
2690	ret;					\
2691	SET_SIZE(NAME)
2692
	/* One routine per store width: 64-, 32-, 16- and 8-bit. */
2693	SUWORD_NOERR(suword64_noerr, movq, %rsi)
2694	SUWORD_NOERR(suword32_noerr, movl, %esi)
2695	SUWORD_NOERR(suword16_noerr, movw, %si)
2696	SUWORD_NOERR(suword8_noerr, movb, %sil)
2697
2698#elif defined(__i386)
2699
/*
 * SUWORD_NOERR(NAME, INSTR, REG) -- i386 variant
 *
 * Expands to a routine NAME that stores a value at the address given in
 * the first stack argument (4(%esp)).  The value comes from the second
 * stack argument (8(%esp)), which is loaded into %edx; REG therefore has
 * to be %edx or one of its sub-registers, as in the instantiations below.
 *
 * The address is loaded into %eax; if it is not below kernelbase
 * (unsigned compare) it is replaced with kernelbase before the store.
 *
 * The expansion installs no fault handler of its own and has no error
 * return; callers are presumably expected to have arranged fault handling
 * beforehand (e.g. on_fault()) -- confirm against the callers.
 */
2700#define	SUWORD_NOERR(NAME, INSTR, REG)		\
2701	ENTRY(NAME)				\
2702	movl	4(%esp), %eax;			\
2703	cmpl	kernelbase, %eax;		\
2704	jb	1f;				\
2705	movl	kernelbase, %eax;		\
27061:						\
2707	movl	8(%esp), %edx;			\
2708	INSTR	REG, (%eax);			\
2709	ret;					\
2710	SET_SIZE(NAME)
2711
	/* One routine per store width: 32-, 16- and 8-bit. */
2712	SUWORD_NOERR(suword32_noerr, movl, %edx)
2713	SUWORD_NOERR(suword16_noerr, movw, %dx)
2714	SUWORD_NOERR(suword8_noerr, movb, %dl)
2715
2716#endif	/* __i386 */
2717
2718#undef	SUWORD_NOERR
2719
2720#endif	/* lint */
2721
2722
2723#if defined(__lint)
2724
/*
 * Lint-only stubs for subyte, fulword, sulword and their _noerr variants.
 * In the non-lint branch these names are not separate routines: they are
 * defined as .weak aliases of suword8, fuword64/fuword32 and
 * suword64/suword32 (and of the matching _noerr routines).
 *
 * NOTE(review): the second parameter of sulword() and sulword_noerr() is
 * named "valuep" although it is a ulong_t passed by value, not a pointer.
 */
2725/*ARGSUSED*/
2726int
2727subyte(void *addr, uchar_t value)
2728{ return (0); }
2729
2730/*ARGSUSED*/
2731void
2732subyte_noerr(void *addr, uchar_t value)
2733{}
2734
2735/*ARGSUSED*/
2736int
2737fulword(const void *addr, ulong_t *valuep)
2738{ return (0); }
2739
2740/*ARGSUSED*/
2741void
2742fulword_noerr(const void *addr, ulong_t *valuep)
2743{}
2744
2745/*ARGSUSED*/
2746int
2747sulword(void *addr, ulong_t valuep)
2748{ return (0); }
2749
2750/*ARGSUSED*/
2751void
2752sulword_noerr(void *addr, ulong_t valuep)
2753{}
2754
2755#else
2756
	/*
	 * Weak aliases: each name below is declared .weak and then set equal
	 * to an existing routine, so no separate code is emitted for it.
	 *
	 * subyte and subyte_noerr map to the 8-bit store routines on both
	 * architectures.
	 */
2757	.weak	subyte
2758	subyte=suword8
2759	.weak	subyte_noerr
2760	subyte_noerr=suword8_noerr
2761
2762#if defined(__amd64)
2763
	/* amd64: the "long word" names map to the 64-bit routines. */
2764	.weak	fulword
2765	fulword=fuword64
2766	.weak	fulword_noerr
2767	fulword_noerr=fuword64_noerr
2768	.weak	sulword
2769	sulword=suword64
2770	.weak	sulword_noerr
2771	sulword_noerr=suword64_noerr
2772
2773#elif defined(__i386)
2774
	/* i386: the "long word" names map to the 32-bit routines. */
2775	.weak	fulword
2776	fulword=fuword32
2777	.weak	fulword_noerr
2778	fulword_noerr=fuword32_noerr
2779	.weak	sulword
2780	sulword=suword32
2781	.weak	sulword_noerr
2782	sulword_noerr=suword32_noerr
2783
2784#endif /* __i386 */
2785
2786#endif /* __lint */
2787
2788#if defined(__lint)
2789
/*
 * Lint-only stubs for copyout_noerr, copyin_noerr, uzero, ucopy and
 * ucopystr.  They are compiled only when __lint is defined and have empty
 * bodies; the real routines are the per-architecture assembly entry points
 * in the non-lint branch.
 */

2790/*
2791 * Copy a block of storage - must not overlap (from + len <= to).
2792 * No fault handler installed (to be called under on_fault())
2793 */
2794
2795/* ARGSUSED */
2796void
2797copyout_noerr(const void *kfrom, void *uto, size_t count)
2798{}
2799
2800/* ARGSUSED */
2801void
2802copyin_noerr(const void *ufrom, void *kto, size_t count)
2803{}
2804
2805/*
2806 * Zero a block of storage in user space
2807 */
2808
2809/* ARGSUSED */
2810void
2811uzero(void *addr, size_t count)
2812{}
2813
2814/*
2815 * copy a block of storage in user space
2816 */
2817
2818/* ARGSUSED */
2819void
2820ucopy(const void *ufrom, void *uto, size_t ulength)
2821{}
2822
2823/*
2824 * copy a string in user space
2825 */
2826
2827/* ARGSUSED */
2828void
2829ucopystr(const char *ufrom, char *uto, size_t umaxlength, size_t *lencopied)
2830{}
2831
2832#else /* __lint */
2833
2834#if defined(__amd64)
2835
	/*
	 * copyin_noerr -- amd64 entry point.
	 *
	 * On entry %rdi = ufrom and %rsi = kto.  Any further argument
	 * registers are not touched here and reach do_copy unchanged.
	 *
	 * DEBUG kernels first check kto: if it is below kernelbase, the
	 * address of .cpyin_ne_pmsg is placed in %rdi and control goes to
	 * call_panic.
	 *
	 * If ufrom is not below kernelbase (unsigned compare) it is replaced
	 * with kernelbase.  Either way the routine ends by jumping to
	 * do_copy, which is defined outside this part of the file; %rax
	 * still holds kernelbase at that point.  No fault handler is
	 * installed here.
	 */
2836	ENTRY(copyin_noerr)
2837	movq	kernelbase(%rip), %rax
2838#ifdef DEBUG
2839	cmpq	%rax, %rsi		/* %rsi = kto */
2840	jae	1f
2841	leaq	.cpyin_ne_pmsg(%rip), %rdi
2842	jmp	call_panic		/* setup stack and call panic */
28431:
2844#endif
2845	cmpq	%rax, %rdi		/* ufrom < kernelbase */
2846	jb	do_copy
2847	movq	%rax, %rdi		/* force fault at kernelbase */
2848	jmp	do_copy
2849	SET_SIZE(copyin_noerr)
2850
	/*
	 * copyout_noerr -- amd64 entry point.
	 *
	 * On entry %rdi = kfrom and %rsi = uto.  Any further argument
	 * registers are not touched here and reach do_copy unchanged.
	 *
	 * DEBUG kernels first check kfrom: if it is below kernelbase, the
	 * address of .cpyout_ne_pmsg is placed in %rdi and control goes to
	 * call_panic.
	 *
	 * If uto is not below kernelbase (unsigned compare) it is replaced
	 * with kernelbase.  Either way the routine ends by jumping to
	 * do_copy, which is defined outside this part of the file; %rax
	 * still holds kernelbase at that point.  No fault handler is
	 * installed here.
	 */
2851	ENTRY(copyout_noerr)
2852	movq	kernelbase(%rip), %rax
2853#ifdef DEBUG
2854	cmpq	%rax, %rdi		/* %rdi = kfrom */
2855	jae	1f
2856	leaq	.cpyout_ne_pmsg(%rip), %rdi
2857	jmp	call_panic		/* setup stack and call panic */
28581:
2859#endif
2860	cmpq	%rax, %rsi		/* uto < kernelbase */
2861	jb	do_copy
2862	movq	%rax, %rsi		/* force fault at kernelbase */
2863	jmp	do_copy
2864	SET_SIZE(copyout_noerr)
2865
	/*
	 * uzero -- amd64 entry point.
	 *
	 * On entry %rdi holds the address argument.  If it is not below
	 * kernelbase (unsigned compare) it is replaced with kernelbase.  The
	 * routine then jumps to do_zero, which is defined outside this part
	 * of the file; other argument registers are passed through
	 * untouched and %rax holds kernelbase.  No fault handler is
	 * installed here.
	 */
2866	ENTRY(uzero)
2867	movq	kernelbase(%rip), %rax
2868	cmpq	%rax, %rdi
2869	jb	do_zero
2870	movq	%rax, %rdi	/* force fault at kernelbase */
2871	jmp	do_zero
2872	SET_SIZE(uzero)
2873
	/*
	 * ucopy -- amd64 entry point.
	 *
	 * On entry %rdi and %rsi hold the two address arguments.  Each one
	 * is compared with kernelbase and, if it is not below it (unsigned
	 * compare), replaced with kernelbase by a conditional move.  The
	 * routine then jumps to do_copy, which is defined outside this part
	 * of the file; other argument registers are passed through
	 * untouched and %rax holds kernelbase.  No fault handler is
	 * installed here.
	 */
2874	ENTRY(ucopy)
2875	movq	kernelbase(%rip), %rax
2876	cmpq	%rax, %rdi
2877	cmovaeq	%rax, %rdi	/* force fault at kernelbase */
2878	cmpq	%rax, %rsi
2879	cmovaeq	%rax, %rsi	/* force fault at kernelbase */
2880	jmp	do_copy
2881	SET_SIZE(ucopy)
2882
	/*
	 * ucopystr -- amd64 entry point.
	 *
	 * On entry %rdi and %rsi hold the two address arguments.  Each one
	 * is compared with kernelbase and, if it is not below it (unsigned
	 * compare), replaced with kernelbase by a conditional move.
	 *
	 * Before jumping to do_copystr (defined outside this part of the
	 * file) the T_LOFAULT value reached through %gs:CPU_THREAD is
	 * loaded into %r8.  This routine only reads T_LOFAULT; it does not
	 * write it.  Other argument registers are passed through untouched.
	 */
2883	ENTRY(ucopystr)
2884	movq	kernelbase(%rip), %rax
2885	cmpq	%rax, %rdi
2886	cmovaeq	%rax, %rdi	/* force fault at kernelbase */
2887	cmpq	%rax, %rsi
2888	cmovaeq	%rax, %rsi	/* force fault at kernelbase */
2889	/* do_copystr expects lofault address in %r8 */
2890	movq	%gs:CPU_THREAD, %r8
2891	movq	T_LOFAULT(%r8), %r8
2892	jmp	do_copystr
2893	SET_SIZE(ucopystr)
2894
2895#elif defined(__i386)
2896
	/*
	 * copyin_noerr -- i386 entry point.
	 *
	 * Arguments are on the stack: 4(%esp) is the first (ufrom in the
	 * lint prototype) and 8(%esp) the second (kto).
	 *
	 * DEBUG kernels first check the second argument: if it is below
	 * kernelbase, the address of .cpyin_ne_pmsg is pushed and panic is
	 * called.
	 *
	 * If the first argument is not below kernelbase (unsigned compare)
	 * its stack slot is overwritten with kernelbase.  Either way the
	 * routine ends by jumping to do_copy, which is defined outside this
	 * part of the file, with the stack frame otherwise unchanged and
	 * %eax holding kernelbase.  No fault handler is installed here.
	 */
2897	ENTRY(copyin_noerr)
2898	movl	kernelbase, %eax
2899#ifdef DEBUG
2900	cmpl	%eax, 8(%esp)
2901	jae	1f
2902	pushl	$.cpyin_ne_pmsg
2903	call	panic
29041:
2905#endif
2906	cmpl	%eax, 4(%esp)
2907	jb	do_copy
2908	movl	%eax, 4(%esp)	/* force fault at kernelbase */
2909	jmp	do_copy
2910	SET_SIZE(copyin_noerr)
2911
	/*
	 * copyout_noerr -- i386 entry point.
	 *
	 * Arguments are on the stack: 4(%esp) is the first (kfrom in the
	 * lint prototype) and 8(%esp) the second (uto).
	 *
	 * DEBUG kernels first check the first argument: if it is below
	 * kernelbase, the address of .cpyout_ne_pmsg is pushed and panic is
	 * called.
	 *
	 * If the second argument is not below kernelbase (unsigned compare)
	 * its stack slot is overwritten with kernelbase.  Either way the
	 * routine ends by jumping to do_copy, which is defined outside this
	 * part of the file, with the stack frame otherwise unchanged and
	 * %eax holding kernelbase.  No fault handler is installed here.
	 */
2912	ENTRY(copyout_noerr)
2913	movl	kernelbase, %eax
2914#ifdef DEBUG
2915	cmpl	%eax, 4(%esp)
2916	jae	1f
2917	pushl	$.cpyout_ne_pmsg
2918	call	panic
29191:
2920#endif
2921	cmpl	%eax, 8(%esp)
2922	jb	do_copy
2923	movl	%eax, 8(%esp)	/* force fault at kernelbase */
2924	jmp	do_copy
2925	SET_SIZE(copyout_noerr)
2926
	/*
	 * uzero -- i386 entry point.
	 *
	 * The address argument is the first stack argument, 4(%esp).  If it
	 * is not below kernelbase (unsigned compare) its stack slot is
	 * overwritten with kernelbase.  The routine then jumps to do_zero,
	 * which is defined outside this part of the file, with the stack
	 * frame otherwise unchanged and %eax holding kernelbase.  No fault
	 * handler is installed here.
	 */
2927	ENTRY(uzero)
2928	movl	kernelbase, %eax
2929	cmpl	%eax, 4(%esp)
2930	jb	do_zero
2931	movl	%eax, 4(%esp)	/* force fault at kernelbase */
2932	jmp	do_zero
2933	SET_SIZE(uzero)
2934
	/*
	 * ucopy -- i386 entry point.
	 *
	 * The two address arguments are the stack slots 4(%esp) and
	 * 8(%esp).  Each is compared with kernelbase and, if it is not below
	 * it (unsigned compare), overwritten in place with kernelbase.  The
	 * routine then jumps to do_copy, which is defined outside this part
	 * of the file, with the stack frame otherwise unchanged and %eax
	 * holding kernelbase.  No fault handler is installed here.
	 */
2935	ENTRY(ucopy)
2936	movl	kernelbase, %eax
2937	cmpl	%eax, 4(%esp)
2938	jb	1f
2939	movl	%eax, 4(%esp)	/* force fault at kernelbase */
29401:
2941	cmpl	%eax, 8(%esp)
2942	jb	do_copy
2943	movl	%eax, 8(%esp)	/* force fault at kernelbase */
2944	jmp	do_copy
2945	SET_SIZE(ucopy)
2946
	/*
	 * ucopystr -- i386 entry point.
	 *
	 * The two address arguments are the stack slots 4(%esp) and
	 * 8(%esp).  Each is compared with kernelbase and, if it is not below
	 * it (unsigned compare), overwritten in place with kernelbase.
	 *
	 * Before jumping to do_copystr (defined outside this part of the
	 * file) the T_LOFAULT value reached through %gs:CPU_THREAD is
	 * loaded into %eax, replacing the kernelbase value held there.  This
	 * routine only reads T_LOFAULT; it does not write it.
	 */
2947	ENTRY(ucopystr)
2948	movl	kernelbase, %eax
2949	cmpl	%eax, 4(%esp)
2950	jb	1f
2951	movl	%eax, 4(%esp)	/* force fault at kernelbase */
29521:
2953	cmpl	%eax, 8(%esp)
2954	jb	2f
2955	movl	%eax, 8(%esp)	/* force fault at kernelbase */
29562:
2957	/* do_copystr expects the lofault address in %eax */
2958	movl	%gs:CPU_THREAD, %eax
2959	movl	T_LOFAULT(%eax), %eax
2960	jmp	do_copystr
2961	SET_SIZE(ucopystr)
2962
2963#endif	/* __i386 */
2964
/*
 * Panic message strings, assembled only in DEBUG builds and placed in the
 * .data section.  Within this part of the file only .cpyin_ne_pmsg and
 * .cpyout_ne_pmsg are referenced (by the DEBUG argument checks in
 * copyin_noerr and copyout_noerr); the remaining labels are presumably
 * used by the DEBUG checks of routines defined earlier in the file.
 */
2965#ifdef DEBUG
2966	.data
2967.kcopy_panic_msg:
2968	.string "kcopy: arguments below kernelbase"
2969.bcopy_panic_msg:
2970	.string "bcopy: arguments below kernelbase"
2971.kzero_panic_msg:
2972        .string "kzero: arguments below kernelbase"
2973.bzero_panic_msg:
2974	.string	"bzero: arguments below kernelbase"
2975.copyin_panic_msg:
2976	.string "copyin: kaddr argument below kernelbase"
2977.xcopyin_panic_msg:
2978	.string	"xcopyin: kaddr argument below kernelbase"
2979.copyout_panic_msg:
2980	.string "copyout: kaddr argument below kernelbase"
2981.xcopyout_panic_msg:
2982	.string	"xcopyout: kaddr argument below kernelbase"
2983.copystr_panic_msg:
2984	.string	"copystr: arguments in user space"
2985.copyinstr_panic_msg:
2986	.string	"copyinstr: kaddr argument not in kernel address space"
2987.copyoutstr_panic_msg:
2988	.string	"copyoutstr: kaddr argument not in kernel address space"
2989.cpyin_ne_pmsg:
2990	.string "copyin_noerr: argument not in kernel address space"
2991.cpyout_ne_pmsg:
2992	.string "copyout_noerr: argument not in kernel address space"
2993#endif	/* DEBUG */
2994
2995#endif	/* __lint */
2996