xref: /titanic_52/usr/src/uts/intel/ia32/ml/copy.s (revision 3afe87ebb25691cb6d158edaa34a6fb9b703a691)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright (c) 2008, Intel Corporation
28 * All rights reserved.
29 */
30
31/*       Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
32/*       Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
33/*         All Rights Reserved						*/
34
35/*       Copyright (c) 1987, 1988 Microsoft Corporation			*/
36/*         All Rights Reserved						*/
37
38#include <sys/errno.h>
39#include <sys/asm_linkage.h>
40
41#if defined(__lint)
42#include <sys/types.h>
43#include <sys/systm.h>
44#else	/* __lint */
45#include "assym.h"
46#endif	/* __lint */
47
48#define	KCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
49#define	XCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
50/*
51 * Non-temporal access (NTA) alignment requirement
52 */
53#define	NTA_ALIGN_SIZE	4	/* Must be at least 4-byte aligned */
54#define	NTA_ALIGN_MASK	_CONST(NTA_ALIGN_SIZE-1)
55#define	COUNT_ALIGN_SIZE	16	/* Must be at least 16-byte aligned */
56#define	COUNT_ALIGN_MASK	_CONST(COUNT_ALIGN_SIZE-1)
57
58/*
59 * The optimal 64-bit bcopy and kcopy for modern x86 processors uses
60 * "rep smovq" for large sizes. Performance data shows that many calls to
61 * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance for
62 * these small sizes unrolled code is used. For medium sizes loops writing
63 * 64-bytes per loop are used. Transition points were determined experimentally.
64 */
65#define BZERO_USE_REP	(1024)
66#define BCOPY_DFLT_REP	(128)
67#define	BCOPY_NHM_REP	(768)
68
69/*
70 * Copy a block of storage, returning an error code if `from' or
71 * `to' takes a kernel pagefault which cannot be resolved.
72 * Returns errno value on pagefault error, 0 if all ok
73 */
74
75#if defined(__lint)
76
/*
 * Stub compiled only when __lint is defined; it always returns 0.
 * The assembly implementations follow in the #else branch.
 */
77/* ARGSUSED */
78int
79kcopy(const void *from, void *to, size_t count)
80{ return (0); }
81
82#else	/* __lint */
83
84	.globl	kernelbase
85	.globl	postbootkernelbase
86
87#if defined(__amd64)
88
/*
 * kcopy(from, to, count) -- amd64.
 * In:  %rdi = from, %rsi = to, %rdx = count.
 * Out: %eax = 0 when the copy completes; if a fault is taken the trap
 *      handler resumes at _kcopy_copyerr with an errno value in %rax.
 * The copy itself is performed by bcopy_altentry.  %r9 (thread pointer)
 * and %r11 (previous t_lofault) must survive that call.
 */
89	ENTRY(kcopy)
90	pushq	%rbp
91	movq	%rsp, %rbp
92#ifdef DEBUG
	/* DEBUG: panic unless both addresses are >= postbootkernelbase */
93	cmpq	postbootkernelbase(%rip), %rdi 		/* %rdi = from */
94	jb	0f			/* from is below the base: panic */
95	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
96	jnb	1f			/* to is at/above the base: ok */
970:	leaq	.kcopy_panic_msg(%rip), %rdi
98	xorl	%eax, %eax		/* panic() is varargs; no SSE args */
99	call	panic
1001:
101#endif
102	/*
103	 * pass lofault value as 4th argument to do_copy_fault
104	 */
105	leaq	_kcopy_copyerr(%rip), %rcx
106	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
107
	/*
	 * Expects %rcx = lofault handler address and %r9 = thread pointer,
	 * with the %rbp frame already established.  kcopy_nta also jumps
	 * here when it falls back to the ordinary copy.
	 */
108do_copy_fault:
109	movq	T_LOFAULT(%r9), %r11	/* save the current lofault */
110	movq	%rcx, T_LOFAULT(%r9)	/* new lofault */
111	call	bcopy_altentry
112	xorl	%eax, %eax		/* return 0 (success) */
113
114	/*
115	 * A fault during do_copy_fault is indicated through an errno value
116	 * in %rax and we iretq from the trap handler to here.
117	 */
118_kcopy_copyerr:
119	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
120	leave				/* reload %rsp from %rbp, pop %rbp */
121	ret
122	SET_SIZE(kcopy)
123
124#elif defined(__i386)
125
/*
 * kcopy(from, to, count) -- i386.
 * Arguments are on the stack; the ARG_* offsets below are relative to %ebp
 * once a frame has been pushed (saved %ebp at 0, return address at 4).
 * Out: %eax = 0 when the copy completes; if a fault is taken the trap
 *      handler resumes at _kcopy_copyerr with an errno value in %eax.
 */
126#define	ARG_FROM	8
127#define	ARG_TO		12
128#define	ARG_COUNT	16
129
130	ENTRY(kcopy)
131#ifdef DEBUG
	/* DEBUG: temporary frame so the ARG_* offsets are usable for checks */
132	pushl	%ebp
133	movl	%esp, %ebp
134	movl	postbootkernelbase, %eax
135	cmpl	%eax, ARG_FROM(%ebp)
136	jb	0f			/* from is below the base: panic */
137	cmpl	%eax, ARG_TO(%ebp)
138	jnb	1f			/* to is at/above the base: ok */
1390:	pushl	$.kcopy_panic_msg
140	call	panic
1411:	popl	%ebp			/* drop the temporary frame */
142#endif
143	lea	_kcopy_copyerr, %eax	/* lofault value */
144	movl	%gs:CPU_THREAD, %edx
145
	/*
	 * Expects %eax = lofault handler address and %edx = thread pointer.
	 * Stack after the pushes below (top first): saved lofault, %edi,
	 * %esi, %ebp.  _kcopy_copyerr pops them in exactly that order.
	 */
146do_copy_fault:
147	pushl	%ebp
148	movl	%esp, %ebp		/* setup stack frame */
149	pushl	%esi
150	pushl	%edi			/* save registers */
151
152	movl	T_LOFAULT(%edx), %edi
153	pushl	%edi			/* save the current lofault */
154	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
155
156	movl	ARG_COUNT(%ebp), %ecx
157	movl	ARG_FROM(%ebp), %esi
158	movl	ARG_TO(%ebp), %edi
159	shrl	$2, %ecx		/* word count */
160	rep
161	  smovl
162	movl	ARG_COUNT(%ebp), %ecx
163	andl	$3, %ecx		/* bytes left over */
164	rep
165	  smovb
166	xorl	%eax, %eax		/* return 0 (success) */
167
168	/*
169	 * A fault during do_copy_fault is indicated through an errno value
170	 * in %eax and we iret from the trap handler to here.
171	 */
172_kcopy_copyerr:
173	popl	%ecx			/* %ecx = lofault saved on entry */
174	popl	%edi
175	movl	%ecx, T_LOFAULT(%edx)	/* restore the original lofault */
176	popl	%esi
177	popl	%ebp
178	ret
179	SET_SIZE(kcopy)
180
181#undef	ARG_FROM
182#undef	ARG_TO
183#undef	ARG_COUNT
184
185#endif	/* __i386 */
186#endif	/* __lint */
187
188#if defined(__lint)
189
190/*
191 * Copy a block of storage.  Similar to kcopy but uses non-temporal
192 * instructions.
193 */
194
/*
 * Stub compiled only when __lint is defined; it always returns 0.
 * The assembly implementations follow in the #else branch.
 */
195/* ARGSUSED */
196int
197kcopy_nta(const void *from, void *to, size_t count, int copy_cached)
198{ return (0); }
199
200#else	/* __lint */
201
202#if defined(__amd64)
203
/*
 * COPY_LOOP_INIT(src, dst, cnt): advance src and dst by cnt bytes so they
 * point at the end of their buffers, then turn cnt into the negated count
 * of 8-byte words.  (src, cnt, 8) and (dst, cnt, 8) then walk forward
 * through the buffers as cnt counts up towards zero.
 */
204#define	COPY_LOOP_INIT(src, dst, cnt)	\
205	addq	cnt, src;			\
206	addq	cnt, dst;			\
207	shrq	$3, cnt;			\
208	neg	cnt
209
/*
 * COPY_LOOP_BODY(src, dst, cnt): prefetch 0x100 bytes ahead of the current
 * source position, move two quadwords with non-temporal stores and advance
 * cnt by two words.  The final addq leaves ZF set when cnt reaches zero,
 * which is what the jnz following each use tests.
 */
210	/* Copy 16 bytes per loop.  Uses %rax and %r8 */
211#define	COPY_LOOP_BODY(src, dst, cnt)	\
212	prefetchnta	0x100(src, cnt, 8);	\
213	movq	(src, cnt, 8), %rax;		\
214	movq	0x8(src, cnt, 8), %r8;		\
215	movnti	%rax, (dst, cnt, 8);		\
216	movnti	%r8, 0x8(dst, cnt, 8);		\
217	addq	$2, cnt
218
/*
 * kcopy_nta(from, to, count, copy_cached) -- amd64.
 * In:  %rdi = from, %rsi = to, %rdx = count, %ecx = copy_cached (C int).
 * Out: %eax = 0 when the copy completes; if a fault is taken the trap
 *      handler resumes at _kcopy_nta_copyerr with an errno value in %rax.
 *
 * The non-temporal loop is used only when copy_cached is zero, count is at
 * least KCOPY_MIN_SIZE, from and to are NTA_ALIGN_SIZE aligned and count is
 * COUNT_ALIGN_SIZE aligned.  Otherwise control transfers to do_copy_fault
 * (in kcopy), with %rcx = lofault handler and %r9 = thread pointer.
 */
219	ENTRY(kcopy_nta)
220	pushq	%rbp
221	movq	%rsp, %rbp
222#ifdef DEBUG
	/* DEBUG: panic unless both addresses are >= postbootkernelbase */
223	cmpq	postbootkernelbase(%rip), %rdi 		/* %rdi = from */
224	jb	0f
225	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
226	jnb	1f
2270:	leaq	.kcopy_panic_msg(%rip), %rdi
228	xorl	%eax, %eax		/* panic() is varargs; no SSE args */
229	call	panic
2301:
231#endif
232
233	movq	%gs:CPU_THREAD, %r9
	/*
	 * copy_cached is a 32-bit int: the calling convention leaves the
	 * upper half of %rcx unspecified, so only %ecx may be examined.
	 */
234	testl	%ecx, %ecx		/* No non-temporal access? */
235	/*
236	 * pass lofault value as 4th argument to do_copy_fault
237	 */
238	leaq	_kcopy_nta_copyerr(%rip), %rcx	/* doesn't set rflags */
239	jnz	do_copy_fault		/* use regular access */
240	/*
241	 * Make sure cnt is >= KCOPY_MIN_SIZE
242	 */
243	cmpq	$KCOPY_MIN_SIZE, %rdx
244	jb	do_copy_fault
245
246	/*
247	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
248	 * count is COUNT_ALIGN_SIZE aligned.
249	 */
250	movq	%rdi, %r10
251	orq	%rsi, %r10
252	andq	$NTA_ALIGN_MASK, %r10	/* low bits of (from | to) */
253	orq	%rdx, %r10
254	andq	$COUNT_ALIGN_MASK, %r10	/* plus low bits of count */
255	jnz	do_copy_fault		/* any bit set: not aligned */
256
	/*
	 * Alternate entry: expects %rdi = src, %rsi = dst, %rdx = count,
	 * %rcx = lofault handler address and an %rbp frame for the final
	 * leave.  %r9 is (re)loaded here.
	 */
257	ALTENTRY(do_copy_fault_nta)
258	movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
259	movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
260	movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
261
262	/*
263	 * COPY_LOOP_BODY uses %rax and %r8
264	 */
265	COPY_LOOP_INIT(%rdi, %rsi, %rdx)
2662:	COPY_LOOP_BODY(%rdi, %rsi, %rdx)
267	jnz	2b			/* until the word index reaches 0 */
268
269	mfence				/* fence after the movnti stores */
270	xorl	%eax, %eax		/* return 0 (success) */
271
272_kcopy_nta_copyerr:
273	movq	%r11, T_LOFAULT(%r9)    /* restore original lofault */
274	leave
275	ret
276	SET_SIZE(do_copy_fault_nta)
277	SET_SIZE(kcopy_nta)
278
279#elif defined(__i386)
280
/*
 * kcopy_nta / do_copy_fault_nta -- i386.
 * The ARG_* offsets are relative to %ebp once a frame has been pushed.
 */
281#define	ARG_FROM	8
282#define	ARG_TO		12
283#define	ARG_COUNT	16
284
/*
 * COPY_LOOP_INIT(src, dst, cnt): point src and dst at the end of their
 * buffers and turn cnt into the negated count of 8-byte units.
 */
285#define	COPY_LOOP_INIT(src, dst, cnt)	\
286	addl	cnt, src;			\
287	addl	cnt, dst;			\
288	shrl	$3, cnt;			\
289	neg	cnt
290
/*
 * COPY_LOOP_BODY(src, dst, cnt): move 16 bytes as four 32-bit loads and
 * non-temporal stores, using %esi as the scratch register, then advance cnt
 * by two units.  ZF from the final addl tells the caller's jnz when to stop.
 */
291#define	COPY_LOOP_BODY(src, dst, cnt)	\
292	prefetchnta	0x100(src, cnt, 8);	\
293	movl	(src, cnt, 8), %esi;		\
294	movnti	%esi, (dst, cnt, 8);		\
295	movl	0x4(src, cnt, 8), %esi;		\
296	movnti	%esi, 0x4(dst, cnt, 8);		\
297	movl	0x8(src, cnt, 8), %esi;		\
298	movnti	%esi, 0x8(dst, cnt, 8);		\
299	movl	0xc(src, cnt, 8), %esi;		\
300	movnti	%esi, 0xc(dst, cnt, 8);		\
301	addl	$2, cnt
302
303	/*
304	 * kcopy_nta is not implemented for 32-bit as no performance
305	 * improvement was shown.  We simply jump directly to kcopy
306	 * and ignore the 4th argument (copy_cached).
307	 */
308	ENTRY(kcopy_nta)
309	jmp	kcopy
310
	/*
	 * The lea below cannot be reached by falling through the jmp above.
	 * Code entering at do_copy_fault_nta must supply the lofault handler
	 * address in %eax itself.
	 */
311	lea	_kcopy_nta_copyerr, %eax	/* lofault value */
312	ALTENTRY(do_copy_fault_nta)
313	pushl	%ebp
314	movl	%esp, %ebp		/* setup stack frame */
315	pushl	%esi
316	pushl	%edi
317
318	movl	%gs:CPU_THREAD, %edx
319	movl	T_LOFAULT(%edx), %edi
320	pushl	%edi			/* save the current lofault */
321	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
322
323	/* COPY_LOOP_BODY needs to use %esi */
324	movl	ARG_COUNT(%ebp), %ecx
325	movl	ARG_FROM(%ebp), %edi	/* %edi = source */
326	movl	ARG_TO(%ebp), %eax	/* %eax = destination */
327	COPY_LOOP_INIT(%edi, %eax, %ecx)
3281:	COPY_LOOP_BODY(%edi, %eax, %ecx)
329	jnz	1b
330	mfence				/* fence after the movnti stores */
331
332	xorl	%eax, %eax		/* return 0 (success) */
333_kcopy_nta_copyerr:
334	popl	%ecx			/* %ecx = lofault saved on entry */
335	popl	%edi
336	movl	%ecx, T_LOFAULT(%edx)	/* restore the original lofault */
337	popl	%esi
338	leave
339	ret
340	SET_SIZE(do_copy_fault_nta)
341	SET_SIZE(kcopy_nta)
342
343#undef	ARG_FROM
344#undef	ARG_TO
345#undef	ARG_COUNT
346
347#endif	/* __i386 */
348#endif	/* __lint */
349
350#if defined(__lint)
351
/*
 * Stub compiled only when __lint is defined; the body is empty.
 * The assembly implementations follow in the #else branch.
 */
352/* ARGSUSED */
353void
354bcopy(const void *from, void *to, size_t count)
355{}
356
357#else	/* __lint */
358
359#if defined(__amd64)
360
/*
 * bcopy(from, to, count) -- amd64 entry and small-size dispatch.
 * In: %rdi = from, %rsi = to, %rdx = count (unsigned byte count).
 * Counts of 80 bytes or more go to bcopy_ck_size.  Smaller counts are
 * handled by jumping through the L(fwdPxQx) table into unrolled code.
 * This sequence uses %rcx and %r10 as scratch.
 */
361	ENTRY(bcopy)
362#ifdef DEBUG
	/* DEBUG: a zero count skips the address checks entirely */
363	orq	%rdx, %rdx		/* %rdx = count */
364	jz	1f
365	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
366	jb	0f
367	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
368	jnb	1f
3690:	leaq	.bcopy_panic_msg(%rip), %rdi
370	jmp	call_panic		/* setup stack and call panic */
3711:
372#endif
373	/*
374	 * bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
375	 * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
376	 * uses these registers in future they must be saved and restored.
377	 */
378	ALTENTRY(bcopy_altentry)
379do_copy:
380#define	L(s) .bcopy/**/s
	/*
	 * count is a size_t, so compare unsigned: a value with the top bit
	 * set must not be treated as "small" and used as a table index.
	 */
381	cmpq	$0x50, %rdx		/* 80 */
382	jae	bcopy_ck_size
383
384	/*
385	 * Performance data shows many callers copy small buffers. So for
386	 * best perf for these sizes unrolled code is used. Store data without
387	 * worrying about alignment.
388	 */
389	leaq	L(fwdPxQx)(%rip), %r10	/* %r10 = table base */
390	addq	%rdx, %rdi		/* point both at end of buffer; */
391	addq	%rdx, %rsi		/* targets use negative offsets */
392	movslq	(%r10,%rdx,4), %rcx	/* table[count], 32-bit offset */
393	leaq	(%rcx,%r10,1), %r10	/* offset is relative to table */
394	jmpq	*%r10
395
/*
 * Dispatch table for copies of 0..79 bytes, indexed by the byte count.
 * Each entry is the 32-bit offset, relative to the table itself, of the
 * unrolled sequence L(PxQy): y is the number of 8-byte moves (count / 8)
 * and x the number of trailing bytes (count % 8).  It is used by the
 * initial dispatch in do_copy and by L(do_remainder).
 */
396	.p2align 4
397L(fwdPxQx):
398	.int       L(P0Q0)-L(fwdPxQx)	/* 0 */
399	.int       L(P1Q0)-L(fwdPxQx)
400	.int       L(P2Q0)-L(fwdPxQx)
401	.int       L(P3Q0)-L(fwdPxQx)
402	.int       L(P4Q0)-L(fwdPxQx)
403	.int       L(P5Q0)-L(fwdPxQx)
404	.int       L(P6Q0)-L(fwdPxQx)
405	.int       L(P7Q0)-L(fwdPxQx)
406
407	.int       L(P0Q1)-L(fwdPxQx)	/* 8 */
408	.int       L(P1Q1)-L(fwdPxQx)
409	.int       L(P2Q1)-L(fwdPxQx)
410	.int       L(P3Q1)-L(fwdPxQx)
411	.int       L(P4Q1)-L(fwdPxQx)
412	.int       L(P5Q1)-L(fwdPxQx)
413	.int       L(P6Q1)-L(fwdPxQx)
414	.int       L(P7Q1)-L(fwdPxQx)
415
416	.int       L(P0Q2)-L(fwdPxQx)	/* 16 */
417	.int       L(P1Q2)-L(fwdPxQx)
418	.int       L(P2Q2)-L(fwdPxQx)
419	.int       L(P3Q2)-L(fwdPxQx)
420	.int       L(P4Q2)-L(fwdPxQx)
421	.int       L(P5Q2)-L(fwdPxQx)
422	.int       L(P6Q2)-L(fwdPxQx)
423	.int       L(P7Q2)-L(fwdPxQx)
424
425	.int       L(P0Q3)-L(fwdPxQx)	/* 24 */
426	.int       L(P1Q3)-L(fwdPxQx)
427	.int       L(P2Q3)-L(fwdPxQx)
428	.int       L(P3Q3)-L(fwdPxQx)
429	.int       L(P4Q3)-L(fwdPxQx)
430	.int       L(P5Q3)-L(fwdPxQx)
431	.int       L(P6Q3)-L(fwdPxQx)
432	.int       L(P7Q3)-L(fwdPxQx)
433
434	.int       L(P0Q4)-L(fwdPxQx)	/* 32 */
435	.int       L(P1Q4)-L(fwdPxQx)
436	.int       L(P2Q4)-L(fwdPxQx)
437	.int       L(P3Q4)-L(fwdPxQx)
438	.int       L(P4Q4)-L(fwdPxQx)
439	.int       L(P5Q4)-L(fwdPxQx)
440	.int       L(P6Q4)-L(fwdPxQx)
441	.int       L(P7Q4)-L(fwdPxQx)
442
443	.int       L(P0Q5)-L(fwdPxQx)	/* 40 */
444	.int       L(P1Q5)-L(fwdPxQx)
445	.int       L(P2Q5)-L(fwdPxQx)
446	.int       L(P3Q5)-L(fwdPxQx)
447	.int       L(P4Q5)-L(fwdPxQx)
448	.int       L(P5Q5)-L(fwdPxQx)
449	.int       L(P6Q5)-L(fwdPxQx)
450	.int       L(P7Q5)-L(fwdPxQx)
451
452	.int       L(P0Q6)-L(fwdPxQx)	/* 48 */
453	.int       L(P1Q6)-L(fwdPxQx)
454	.int       L(P2Q6)-L(fwdPxQx)
455	.int       L(P3Q6)-L(fwdPxQx)
456	.int       L(P4Q6)-L(fwdPxQx)
457	.int       L(P5Q6)-L(fwdPxQx)
458	.int       L(P6Q6)-L(fwdPxQx)
459	.int       L(P7Q6)-L(fwdPxQx)
460
461	.int       L(P0Q7)-L(fwdPxQx)	/* 56 */
462	.int       L(P1Q7)-L(fwdPxQx)
463	.int       L(P2Q7)-L(fwdPxQx)
464	.int       L(P3Q7)-L(fwdPxQx)
465	.int       L(P4Q7)-L(fwdPxQx)
466	.int       L(P5Q7)-L(fwdPxQx)
467	.int       L(P6Q7)-L(fwdPxQx)
468	.int       L(P7Q7)-L(fwdPxQx)
469
470	.int       L(P0Q8)-L(fwdPxQx)	/* 64 */
471	.int       L(P1Q8)-L(fwdPxQx)
472	.int       L(P2Q8)-L(fwdPxQx)
473	.int       L(P3Q8)-L(fwdPxQx)
474	.int       L(P4Q8)-L(fwdPxQx)
475	.int       L(P5Q8)-L(fwdPxQx)
476	.int       L(P6Q8)-L(fwdPxQx)
477	.int       L(P7Q8)-L(fwdPxQx)
478
479	.int       L(P0Q9)-L(fwdPxQx)	/* 72 */
480	.int       L(P1Q9)-L(fwdPxQx)
481	.int       L(P2Q9)-L(fwdPxQx)
482	.int       L(P3Q9)-L(fwdPxQx)
483	.int       L(P4Q9)-L(fwdPxQx)
484	.int       L(P5Q9)-L(fwdPxQx)
485	.int       L(P6Q9)-L(fwdPxQx)
486	.int       L(P7Q9)-L(fwdPxQx)	/* 79 */
487
/*
 * Unrolled copy, no trailing bytes.  On entry %rdi and %rsi point one past
 * the end of the source and destination.  Entering at L(P0Qy) copies the
 * final y quadwords by falling through the labels below, then returns.
 * Scratch registers: %rcx, %r8, %r10.
 */
488	.p2align 4
489L(P0Q9):
490	mov    -0x48(%rdi), %rcx
491	mov    %rcx, -0x48(%rsi)
492L(P0Q8):
493	mov    -0x40(%rdi), %r10
494	mov    %r10, -0x40(%rsi)
495L(P0Q7):
496	mov    -0x38(%rdi), %r8
497	mov    %r8, -0x38(%rsi)
498L(P0Q6):
499	mov    -0x30(%rdi), %rcx
500	mov    %rcx, -0x30(%rsi)
501L(P0Q5):
502	mov    -0x28(%rdi), %r10
503	mov    %r10, -0x28(%rsi)
504L(P0Q4):
505	mov    -0x20(%rdi), %r8
506	mov    %r8, -0x20(%rsi)
507L(P0Q3):
508	mov    -0x18(%rdi), %rcx
509	mov    %rcx, -0x18(%rsi)
510L(P0Q2):
511	mov    -0x10(%rdi), %r10
512	mov    %r10, -0x10(%rsi)
513L(P0Q1):
514	mov    -0x8(%rdi), %r8
515	mov    %r8, -0x8(%rsi)
516L(P0Q0):
517	ret
518
/*
 * Unrolled copy with one trailing byte.  On entry %rdi and %rsi point one
 * past the end of the buffers.  Entering at L(P1Qy) copies y quadwords and
 * then the last byte.
 */
519	.p2align 4
520L(P1Q9):
521	mov    -0x49(%rdi), %r8
522	mov    %r8, -0x49(%rsi)
523L(P1Q8):
524	mov    -0x41(%rdi), %rcx
525	mov    %rcx, -0x41(%rsi)
526L(P1Q7):
527	mov    -0x39(%rdi), %r10
528	mov    %r10, -0x39(%rsi)
529L(P1Q6):
530	mov    -0x31(%rdi), %r8
531	mov    %r8, -0x31(%rsi)
532L(P1Q5):
533	mov    -0x29(%rdi), %rcx
534	mov    %rcx, -0x29(%rsi)
535L(P1Q4):
536	mov    -0x21(%rdi), %r10
537	mov    %r10, -0x21(%rsi)
538L(P1Q3):
539	mov    -0x19(%rdi), %r8
540	mov    %r8, -0x19(%rsi)
541L(P1Q2):
542	mov    -0x11(%rdi), %rcx
543	mov    %rcx, -0x11(%rsi)
544L(P1Q1):
545	mov    -0x9(%rdi), %r10
546	mov    %r10, -0x9(%rsi)
547L(P1Q0):
548	movzbq -0x1(%rdi), %r8		/* last byte */
549	mov    %r8b, -0x1(%rsi)
550	ret
551
/*
 * Unrolled copy with two trailing bytes.  On entry %rdi and %rsi point one
 * past the end of the buffers.  Entering at L(P2Qy) copies y quadwords and
 * then the final 16-bit word.
 */
552	.p2align 4
553L(P2Q9):
554	mov    -0x4a(%rdi), %r8
555	mov    %r8, -0x4a(%rsi)
556L(P2Q8):
557	mov    -0x42(%rdi), %rcx
558	mov    %rcx, -0x42(%rsi)
559L(P2Q7):
560	mov    -0x3a(%rdi), %r10
561	mov    %r10, -0x3a(%rsi)
562L(P2Q6):
563	mov    -0x32(%rdi), %r8
564	mov    %r8, -0x32(%rsi)
565L(P2Q5):
566	mov    -0x2a(%rdi), %rcx
567	mov    %rcx, -0x2a(%rsi)
568L(P2Q4):
569	mov    -0x22(%rdi), %r10
570	mov    %r10, -0x22(%rsi)
571L(P2Q3):
572	mov    -0x1a(%rdi), %r8
573	mov    %r8, -0x1a(%rsi)
574L(P2Q2):
575	mov    -0x12(%rdi), %rcx
576	mov    %rcx, -0x12(%rsi)
577L(P2Q1):
578	mov    -0xa(%rdi), %r10
579	mov    %r10, -0xa(%rsi)
580L(P2Q0):
581	movzwq -0x2(%rdi), %r8		/* last two bytes */
582	mov    %r8w, -0x2(%rsi)
583	ret
584
/*
 * Unrolled copy with three trailing bytes.  On entry %rdi and %rsi point
 * one past the end of the buffers.  Entering at L(P3Qy) copies y quadwords
 * and then the final 3 bytes as a 16-bit word plus a byte.
 */
585	.p2align 4
586L(P3Q9):
587	mov    -0x4b(%rdi), %r8
588	mov    %r8, -0x4b(%rsi)
589L(P3Q8):
590	mov    -0x43(%rdi), %rcx
591	mov    %rcx, -0x43(%rsi)
592L(P3Q7):
593	mov    -0x3b(%rdi), %r10
594	mov    %r10, -0x3b(%rsi)
595L(P3Q6):
596	mov    -0x33(%rdi), %r8
597	mov    %r8, -0x33(%rsi)
598L(P3Q5):
599	mov    -0x2b(%rdi), %rcx
600	mov    %rcx, -0x2b(%rsi)
601L(P3Q4):
602	mov    -0x23(%rdi), %r10
603	mov    %r10, -0x23(%rsi)
604L(P3Q3):
605	mov    -0x1b(%rdi), %r8
606	mov    %r8, -0x1b(%rsi)
607L(P3Q2):
608	mov    -0x13(%rdi), %rcx
609	mov    %rcx, -0x13(%rsi)
610L(P3Q1):
611	mov    -0xb(%rdi), %r10
612	mov    %r10, -0xb(%rsi)
613	/*
614	 * These trailing loads/stores have to do all their loads 1st,
615	 * then do the stores.
616	 */
617L(P3Q0):
618	movzwq -0x3(%rdi), %r8		/* bytes -3..-2 */
619	movzbq -0x1(%rdi), %r10		/* byte -1 */
620	mov    %r8w, -0x3(%rsi)
621	mov    %r10b, -0x1(%rsi)
622	ret
623
/*
 * Unrolled copy with four trailing bytes.  On entry %rdi and %rsi point one
 * past the end of the buffers.  Entering at L(P4Qy) copies y quadwords and
 * then the final 32-bit word.
 */
624	.p2align 4
625L(P4Q9):
626	mov    -0x4c(%rdi), %r8
627	mov    %r8, -0x4c(%rsi)
628L(P4Q8):
629	mov    -0x44(%rdi), %rcx
630	mov    %rcx, -0x44(%rsi)
631L(P4Q7):
632	mov    -0x3c(%rdi), %r10
633	mov    %r10, -0x3c(%rsi)
634L(P4Q6):
635	mov    -0x34(%rdi), %r8
636	mov    %r8, -0x34(%rsi)
637L(P4Q5):
638	mov    -0x2c(%rdi), %rcx
639	mov    %rcx, -0x2c(%rsi)
640L(P4Q4):
641	mov    -0x24(%rdi), %r10
642	mov    %r10, -0x24(%rsi)
643L(P4Q3):
644	mov    -0x1c(%rdi), %r8
645	mov    %r8, -0x1c(%rsi)
646L(P4Q2):
647	mov    -0x14(%rdi), %rcx
648	mov    %rcx, -0x14(%rsi)
649L(P4Q1):
650	mov    -0xc(%rdi), %r10
651	mov    %r10, -0xc(%rsi)
652L(P4Q0):
653	mov    -0x4(%rdi), %r8d		/* last four bytes */
654	mov    %r8d, -0x4(%rsi)
655	ret
656
/*
 * Unrolled copy with five trailing bytes.  On entry %rdi and %rsi point one
 * past the end of the buffers.  Entering at L(P5Qy) copies y quadwords and
 * then the final 5 bytes as a 32-bit word plus a byte (both loads are done
 * before either store).
 */
657	.p2align 4
658L(P5Q9):
659	mov    -0x4d(%rdi), %r8
660	mov    %r8, -0x4d(%rsi)
661L(P5Q8):
662	mov    -0x45(%rdi), %rcx
663	mov    %rcx, -0x45(%rsi)
664L(P5Q7):
665	mov    -0x3d(%rdi), %r10
666	mov    %r10, -0x3d(%rsi)
667L(P5Q6):
668	mov    -0x35(%rdi), %r8
669	mov    %r8, -0x35(%rsi)
670L(P5Q5):
671	mov    -0x2d(%rdi), %rcx
672	mov    %rcx, -0x2d(%rsi)
673L(P5Q4):
674	mov    -0x25(%rdi), %r10
675	mov    %r10, -0x25(%rsi)
676L(P5Q3):
677	mov    -0x1d(%rdi), %r8
678	mov    %r8, -0x1d(%rsi)
679L(P5Q2):
680	mov    -0x15(%rdi), %rcx
681	mov    %rcx, -0x15(%rsi)
682L(P5Q1):
683	mov    -0xd(%rdi), %r10
684	mov    %r10, -0xd(%rsi)
685L(P5Q0):
686	mov    -0x5(%rdi), %r8d		/* bytes -5..-2 */
687	movzbq -0x1(%rdi), %r10		/* byte -1 */
688	mov    %r8d, -0x5(%rsi)
689	mov    %r10b, -0x1(%rsi)
690	ret
691
/*
 * Unrolled copy with six trailing bytes.  On entry %rdi and %rsi point one
 * past the end of the buffers.  Entering at L(P6Qy) copies y quadwords and
 * then the final 6 bytes as a 32-bit word plus a 16-bit word (both loads
 * are done before either store).
 */
692	.p2align 4
693L(P6Q9):
694	mov    -0x4e(%rdi), %r8
695	mov    %r8, -0x4e(%rsi)
696L(P6Q8):
697	mov    -0x46(%rdi), %rcx
698	mov    %rcx, -0x46(%rsi)
699L(P6Q7):
700	mov    -0x3e(%rdi), %r10
701	mov    %r10, -0x3e(%rsi)
702L(P6Q6):
703	mov    -0x36(%rdi), %r8
704	mov    %r8, -0x36(%rsi)
705L(P6Q5):
706	mov    -0x2e(%rdi), %rcx
707	mov    %rcx, -0x2e(%rsi)
708L(P6Q4):
709	mov    -0x26(%rdi), %r10
710	mov    %r10, -0x26(%rsi)
711L(P6Q3):
712	mov    -0x1e(%rdi), %r8
713	mov    %r8, -0x1e(%rsi)
714L(P6Q2):
715	mov    -0x16(%rdi), %rcx
716	mov    %rcx, -0x16(%rsi)
717L(P6Q1):
718	mov    -0xe(%rdi), %r10
719	mov    %r10, -0xe(%rsi)
720L(P6Q0):
721	mov    -0x6(%rdi), %r8d		/* bytes -6..-3 */
722	movzwq -0x2(%rdi), %r10		/* bytes -2..-1 */
723	mov    %r8d, -0x6(%rsi)
724	mov    %r10w, -0x2(%rsi)
725	ret
726
/*
 * Unrolled copy with seven trailing bytes.  On entry %rdi and %rsi point
 * one past the end of the buffers.  Entering at L(P7Qy) copies y quadwords
 * and then the final 7 bytes as a 32-bit word, a 16-bit word and a byte
 * (all three loads are done before any store).
 */
727	.p2align 4
728L(P7Q9):
729	mov    -0x4f(%rdi), %r8
730	mov    %r8, -0x4f(%rsi)
731L(P7Q8):
732	mov    -0x47(%rdi), %rcx
733	mov    %rcx, -0x47(%rsi)
734L(P7Q7):
735	mov    -0x3f(%rdi), %r10
736	mov    %r10, -0x3f(%rsi)
737L(P7Q6):
738	mov    -0x37(%rdi), %r8
739	mov    %r8, -0x37(%rsi)
740L(P7Q5):
741	mov    -0x2f(%rdi), %rcx
742	mov    %rcx, -0x2f(%rsi)
743L(P7Q4):
744	mov    -0x27(%rdi), %r10
745	mov    %r10, -0x27(%rsi)
746L(P7Q3):
747	mov    -0x1f(%rdi), %r8
748	mov    %r8, -0x1f(%rsi)
749L(P7Q2):
750	mov    -0x17(%rdi), %rcx
751	mov    %rcx, -0x17(%rsi)
752L(P7Q1):
753	mov    -0xf(%rdi), %r10
754	mov    %r10, -0xf(%rsi)
755L(P7Q0):
756	mov    -0x7(%rdi), %r8d		/* bytes -7..-4 */
757	movzwq -0x3(%rdi), %r10		/* bytes -3..-2 */
758	movzbq -0x1(%rdi), %rcx		/* byte -1 */
759	mov    %r8d, -0x7(%rsi)
760	mov    %r10w, -0x3(%rsi)
761	mov    %cl, -0x1(%rsi)
762	ret
763
764	/*
765	 * For large sizes rep smovq is fastest.
766	 * Transition point determined experimentally as measured on
767	 * Intel Xeon processors (incl. Nehalem and previous generations) and
768	 * AMD Opteron. The transition value is patched at boot time to avoid
769	 * memory reference hit.
770	 */
771	.globl bcopy_patch_start
772bcopy_patch_start:
	/*
	 * Instruction bracketed by two global labels so that its bytes can be
	 * located and sized from outside this file.  It follows a ret and is
	 * not a branch target within this file.
	 * NOTE(review): presumably boot-time code copies these bytes over the
	 * cmpq at bcopy_ck_size on processors that want the larger threshold
	 * -- confirm against the patching code.
	 */
773	cmpq	$BCOPY_NHM_REP, %rdx
774	.globl bcopy_patch_end
775bcopy_patch_end:
776
/*
 * Medium and large copies (count >= 80 on entry from do_copy).
 * In: %rdi = from, %rsi = to, %rdx = count (unsigned).
 * Counts at or above the threshold go to L(use_rep).  Otherwise the
 * destination is brought to 8-byte alignment and the data is moved 64 bytes
 * per iteration, leaving 0..63 bytes for L(do_remainder).
 * Scratch registers: %rcx, %r8, %r10.
 */
777	.p2align 4
778	.globl bcopy_ck_size
779bcopy_ck_size:
	/*
	 * NOTE(review): an earlier comment in this file says the transition
	 * value is patched at boot; keep this cmpq first at the label and of
	 * the same form as the one at bcopy_patch_start.
	 */
780	cmpq	$BCOPY_DFLT_REP, %rdx
781	jae	L(use_rep)		/* unsigned: count is a size_t */
782
783	/*
784	 * Align to a 8-byte boundary. Avoids penalties from unaligned stores
785	 * as well as from stores spanning cachelines.
786	 */
787	test	$0x7, %rsi
788	jz	L(aligned_loop)
789	test	$0x1, %rsi
790	jz	2f
791	movzbq	(%rdi), %r8		/* copy 1 byte */
792	dec	%rdx
793	inc	%rdi
794	mov	%r8b, (%rsi)
795	inc	%rsi
7962:
797	test	$0x2, %rsi
798	jz	4f
799	movzwq	(%rdi), %r8		/* copy 2 bytes */
800	sub	$0x2, %rdx
801	add	$0x2, %rdi
802	mov	%r8w, (%rsi)
803	add	$0x2, %rsi
8044:
805	test	$0x4, %rsi
806	jz	L(aligned_loop)
807	mov	(%rdi), %r8d		/* copy 4 bytes */
808	sub	$0x4, %rdx
809	add	$0x4, %rdi
810	mov	%r8d, (%rsi)
811	add	$0x4, %rsi
812
813	/*
814	 * Copy 64-bytes per loop
815	 */
816	.p2align 4
817L(aligned_loop):
818	mov	(%rdi), %r8
819	mov	0x8(%rdi), %r10
820	lea	-0x40(%rdx), %rdx	/* count -= 64, flags untouched */
821	mov	%r8, (%rsi)
822	mov	%r10, 0x8(%rsi)
823	mov	0x10(%rdi), %rcx
824	mov	0x18(%rdi), %r8
825	mov	%rcx, 0x10(%rsi)
826	mov	%r8, 0x18(%rsi)
827
	/*
	 * The flags set by this cmp are consumed by the jae at the bottom
	 * of the loop; only mov and lea (which leave flags alone) may sit
	 * in between.
	 */
828	cmp	$0x40, %rdx
829	mov	0x20(%rdi), %r10
830	mov	0x28(%rdi), %rcx
831	mov	%r10, 0x20(%rsi)
832	mov	%rcx, 0x28(%rsi)
833	mov	0x30(%rdi), %r8
834	mov	0x38(%rdi), %r10
835	lea	0x40(%rdi), %rdi
836	mov	%r8, 0x30(%rsi)
837	mov	%r10, 0x38(%rsi)
838	lea	0x40(%rsi), %rsi
839	jae	L(aligned_loop)		/* unsigned: >= 64 bytes remain */
840
841	/*
842	 * Copy remaining bytes (0-63)
843	 */
844L(do_remainder):
	/*
	 * %rdx = bytes still to copy.  Reached by fall-through from
	 * L(aligned_loop) and by jump from L(use_rep) (1..7 bytes).
	 * Same table dispatch as the small-copy path in do_copy.
	 */
845	leaq	L(fwdPxQx)(%rip), %r10	/* %r10 = table base */
846	addq	%rdx, %rdi		/* point both at end of buffer */
847	addq	%rdx, %rsi
848	movslq	(%r10,%rdx,4), %rcx	/* table[remaining] */
849	leaq	(%rcx,%r10,1), %r10	/* offset is relative to table */
850	jmpq	*%r10
851
852	/*
853	 * Use rep smovq. Copy remainder via unrolled code
854	 */
855	.p2align 4
856L(use_rep):
	/*
	 * bcopy receives (from, to) in (%rdi, %rsi) but the string move
	 * wants source in %rsi and destination in %rdi, hence the swap
	 * before and the swap back after.
	 */
857	xchgq	%rdi, %rsi		/* %rsi = source, %rdi = destination */
858	movq	%rdx, %rcx		/* %rcx = count */
859	shrq	$3, %rcx		/* 8-byte word count */
860	rep
861	  smovq
862
863	xchgq	%rsi, %rdi		/* %rdi = src, %rsi = destination */
864	andq	$7, %rdx		/* remainder */
865	jnz	L(do_remainder)		/* 1..7 bytes left over */
866	ret
867#undef	L
868
869#ifdef DEBUG
870	/*
871	 * Setup frame on the run-time stack. The end of the input argument
872	 * area must be aligned on a 16 byte boundary. The stack pointer %rsp,
873	 * always points to the end of the latest allocated stack frame.
874	 * panic(const char *format, ...) is a varargs function. When a
875	 * function taking variable arguments is called, %rax must be set
876	 * to eight times the number of floating point parameters passed
877	 * to the function in SSE registers.
878	 */
	/*
	 * Reached by jmp (not call) from the DEBUG address checks in bcopy,
	 * kzero and bzero, with %rdi already holding the message.  Nothing
	 * follows the call.
	 * NOTE(review): presumably panic() does not return -- confirm.
	 */
879call_panic:
880	pushq	%rbp			/* align stack properly */
881	movq	%rsp, %rbp
882	xorl	%eax, %eax		/* no variable arguments */
883	call	panic			/* %rdi = format string */
884#endif
885	SET_SIZE(bcopy_altentry)
886	SET_SIZE(bcopy)
887
888#elif defined(__i386)
889
/*
 * bcopy(from, to, count) -- i386.
 * No frame is set up on the normal path, so the ARG_* offsets are relative
 * to %esp at entry (return address at 0).  %esi and %edi are preserved by
 * parking them in %eax and %edx for the duration of the copy.
 */
890#define	ARG_FROM	4
891#define	ARG_TO		8
892#define	ARG_COUNT	12
893
894	ENTRY(bcopy)
895#ifdef DEBUG
	/* DEBUG: a zero count skips the address checks entirely */
896	movl	ARG_COUNT(%esp), %eax
897	orl	%eax, %eax
898	jz	1f
899	movl	postbootkernelbase, %eax
900	cmpl	%eax, ARG_FROM(%esp)
901	jb	0f			/* from is below the base: panic */
902	cmpl	%eax, ARG_TO(%esp)
903	jnb	1f			/* to is at/above the base: ok */
9040:	pushl	%ebp
905	movl	%esp, %ebp
906	pushl	$.bcopy_panic_msg
907	call	panic
9081:
909#endif
910do_copy:
911	movl	%esi, %eax		/* save registers */
912	movl	%edi, %edx
913	movl	ARG_COUNT(%esp), %ecx
914	movl	ARG_FROM(%esp), %esi
915	movl	ARG_TO(%esp), %edi
916
917	shrl	$2, %ecx		/* word count */
918	rep
919	  smovl
920	movl	ARG_COUNT(%esp), %ecx
921	andl	$3, %ecx		/* bytes left over */
922	rep
923	  smovb
924	movl	%eax, %esi		/* restore registers */
925	movl	%edx, %edi
926	ret
927	SET_SIZE(bcopy)
928
929#undef	ARG_COUNT
930#undef	ARG_FROM
931#undef	ARG_TO
932
933#endif	/* __i386 */
934#endif	/* __lint */
935
936
937/*
938 * Zero a block of storage, returning an error code if we
939 * take a kernel pagefault which cannot be resolved.
940 * Returns errno value on pagefault error, 0 if all ok
941 */
942
943#if defined(__lint)
944
/*
 * Stub compiled only when __lint is defined; it always returns 0.
 * The assembly implementations follow in the #else branch.
 */
945/* ARGSUSED */
946int
947kzero(void *addr, size_t count)
948{ return (0); }
949
950#else	/* __lint */
951
952#if defined(__amd64)
953
/*
 * kzero(addr, count) -- amd64.
 * In:  %rdi = addr, %rsi = count (passed through to bzero_altentry).
 * Out: %rax as left by bzero_altentry when the zeroing completes; if a
 *      fault is taken the trap handler resumes at _kzeroerr with an errno
 *      value in %rax.
 * %r9 (thread pointer) and %r11 (previous t_lofault) must survive the call
 * to bzero_altentry.  No stack frame is set up here.
 */
954	ENTRY(kzero)
955#ifdef DEBUG
	/* DEBUG: panic if addr is below postbootkernelbase */
956        cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
957        jnb	0f
958        leaq	.kzero_panic_msg(%rip), %rdi
959	jmp	call_panic		/* setup stack and call panic */
9600:
961#endif
962	/*
963	 * pass lofault value as 3rd argument to do_zero_fault
964	 */
965	leaq	_kzeroerr(%rip), %rdx
966
967do_zero_fault:
968	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
969	movq	T_LOFAULT(%r9), %r11	/* save the current lofault */
970	movq	%rdx, T_LOFAULT(%r9)	/* new lofault */
	/*
	 * The return address pushed by this call is _kzeroerr itself (the
	 * next instruction), so the normal return simply falls into the
	 * restore sequence below.  bzero_altentry begins by zeroing %eax.
	 * NOTE(review): the 0 return on success relies on bzero leaving
	 * %rax at zero on every path -- confirm.
	 */
971	call	bzero_altentry
972
973	/*
974	 * A fault during do_zero_fault is indicated through an errno value
975	 * in %rax when we iretq to here.
976	 */
	/*
	 * NOTE(review): unlike kcopy there is no leave to reset %rsp.  If the
	 * trap handler resumes here without unwinding the stack, the return
	 * address pushed by the call above is still on top, so the ret below
	 * would re-enter _kzeroerr once (repeating the restore) before
	 * returning to kzero's caller -- confirm this is the intended path.
	 */
977_kzeroerr:
978	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
979	ret
980	SET_SIZE(kzero)
981
982#elif defined(__i386)
983
/*
 * kzero(addr, count) -- i386.
 * Arguments are on the stack; the ARG_* offsets below are relative to %ebp
 * once a frame has been pushed.
 * Out: %eax = 0 when the zeroing completes (it still holds the zero used as
 *      the store value); if a fault is taken the trap handler resumes at
 *      _kzeroerr with an errno value in %eax.
 */
984#define	ARG_ADDR	8
985#define	ARG_COUNT	12
986
987	ENTRY(kzero)
988#ifdef DEBUG
	/* DEBUG: temporary frame so ARG_ADDR is usable for the check */
989	pushl	%ebp
990	movl	%esp, %ebp
991	movl	postbootkernelbase, %eax
992        cmpl	%eax, ARG_ADDR(%ebp)
993        jnb	0f
994        pushl   $.kzero_panic_msg
995        call    panic
9960:	popl	%ebp
997#endif
998	lea	_kzeroerr, %eax		/* kzeroerr is lofault value */
999
	/*
	 * Expects %eax = lofault handler address.  Stack after the pushes
	 * below (top first): saved lofault, %edi, %ebp.  _kzeroerr pops them
	 * in exactly that order.
	 */
1000do_zero_fault:
1001	pushl	%ebp			/* save stack base */
1002	movl	%esp, %ebp		/* set new stack base */
1003	pushl	%edi			/* save %edi */
1004
1005	mov	%gs:CPU_THREAD, %edx
1006	movl	T_LOFAULT(%edx), %edi
1007	pushl	%edi			/* save the current lofault */
1008	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
1009
1010	movl	ARG_COUNT(%ebp), %ecx	/* get size in bytes */
1011	movl	ARG_ADDR(%ebp), %edi	/* %edi <- address of bytes to clear */
1012	shrl	$2, %ecx		/* Count of double words to zero */
1013	xorl	%eax, %eax		/* sstol val */
1014	rep
1015	  sstol			/* %ecx contains words to clear (%eax=0) */
1016
1017	movl	ARG_COUNT(%ebp), %ecx	/* get size in bytes */
1018	andl	$3, %ecx		/* do mod 4 */
1019	rep
1020	  sstob			/* %ecx contains residual bytes to clear */
1021
1022	/*
1023	 * A fault during do_zero_fault is indicated through an errno value
1024	 * in %eax when we iret to here.
1025	 */
1026_kzeroerr:
1027	popl	%edi			/* %edi = lofault saved on entry */
1028	movl	%edi, T_LOFAULT(%edx)	/* restore the original lofault */
1029	popl	%edi			/* caller's %edi */
1030	popl	%ebp
1031	ret
1032	SET_SIZE(kzero)
1033
1034#undef	ARG_ADDR
1035#undef	ARG_COUNT
1036
1037#endif	/* __i386 */
1038#endif	/* __lint */
1039
1040/*
1041 * Zero a block of storage.
1042 */
1043
1044#if defined(__lint)
1045
/*
 * Stub compiled only when __lint is defined; the body is empty.
 * The assembly implementations follow in the #else branch.
 */
1046/* ARGSUSED */
1047void
1048bzero(void *addr, size_t count)
1049{}
1050
1051#else	/* __lint */
1052
1053#if defined(__amd64)
1054
1055	ENTRY(bzero)
1056#ifdef DEBUG
1057	cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
1058	jnb	0f
1059	leaq	.bzero_panic_msg(%rip), %rdi
1060	jmp	call_panic		/* setup stack and call panic */
10610:
1062#endif
1063	ALTENTRY(bzero_altentry)
1064do_zero:
1065#define	L(s) .bzero/**/s
1066	xorl	%eax, %eax
1067
1068	cmpq	$0x50, %rsi		/* 80 */
1069	jge	L(ck_align)
1070
1071	/*
1072	 * Performance data shows many caller's are zeroing small buffers. So
1073	 * for best perf for these sizes unrolled code is used. Store zeros
1074	 * without worrying about alignment.
1075	 */
1076	leaq	L(setPxQx)(%rip), %r10
1077	addq	%rsi, %rdi
1078	movslq	(%r10,%rsi,4), %rcx
1079	leaq	(%rcx,%r10,1), %r10
1080	jmpq	*%r10
1081
1082	.p2align 4
1083L(setPxQx):
1084	.int       L(P0Q0)-L(setPxQx)	/* 0 */
1085	.int       L(P1Q0)-L(setPxQx)
1086	.int       L(P2Q0)-L(setPxQx)
1087	.int       L(P3Q0)-L(setPxQx)
1088	.int       L(P4Q0)-L(setPxQx)
1089	.int       L(P5Q0)-L(setPxQx)
1090	.int       L(P6Q0)-L(setPxQx)
1091	.int       L(P7Q0)-L(setPxQx)
1092
1093	.int       L(P0Q1)-L(setPxQx)	/* 8 */
1094	.int       L(P1Q1)-L(setPxQx)
1095	.int       L(P2Q1)-L(setPxQx)
1096	.int       L(P3Q1)-L(setPxQx)
1097	.int       L(P4Q1)-L(setPxQx)
1098	.int       L(P5Q1)-L(setPxQx)
1099	.int       L(P6Q1)-L(setPxQx)
1100	.int       L(P7Q1)-L(setPxQx)
1101
1102	.int       L(P0Q2)-L(setPxQx)	/* 16 */
1103	.int       L(P1Q2)-L(setPxQx)
1104	.int       L(P2Q2)-L(setPxQx)
1105	.int       L(P3Q2)-L(setPxQx)
1106	.int       L(P4Q2)-L(setPxQx)
1107	.int       L(P5Q2)-L(setPxQx)
1108	.int       L(P6Q2)-L(setPxQx)
1109	.int       L(P7Q2)-L(setPxQx)
1110
1111	.int       L(P0Q3)-L(setPxQx)	/* 24 */
1112	.int       L(P1Q3)-L(setPxQx)
1113	.int       L(P2Q3)-L(setPxQx)
1114	.int       L(P3Q3)-L(setPxQx)
1115	.int       L(P4Q3)-L(setPxQx)
1116	.int       L(P5Q3)-L(setPxQx)
1117	.int       L(P6Q3)-L(setPxQx)
1118	.int       L(P7Q3)-L(setPxQx)
1119
1120	.int       L(P0Q4)-L(setPxQx)	/* 32 */
1121	.int       L(P1Q4)-L(setPxQx)
1122	.int       L(P2Q4)-L(setPxQx)
1123	.int       L(P3Q4)-L(setPxQx)
1124	.int       L(P4Q4)-L(setPxQx)
1125	.int       L(P5Q4)-L(setPxQx)
1126	.int       L(P6Q4)-L(setPxQx)
1127	.int       L(P7Q4)-L(setPxQx)
1128
1129	.int       L(P0Q5)-L(setPxQx)	/* 40 */
1130	.int       L(P1Q5)-L(setPxQx)
1131	.int       L(P2Q5)-L(setPxQx)
1132	.int       L(P3Q5)-L(setPxQx)
1133	.int       L(P4Q5)-L(setPxQx)
1134	.int       L(P5Q5)-L(setPxQx)
1135	.int       L(P6Q5)-L(setPxQx)
1136	.int       L(P7Q5)-L(setPxQx)
1137
1138	.int       L(P0Q6)-L(setPxQx)	/* 48 */
1139	.int       L(P1Q6)-L(setPxQx)
1140	.int       L(P2Q6)-L(setPxQx)
1141	.int       L(P3Q6)-L(setPxQx)
1142	.int       L(P4Q6)-L(setPxQx)
1143	.int       L(P5Q6)-L(setPxQx)
1144	.int       L(P6Q6)-L(setPxQx)
1145	.int       L(P7Q6)-L(setPxQx)
1146
1147	.int       L(P0Q7)-L(setPxQx)	/* 56 */
1148	.int       L(P1Q7)-L(setPxQx)
1149	.int       L(P2Q7)-L(setPxQx)
1150	.int       L(P3Q7)-L(setPxQx)
1151	.int       L(P4Q7)-L(setPxQx)
1152	.int       L(P5Q7)-L(setPxQx)
1153	.int       L(P6Q7)-L(setPxQx)
1154	.int       L(P7Q7)-L(setPxQx)
1155
1156	.int       L(P0Q8)-L(setPxQx)	/* 64 */
1157	.int       L(P1Q8)-L(setPxQx)
1158	.int       L(P2Q8)-L(setPxQx)
1159	.int       L(P3Q8)-L(setPxQx)
1160	.int       L(P4Q8)-L(setPxQx)
1161	.int       L(P5Q8)-L(setPxQx)
1162	.int       L(P6Q8)-L(setPxQx)
1163	.int       L(P7Q8)-L(setPxQx)
1164
1165	.int       L(P0Q9)-L(setPxQx)	/* 72 */
1166	.int       L(P1Q9)-L(setPxQx)
1167	.int       L(P2Q9)-L(setPxQx)
1168	.int       L(P3Q9)-L(setPxQx)
1169	.int       L(P4Q9)-L(setPxQx)
1170	.int       L(P5Q9)-L(setPxQx)
1171	.int       L(P6Q9)-L(setPxQx)
1172	.int       L(P7Q9)-L(setPxQx)	/* 79 */
1173
1174	.p2align 4
1175L(P0Q9): mov    %rax, -0x48(%rdi)
1176L(P0Q8): mov    %rax, -0x40(%rdi)
1177L(P0Q7): mov    %rax, -0x38(%rdi)
1178L(P0Q6): mov    %rax, -0x30(%rdi)
1179L(P0Q5): mov    %rax, -0x28(%rdi)
1180L(P0Q4): mov    %rax, -0x20(%rdi)
1181L(P0Q3): mov    %rax, -0x18(%rdi)
1182L(P0Q2): mov    %rax, -0x10(%rdi)
1183L(P0Q1): mov    %rax, -0x8(%rdi)
1184L(P0Q0):
1185	 ret
1186
1187	.p2align 4
1188L(P1Q9): mov    %rax, -0x49(%rdi)
1189L(P1Q8): mov    %rax, -0x41(%rdi)
1190L(P1Q7): mov    %rax, -0x39(%rdi)
1191L(P1Q6): mov    %rax, -0x31(%rdi)
1192L(P1Q5): mov    %rax, -0x29(%rdi)
1193L(P1Q4): mov    %rax, -0x21(%rdi)
1194L(P1Q3): mov    %rax, -0x19(%rdi)
1195L(P1Q2): mov    %rax, -0x11(%rdi)
1196L(P1Q1): mov    %rax, -0x9(%rdi)
1197L(P1Q0): mov    %al, -0x1(%rdi)
1198	 ret
1199
1200	.p2align 4
1201L(P2Q9): mov    %rax, -0x4a(%rdi)
1202L(P2Q8): mov    %rax, -0x42(%rdi)
1203L(P2Q7): mov    %rax, -0x3a(%rdi)
1204L(P2Q6): mov    %rax, -0x32(%rdi)
1205L(P2Q5): mov    %rax, -0x2a(%rdi)
1206L(P2Q4): mov    %rax, -0x22(%rdi)
1207L(P2Q3): mov    %rax, -0x1a(%rdi)
1208L(P2Q2): mov    %rax, -0x12(%rdi)
1209L(P2Q1): mov    %rax, -0xa(%rdi)
1210L(P2Q0): mov    %ax, -0x2(%rdi)
1211	 ret
1212
1213	.p2align 4
1214L(P3Q9): mov    %rax, -0x4b(%rdi)
1215L(P3Q8): mov    %rax, -0x43(%rdi)
1216L(P3Q7): mov    %rax, -0x3b(%rdi)
1217L(P3Q6): mov    %rax, -0x33(%rdi)
1218L(P3Q5): mov    %rax, -0x2b(%rdi)
1219L(P3Q4): mov    %rax, -0x23(%rdi)
1220L(P3Q3): mov    %rax, -0x1b(%rdi)
1221L(P3Q2): mov    %rax, -0x13(%rdi)
1222L(P3Q1): mov    %rax, -0xb(%rdi)
1223L(P3Q0): mov    %ax, -0x3(%rdi)
1224	 mov    %al, -0x1(%rdi)
1225	 ret
1226
1227	.p2align 4
1228L(P4Q9): mov    %rax, -0x4c(%rdi)
1229L(P4Q8): mov    %rax, -0x44(%rdi)
1230L(P4Q7): mov    %rax, -0x3c(%rdi)
1231L(P4Q6): mov    %rax, -0x34(%rdi)
1232L(P4Q5): mov    %rax, -0x2c(%rdi)
1233L(P4Q4): mov    %rax, -0x24(%rdi)
1234L(P4Q3): mov    %rax, -0x1c(%rdi)
1235L(P4Q2): mov    %rax, -0x14(%rdi)
1236L(P4Q1): mov    %rax, -0xc(%rdi)
1237L(P4Q0): mov    %eax, -0x4(%rdi)
1238	 ret
1239
1240	.p2align 4
1241L(P5Q9): mov    %rax, -0x4d(%rdi)
1242L(P5Q8): mov    %rax, -0x45(%rdi)
1243L(P5Q7): mov    %rax, -0x3d(%rdi)
1244L(P5Q6): mov    %rax, -0x35(%rdi)
1245L(P5Q5): mov    %rax, -0x2d(%rdi)
1246L(P5Q4): mov    %rax, -0x25(%rdi)
1247L(P5Q3): mov    %rax, -0x1d(%rdi)
1248L(P5Q2): mov    %rax, -0x15(%rdi)
1249L(P5Q1): mov    %rax, -0xd(%rdi)
1250L(P5Q0): mov    %eax, -0x5(%rdi)
1251	 mov    %al, -0x1(%rdi)
1252	 ret
1253
1254	.p2align 4
1255L(P6Q9): mov    %rax, -0x4e(%rdi)
1256L(P6Q8): mov    %rax, -0x46(%rdi)
1257L(P6Q7): mov    %rax, -0x3e(%rdi)
1258L(P6Q6): mov    %rax, -0x36(%rdi)
1259L(P6Q5): mov    %rax, -0x2e(%rdi)
1260L(P6Q4): mov    %rax, -0x26(%rdi)
1261L(P6Q3): mov    %rax, -0x1e(%rdi)
1262L(P6Q2): mov    %rax, -0x16(%rdi)
1263L(P6Q1): mov    %rax, -0xe(%rdi)
1264L(P6Q0): mov    %eax, -0x6(%rdi)
1265	 mov    %ax, -0x2(%rdi)
1266	 ret
1267
1268	.p2align 4
1269L(P7Q9): mov    %rax, -0x4f(%rdi)
1270L(P7Q8): mov    %rax, -0x47(%rdi)
1271L(P7Q7): mov    %rax, -0x3f(%rdi)
1272L(P7Q6): mov    %rax, -0x37(%rdi)
1273L(P7Q5): mov    %rax, -0x2f(%rdi)
1274L(P7Q4): mov    %rax, -0x27(%rdi)
1275L(P7Q3): mov    %rax, -0x1f(%rdi)
1276L(P7Q2): mov    %rax, -0x17(%rdi)
1277L(P7Q1): mov    %rax, -0xf(%rdi)
1278L(P7Q0): mov    %eax, -0x7(%rdi)
1279	 mov    %ax, -0x3(%rdi)
1280	 mov    %al, -0x1(%rdi)
1281	 ret
1282
1283	/*
1284	 * Align to a 16-byte boundary. Avoids penalties from unaligned stores
1285	 * as well as from stores spanning cachelines. Note 16-byte alignment
	 * is better in the case where rep sstoq is used.
1287	 */
1288	.p2align 4
1289L(ck_align):
1290	test	$0xf, %rdi
1291	jz	L(aligned_now)
1292	test	$1, %rdi
1293	jz	2f
1294	mov	%al, (%rdi)
1295	dec	%rsi
1296	lea	1(%rdi),%rdi
12972:
1298	test	$2, %rdi
1299	jz	4f
1300	mov	%ax, (%rdi)
1301	sub	$2, %rsi
1302	lea	2(%rdi),%rdi
13034:
1304	test	$4, %rdi
1305	jz	8f
1306	mov	%eax, (%rdi)
1307	sub	$4, %rsi
1308	lea	4(%rdi),%rdi
13098:
1310	test	$8, %rdi
1311	jz	L(aligned_now)
1312	mov	%rax, (%rdi)
1313	sub	$8, %rsi
1314	lea	8(%rdi),%rdi
1315
1316	/*
1317	 * For large sizes rep sstoq is fastest.
1318	 * Transition point determined experimentally as measured on
1319	 * Intel Xeon processors (incl. Nehalem) and AMD Opteron.
1320	 */
1321L(aligned_now):
1322	cmp	$BZERO_USE_REP, %rsi
1323	jg	L(use_rep)
1324
1325	/*
1326	 * zero 64-bytes per loop
1327	 */
1328	.p2align 4
1329L(bzero_loop):
1330	leaq	-0x40(%rsi), %rsi
1331	cmpq	$0x40, %rsi
1332	movq	%rax, (%rdi)
1333	movq	%rax, 0x8(%rdi)
1334	movq	%rax, 0x10(%rdi)
1335	movq	%rax, 0x18(%rdi)
1336	movq	%rax, 0x20(%rdi)
1337	movq	%rax, 0x28(%rdi)
1338	movq	%rax, 0x30(%rdi)
1339	movq	%rax, 0x38(%rdi)
1340	leaq	0x40(%rdi), %rdi
1341	jge	L(bzero_loop)
1342
1343	/*
1344	 * Clear any remaining bytes..
1345	 */
13469:
1347	leaq	L(setPxQx)(%rip), %r10
1348	addq	%rsi, %rdi
1349	movslq	(%r10,%rsi,4), %rcx
1350	leaq	(%rcx,%r10,1), %r10
1351	jmpq	*%r10
1352
1353	/*
1354	 * Use rep sstoq. Clear any remainder via unrolled code
1355	 */
1356	.p2align 4
1357L(use_rep):
1358	movq	%rsi, %rcx		/* get size in bytes */
1359	shrq	$3, %rcx		/* count of 8-byte words to zero */
1360	rep
1361	  sstoq				/* %rcx = words to clear (%rax=0) */
1362	andq	$7, %rsi		/* remaining bytes */
1363	jnz	9b
1364	ret
1365#undef	L
1366	SET_SIZE(bzero_altentry)
1367	SET_SIZE(bzero)
1368
1369#elif defined(__i386)
1370
1371#define	ARG_ADDR	4
1372#define	ARG_COUNT	8
1373
1374	ENTRY(bzero)
1375#ifdef DEBUG
1376	movl	postbootkernelbase, %eax
1377	cmpl	%eax, ARG_ADDR(%esp)
1378	jnb	0f
1379	pushl	%ebp
1380	movl	%esp, %ebp
1381	pushl	$.bzero_panic_msg
1382	call	panic
13830:
1384#endif
1385do_zero:
1386	movl	%edi, %edx
1387	movl	ARG_COUNT(%esp), %ecx
1388	movl	ARG_ADDR(%esp), %edi
1389	shrl	$2, %ecx
1390	xorl	%eax, %eax
1391	rep
1392	  sstol
1393	movl	ARG_COUNT(%esp), %ecx
1394	andl	$3, %ecx
1395	rep
1396	  sstob
1397	movl	%edx, %edi
1398	ret
1399	SET_SIZE(bzero)
1400
1401#undef	ARG_ADDR
1402#undef	ARG_COUNT
1403
1404#endif	/* __i386 */
1405#endif	/* __lint */
1406
1407/*
1408 * Transfer data to and from user space -
1409 * Note that these routines can cause faults
1410 * It is assumed that the kernel has nothing at
1411 * less than KERNELBASE in the virtual address space.
1412 *
1413 * Note that copyin(9F) and copyout(9F) are part of the
1414 * DDI/DKI which specifies that they return '-1' on "errors."
1415 *
1416 * Sigh.
1417 *
 * So there are two extremely similar routines - xcopyin_nta() and
 * xcopyout_nta() - which return the errno that we've faithfully computed.
1420 * This allows other callers (e.g. uiomove(9F)) to work correctly.
1421 * Given that these are used pretty heavily, we expand the calling
1422 * sequences inline for all flavours (rather than making wrappers).
1423 */
1424
1425/*
1426 * Copy user data to kernel space.
1427 */
1428
1429#if defined(__lint)
1430
1431/* ARGSUSED */
1432int
1433copyin(const void *uaddr, void *kaddr, size_t count)
1434{ return (0); }
1435
1436#else	/* lint */
1437
1438#if defined(__amd64)
1439
1440	ENTRY(copyin)
1441	pushq	%rbp
1442	movq	%rsp, %rbp
1443	subq	$32, %rsp
1444
1445	/*
1446	 * save args in case we trap and need to rerun as a copyop
1447	 */
1448	movq	%rdi, (%rsp)
1449	movq	%rsi, 0x8(%rsp)
1450	movq	%rdx, 0x10(%rsp)
1451
1452	movq	kernelbase(%rip), %rax
1453#ifdef DEBUG
1454	cmpq	%rax, %rsi		/* %rsi = kaddr */
1455	jnb	1f
1456	leaq	.copyin_panic_msg(%rip), %rdi
1457	xorl	%eax, %eax
1458	call	panic
14591:
1460#endif
1461	/*
1462	 * pass lofault value as 4th argument to do_copy_fault
1463	 */
1464	leaq	_copyin_err(%rip), %rcx
1465
1466	movq	%gs:CPU_THREAD, %r9
1467	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1468	jb	do_copy_fault
1469	jmp	3f
1470
1471_copyin_err:
1472	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
14733:
1474	movq	T_COPYOPS(%r9), %rax
1475	cmpq	$0, %rax
1476	jz	2f
1477	/*
1478	 * reload args for the copyop
1479	 */
1480	movq	(%rsp), %rdi
1481	movq	0x8(%rsp), %rsi
1482	movq	0x10(%rsp), %rdx
1483	leave
1484	jmp	*CP_COPYIN(%rax)
1485
14862:	movl	$-1, %eax
1487	leave
1488	ret
1489	SET_SIZE(copyin)
1490
1491#elif defined(__i386)
1492
1493#define	ARG_UADDR	4
1494#define	ARG_KADDR	8
1495
1496	ENTRY(copyin)
1497	movl	kernelbase, %ecx
1498#ifdef DEBUG
1499	cmpl	%ecx, ARG_KADDR(%esp)
1500	jnb	1f
1501	pushl	%ebp
1502	movl	%esp, %ebp
1503	pushl	$.copyin_panic_msg
1504	call	panic
15051:
1506#endif
1507	lea	_copyin_err, %eax
1508
1509	movl	%gs:CPU_THREAD, %edx
1510	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1511	jb	do_copy_fault
1512	jmp	3f
1513
1514_copyin_err:
1515	popl	%ecx
1516	popl	%edi
1517	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1518	popl	%esi
1519	popl	%ebp
15203:
1521	movl	T_COPYOPS(%edx), %eax
1522	cmpl	$0, %eax
1523	jz	2f
1524	jmp	*CP_COPYIN(%eax)
1525
15262:	movl	$-1, %eax
1527	ret
1528	SET_SIZE(copyin)
1529
1530#undef	ARG_UADDR
1531#undef	ARG_KADDR
1532
1533#endif	/* __i386 */
1534#endif	/* __lint */
1535
1536#if defined(__lint)
1537
1538/* ARGSUSED */
1539int
1540xcopyin_nta(const void *uaddr, void *kaddr, size_t count, int copy_cached)
1541{ return (0); }
1542
1543#else	/* __lint */
1544
1545#if defined(__amd64)
1546
1547	ENTRY(xcopyin_nta)
1548	pushq	%rbp
1549	movq	%rsp, %rbp
1550	subq	$32, %rsp
1551
1552	/*
1553	 * save args in case we trap and need to rerun as a copyop
1554	 * %rcx is consumed in this routine so we don't need to save
1555	 * it.
1556	 */
1557	movq	%rdi, (%rsp)
1558	movq	%rsi, 0x8(%rsp)
1559	movq	%rdx, 0x10(%rsp)
1560
1561	movq	kernelbase(%rip), %rax
1562#ifdef DEBUG
1563	cmpq	%rax, %rsi		/* %rsi = kaddr */
1564	jnb	1f
1565	leaq	.xcopyin_panic_msg(%rip), %rdi
1566	xorl	%eax, %eax
1567	call	panic
15681:
1569#endif
1570	movq	%gs:CPU_THREAD, %r9
1571	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1572	jae	4f
1573	cmpq	$0, %rcx		/* No non-temporal access? */
1574	/*
1575	 * pass lofault value as 4th argument to do_copy_fault
1576	 */
1577	leaq	_xcopyin_err(%rip), %rcx	/* doesn't set rflags */
1578	jnz	do_copy_fault		/* use regular access */
1579	/*
1580	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1581	 */
1582	cmpq	$XCOPY_MIN_SIZE, %rdx
1583	jb	do_copy_fault
1584
1585	/*
1586	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1587	 * count is COUNT_ALIGN_SIZE aligned.
1588	 */
1589	movq	%rdi, %r10
1590	orq	%rsi, %r10
1591	andq	$NTA_ALIGN_MASK, %r10
1592	orq	%rdx, %r10
1593	andq	$COUNT_ALIGN_MASK, %r10
1594	jnz	do_copy_fault
1595	jmp	do_copy_fault_nta	/* use non-temporal access */
1596
15974:
1598	movl	$EFAULT, %eax
1599	jmp	3f
1600
1601	/*
1602	 * A fault during do_copy_fault or do_copy_fault_nta is
1603	 * indicated through an errno value in %rax and we iret from the
1604	 * trap handler to here.
1605	 */
1606_xcopyin_err:
1607	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
16083:
1609	movq	T_COPYOPS(%r9), %r8
1610	cmpq	$0, %r8
1611	jz	2f
1612
1613	/*
1614	 * reload args for the copyop
1615	 */
1616	movq	(%rsp), %rdi
1617	movq	0x8(%rsp), %rsi
1618	movq	0x10(%rsp), %rdx
1619	leave
1620	jmp	*CP_XCOPYIN(%r8)
1621
16222:	leave
1623	ret
1624	SET_SIZE(xcopyin_nta)
1625
1626#elif defined(__i386)
1627
1628#define	ARG_UADDR	4
1629#define	ARG_KADDR	8
1630#define	ARG_COUNT	12
1631#define	ARG_CACHED	16
1632
1633	.globl	use_sse_copy
1634
1635	ENTRY(xcopyin_nta)
1636	movl	kernelbase, %ecx
1637	lea	_xcopyin_err, %eax
1638	movl	%gs:CPU_THREAD, %edx
1639	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1640	jae	4f
1641
1642	cmpl	$0, use_sse_copy	/* no sse support */
1643	jz	do_copy_fault
1644
1645	cmpl	$0, ARG_CACHED(%esp)	/* copy_cached hint set? */
1646	jnz	do_copy_fault
1647
1648	/*
1649	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1650	 */
1651	cmpl	$XCOPY_MIN_SIZE, ARG_COUNT(%esp)
1652	jb	do_copy_fault
1653
1654	/*
1655	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1656	 * count is COUNT_ALIGN_SIZE aligned.
1657	 */
1658	movl	ARG_UADDR(%esp), %ecx
1659	orl	ARG_KADDR(%esp), %ecx
1660	andl	$NTA_ALIGN_MASK, %ecx
1661	orl	ARG_COUNT(%esp), %ecx
1662	andl	$COUNT_ALIGN_MASK, %ecx
1663	jnz	do_copy_fault
1664
1665	jmp	do_copy_fault_nta	/* use regular access */
1666
16674:
1668	movl	$EFAULT, %eax
1669	jmp	3f
1670
1671	/*
1672	 * A fault during do_copy_fault or do_copy_fault_nta is
1673	 * indicated through an errno value in %eax and we iret from the
1674	 * trap handler to here.
1675	 */
1676_xcopyin_err:
1677	popl	%ecx
1678	popl	%edi
1679	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1680	popl	%esi
1681	popl	%ebp
16823:
1683	cmpl	$0, T_COPYOPS(%edx)
1684	jz	2f
1685	movl	T_COPYOPS(%edx), %eax
1686	jmp	*CP_XCOPYIN(%eax)
1687
16882:	rep; 	ret	/* use 2 byte return instruction when branch target */
1689			/* AMD Software Optimization Guide - Section 6.2 */
1690	SET_SIZE(xcopyin_nta)
1691
1692#undef	ARG_UADDR
1693#undef	ARG_KADDR
1694#undef	ARG_COUNT
1695#undef	ARG_CACHED
1696
1697#endif	/* __i386 */
1698#endif	/* __lint */
1699
1700/*
1701 * Copy kernel data to user space.
1702 */
1703
1704#if defined(__lint)
1705
1706/* ARGSUSED */
1707int
1708copyout(const void *kaddr, void *uaddr, size_t count)
1709{ return (0); }
1710
1711#else	/* __lint */
1712
1713#if defined(__amd64)
1714
1715	ENTRY(copyout)
1716	pushq	%rbp
1717	movq	%rsp, %rbp
1718	subq	$32, %rsp
1719
1720	/*
1721	 * save args in case we trap and need to rerun as a copyop
1722	 */
1723	movq	%rdi, (%rsp)
1724	movq	%rsi, 0x8(%rsp)
1725	movq	%rdx, 0x10(%rsp)
1726
1727	movq	kernelbase(%rip), %rax
1728#ifdef DEBUG
1729	cmpq	%rax, %rdi		/* %rdi = kaddr */
1730	jnb	1f
1731	leaq	.copyout_panic_msg(%rip), %rdi
1732	xorl	%eax, %eax
1733	call	panic
17341:
1735#endif
1736	/*
1737	 * pass lofault value as 4th argument to do_copy_fault
1738	 */
1739	leaq	_copyout_err(%rip), %rcx
1740
1741	movq	%gs:CPU_THREAD, %r9
1742	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1743	jb	do_copy_fault
1744	jmp	3f
1745
1746_copyout_err:
1747	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
17483:
1749	movq	T_COPYOPS(%r9), %rax
1750	cmpq	$0, %rax
1751	jz	2f
1752
1753	/*
1754	 * reload args for the copyop
1755	 */
1756	movq	(%rsp), %rdi
1757	movq	0x8(%rsp), %rsi
1758	movq	0x10(%rsp), %rdx
1759	leave
1760	jmp	*CP_COPYOUT(%rax)
1761
17622:	movl	$-1, %eax
1763	leave
1764	ret
1765	SET_SIZE(copyout)
1766
1767#elif defined(__i386)
1768
1769#define	ARG_KADDR	4
1770#define	ARG_UADDR	8
1771
1772	ENTRY(copyout)
1773	movl	kernelbase, %ecx
1774#ifdef DEBUG
1775	cmpl	%ecx, ARG_KADDR(%esp)
1776	jnb	1f
1777	pushl	%ebp
1778	movl	%esp, %ebp
1779	pushl	$.copyout_panic_msg
1780	call	panic
17811:
1782#endif
1783	lea	_copyout_err, %eax
1784	movl	%gs:CPU_THREAD, %edx
1785	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1786	jb	do_copy_fault
1787	jmp	3f
1788
1789_copyout_err:
1790	popl	%ecx
1791	popl	%edi
1792	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1793	popl	%esi
1794	popl	%ebp
17953:
1796	movl	T_COPYOPS(%edx), %eax
1797	cmpl	$0, %eax
1798	jz	2f
1799	jmp	*CP_COPYOUT(%eax)
1800
18012:	movl	$-1, %eax
1802	ret
1803	SET_SIZE(copyout)
1804
1805#undef	ARG_UADDR
1806#undef	ARG_KADDR
1807
1808#endif	/* __i386 */
1809#endif	/* __lint */
1810
1811#if defined(__lint)
1812
1813/* ARGSUSED */
1814int
1815xcopyout_nta(const void *kaddr, void *uaddr, size_t count, int copy_cached)
1816{ return (0); }
1817
1818#else	/* __lint */
1819
1820#if defined(__amd64)
1821
1822	ENTRY(xcopyout_nta)
1823	pushq	%rbp
1824	movq	%rsp, %rbp
1825	subq	$32, %rsp
1826
1827	/*
1828	 * save args in case we trap and need to rerun as a copyop
1829	 */
1830	movq	%rdi, (%rsp)
1831	movq	%rsi, 0x8(%rsp)
1832	movq	%rdx, 0x10(%rsp)
1833
1834	movq	kernelbase(%rip), %rax
1835#ifdef DEBUG
1836	cmpq	%rax, %rdi		/* %rdi = kaddr */
1837	jnb	1f
1838	leaq	.xcopyout_panic_msg(%rip), %rdi
1839	xorl	%eax, %eax
1840	call	panic
18411:
1842#endif
1843	movq	%gs:CPU_THREAD, %r9
1844	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1845	jae	4f
1846
1847	cmpq	$0, %rcx		/* No non-temporal access? */
1848	/*
1849	 * pass lofault value as 4th argument to do_copy_fault
1850	 */
1851	leaq	_xcopyout_err(%rip), %rcx
1852	jnz	do_copy_fault
1853	/*
1854	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1855	 */
1856	cmpq	$XCOPY_MIN_SIZE, %rdx
1857	jb	do_copy_fault
1858
1859	/*
1860	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1861	 * count is COUNT_ALIGN_SIZE aligned.
1862	 */
1863	movq	%rdi, %r10
1864	orq	%rsi, %r10
1865	andq	$NTA_ALIGN_MASK, %r10
1866	orq	%rdx, %r10
1867	andq	$COUNT_ALIGN_MASK, %r10
1868	jnz	do_copy_fault
1869	jmp	do_copy_fault_nta
1870
18714:
1872	movl	$EFAULT, %eax
1873	jmp	3f
1874
1875	/*
1876	 * A fault during do_copy_fault or do_copy_fault_nta is
1877	 * indicated through an errno value in %rax and we iret from the
1878	 * trap handler to here.
1879	 */
1880_xcopyout_err:
1881	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
18823:
1883	movq	T_COPYOPS(%r9), %r8
1884	cmpq	$0, %r8
1885	jz	2f
1886
1887	/*
1888	 * reload args for the copyop
1889	 */
1890	movq	(%rsp), %rdi
1891	movq	0x8(%rsp), %rsi
1892	movq	0x10(%rsp), %rdx
1893	leave
1894	jmp	*CP_XCOPYOUT(%r8)
1895
18962:	leave
1897	ret
1898	SET_SIZE(xcopyout_nta)
1899
1900#elif defined(__i386)
1901
1902#define	ARG_KADDR	4
1903#define	ARG_UADDR	8
1904#define	ARG_COUNT	12
1905#define	ARG_CACHED	16
1906
1907	ENTRY(xcopyout_nta)
1908	movl	kernelbase, %ecx
1909	lea	_xcopyout_err, %eax
1910	movl	%gs:CPU_THREAD, %edx
1911	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1912	jae	4f
1913
1914	cmpl	$0, use_sse_copy	/* no sse support */
1915	jz	do_copy_fault
1916
1917	cmpl	$0, ARG_CACHED(%esp)	/* copy_cached hint set? */
1918	jnz	do_copy_fault
1919
1920	/*
1921	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1922	 */
1923	cmpl	$XCOPY_MIN_SIZE, %edx
1924	jb	do_copy_fault
1925
1926	/*
1927	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1928	 * count is COUNT_ALIGN_SIZE aligned.
1929	 */
1930	movl	ARG_UADDR(%esp), %ecx
1931	orl	ARG_KADDR(%esp), %ecx
1932	andl	$NTA_ALIGN_MASK, %ecx
1933	orl	ARG_COUNT(%esp), %ecx
1934	andl	$COUNT_ALIGN_MASK, %ecx
1935	jnz	do_copy_fault
1936	jmp	do_copy_fault_nta
1937
19384:
1939	movl	$EFAULT, %eax
1940	jmp	3f
1941
1942	/*
1943	 * A fault during do_copy_fault or do_copy_fault_nta is
1944	 * indicated through an errno value in %eax and we iret from the
1945	 * trap handler to here.
1946	 */
1947_xcopyout_err:
1948	/ restore the original lofault
1949	popl	%ecx
1950	popl	%edi
1951	movl	%ecx, T_LOFAULT(%edx)	/ original lofault
1952	popl	%esi
1953	popl	%ebp
19543:
1955	cmpl	$0, T_COPYOPS(%edx)
1956	jz	2f
1957	movl	T_COPYOPS(%edx), %eax
1958	jmp	*CP_XCOPYOUT(%eax)
1959
19602:	rep;	ret	/* use 2 byte return instruction when branch target */
1961			/* AMD Software Optimization Guide - Section 6.2 */
1962	SET_SIZE(xcopyout_nta)
1963
1964#undef	ARG_UADDR
1965#undef	ARG_KADDR
1966#undef	ARG_COUNT
1967#undef	ARG_CACHED
1968
1969#endif	/* __i386 */
1970#endif	/* __lint */
1971
1972/*
1973 * Copy a null terminated string from one point to another in
1974 * the kernel address space.
1975 */
1976
1977#if defined(__lint)
1978
1979/* ARGSUSED */
1980int
1981copystr(const char *from, char *to, size_t maxlength, size_t *lencopied)
1982{ return (0); }
1983
1984#else	/* __lint */
1985
1986#if defined(__amd64)
1987
1988	ENTRY(copystr)
1989	pushq	%rbp
1990	movq	%rsp, %rbp
1991#ifdef DEBUG
1992	movq	kernelbase(%rip), %rax
1993	cmpq	%rax, %rdi		/* %rdi = from */
1994	jb	0f
1995	cmpq	%rax, %rsi		/* %rsi = to */
1996	jnb	1f
19970:	leaq	.copystr_panic_msg(%rip), %rdi
1998	xorl	%eax, %eax
1999	call	panic
20001:
2001#endif
2002	movq	%gs:CPU_THREAD, %r9
2003	movq	T_LOFAULT(%r9), %r8	/* pass current lofault value as */
2004					/* 5th argument to do_copystr */
2005do_copystr:
2006	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
2007	movq    T_LOFAULT(%r9), %r11	/* save the current lofault */
2008	movq	%r8, T_LOFAULT(%r9)	/* new lofault */
2009
2010	movq	%rdx, %r8		/* save maxlength */
2011
2012	cmpq	$0, %rdx		/* %rdx = maxlength */
2013	je	copystr_enametoolong	/* maxlength == 0 */
2014
2015copystr_loop:
2016	decq	%r8
2017	movb	(%rdi), %al
2018	incq	%rdi
2019	movb	%al, (%rsi)
2020	incq	%rsi
2021	cmpb	$0, %al
2022	je	copystr_null		/* null char */
2023	cmpq	$0, %r8
2024	jne	copystr_loop
2025
2026copystr_enametoolong:
2027	movl	$ENAMETOOLONG, %eax
2028	jmp	copystr_out
2029
2030copystr_null:
2031	xorl	%eax, %eax		/* no error */
2032
2033copystr_out:
2034	cmpq	$0, %rcx		/* want length? */
2035	je	copystr_done		/* no */
2036	subq	%r8, %rdx		/* compute length and store it */
2037	movq	%rdx, (%rcx)
2038
2039copystr_done:
2040	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
2041	leave
2042	ret
2043	SET_SIZE(copystr)
2044
2045#elif defined(__i386)
2046
2047#define	ARG_FROM	8
2048#define	ARG_TO		12
2049#define	ARG_MAXLEN	16
2050#define	ARG_LENCOPIED	20
2051
2052	ENTRY(copystr)
2053#ifdef DEBUG
2054	pushl	%ebp
2055	movl	%esp, %ebp
2056	movl	kernelbase, %eax
2057	cmpl	%eax, ARG_FROM(%esp)
2058	jb	0f
2059	cmpl	%eax, ARG_TO(%esp)
2060	jnb	1f
20610:	pushl	$.copystr_panic_msg
2062	call	panic
20631:	popl	%ebp
2064#endif
2065	/* get the current lofault address */
2066	movl	%gs:CPU_THREAD, %eax
2067	movl	T_LOFAULT(%eax), %eax
2068do_copystr:
2069	pushl	%ebp			/* setup stack frame */
2070	movl	%esp, %ebp
2071	pushl	%ebx			/* save registers */
2072	pushl	%edi
2073
2074	movl	%gs:CPU_THREAD, %ebx
2075	movl	T_LOFAULT(%ebx), %edi
2076	pushl	%edi			/* save the current lofault */
2077	movl	%eax, T_LOFAULT(%ebx)	/* new lofault */
2078
2079	movl	ARG_MAXLEN(%ebp), %ecx
2080	cmpl	$0, %ecx
2081	je	copystr_enametoolong	/* maxlength == 0 */
2082
2083	movl	ARG_FROM(%ebp), %ebx	/* source address */
2084	movl	ARG_TO(%ebp), %edx	/* destination address */
2085
2086copystr_loop:
2087	decl	%ecx
2088	movb	(%ebx), %al
2089	incl	%ebx
2090	movb	%al, (%edx)
2091	incl	%edx
2092	cmpb	$0, %al
2093	je	copystr_null		/* null char */
2094	cmpl	$0, %ecx
2095	jne	copystr_loop
2096
2097copystr_enametoolong:
2098	movl	$ENAMETOOLONG, %eax
2099	jmp	copystr_out
2100
2101copystr_null:
2102	xorl	%eax, %eax		/* no error */
2103
2104copystr_out:
2105	cmpl	$0, ARG_LENCOPIED(%ebp)	/* want length? */
2106	je	copystr_done		/* no */
2107	movl	ARG_MAXLEN(%ebp), %edx
2108	subl	%ecx, %edx		/* compute length and store it */
2109	movl	ARG_LENCOPIED(%ebp), %ecx
2110	movl	%edx, (%ecx)
2111
2112copystr_done:
2113	popl	%edi
2114	movl	%gs:CPU_THREAD, %ebx
2115	movl	%edi, T_LOFAULT(%ebx)	/* restore the original lofault */
2116
2117	popl	%edi
2118	popl	%ebx
2119	popl	%ebp
2120	ret
2121	SET_SIZE(copystr)
2122
2123#undef	ARG_FROM
2124#undef	ARG_TO
2125#undef	ARG_MAXLEN
2126#undef	ARG_LENCOPIED
2127
2128#endif	/* __i386 */
2129#endif	/* __lint */
2130
2131/*
2132 * Copy a null terminated string from the user address space into
2133 * the kernel address space.
2134 */
2135
2136#if defined(__lint)
2137
2138/* ARGSUSED */
2139int
2140copyinstr(const char *uaddr, char *kaddr, size_t maxlength,
2141    size_t *lencopied)
2142{ return (0); }
2143
2144#else	/* __lint */
2145
2146#if defined(__amd64)
2147
2148	ENTRY(copyinstr)
2149	pushq	%rbp
2150	movq	%rsp, %rbp
2151	subq	$32, %rsp
2152
2153	/*
2154	 * save args in case we trap and need to rerun as a copyop
2155	 */
2156	movq	%rdi, (%rsp)
2157	movq	%rsi, 0x8(%rsp)
2158	movq	%rdx, 0x10(%rsp)
2159	movq	%rcx, 0x18(%rsp)
2160
2161	movq	kernelbase(%rip), %rax
2162#ifdef DEBUG
2163	cmpq	%rax, %rsi		/* %rsi = kaddr */
2164	jnb	1f
2165	leaq	.copyinstr_panic_msg(%rip), %rdi
2166	xorl	%eax, %eax
2167	call	panic
21681:
2169#endif
2170	/*
2171	 * pass lofault value as 5th argument to do_copystr
2172	 */
2173	leaq	_copyinstr_error(%rip), %r8
2174
2175	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
2176	jb	do_copystr
2177	movq	%gs:CPU_THREAD, %r9
2178	jmp	3f
2179
2180_copyinstr_error:
2181	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
21823:
2183	movq	T_COPYOPS(%r9), %rax
2184	cmpq	$0, %rax
2185	jz	2f
2186
2187	/*
2188	 * reload args for the copyop
2189	 */
2190	movq	(%rsp), %rdi
2191	movq	0x8(%rsp), %rsi
2192	movq	0x10(%rsp), %rdx
2193	movq	0x18(%rsp), %rcx
2194	leave
2195	jmp	*CP_COPYINSTR(%rax)
2196
21972:	movl	$EFAULT, %eax		/* return EFAULT */
2198	leave
2199	ret
2200	SET_SIZE(copyinstr)
2201
2202#elif defined(__i386)
2203
2204#define	ARG_UADDR	4
2205#define	ARG_KADDR	8
2206
2207	ENTRY(copyinstr)
2208	movl	kernelbase, %ecx
2209#ifdef DEBUG
2210	cmpl	%ecx, ARG_KADDR(%esp)
2211	jnb	1f
2212	pushl	%ebp
2213	movl	%esp, %ebp
2214	pushl	$.copyinstr_panic_msg
2215	call	panic
22161:
2217#endif
2218	lea	_copyinstr_error, %eax
2219	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
2220	jb	do_copystr
2221	movl	%gs:CPU_THREAD, %edx
2222	jmp	3f
2223
2224_copyinstr_error:
2225	popl	%edi
2226	movl	%gs:CPU_THREAD, %edx
2227	movl	%edi, T_LOFAULT(%edx)	/* original lofault */
2228
2229	popl	%edi
2230	popl	%ebx
2231	popl	%ebp
22323:
2233	movl	T_COPYOPS(%edx), %eax
2234	cmpl	$0, %eax
2235	jz	2f
2236	jmp	*CP_COPYINSTR(%eax)
2237
22382:	movl	$EFAULT, %eax		/* return EFAULT */
2239	ret
2240	SET_SIZE(copyinstr)
2241
2242#undef	ARG_UADDR
2243#undef	ARG_KADDR
2244
2245#endif	/* __i386 */
2246#endif	/* __lint */
2247
2248/*
2249 * Copy a null terminated string from the kernel
2250 * address space to the user address space.
2251 */
2252
2253#if defined(__lint)
2254
2255/* ARGSUSED */
2256int
2257copyoutstr(const char *kaddr, char *uaddr, size_t maxlength,
2258    size_t *lencopied)
2259{ return (0); }
2260
2261#else	/* __lint */
2262
2263#if defined(__amd64)
2264
2265	ENTRY(copyoutstr)
2266	pushq	%rbp
2267	movq	%rsp, %rbp
2268	subq	$32, %rsp
2269
2270	/*
2271	 * save args in case we trap and need to rerun as a copyop
2272	 */
2273	movq	%rdi, (%rsp)
2274	movq	%rsi, 0x8(%rsp)
2275	movq	%rdx, 0x10(%rsp)
2276	movq	%rcx, 0x18(%rsp)
2277
2278	movq	kernelbase(%rip), %rax
2279#ifdef DEBUG
2280	cmpq	%rax, %rdi		/* %rdi = kaddr */
2281	jnb	1f
2282	leaq	.copyoutstr_panic_msg(%rip), %rdi
2283	jmp	call_panic		/* setup stack and call panic */
22841:
2285#endif
2286	/*
2287	 * pass lofault value as 5th argument to do_copystr
2288	 */
2289	leaq	_copyoutstr_error(%rip), %r8
2290
2291	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
2292	jb	do_copystr
2293	movq	%gs:CPU_THREAD, %r9
2294	jmp	3f
2295
2296_copyoutstr_error:
2297	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
22983:
2299	movq	T_COPYOPS(%r9), %rax
2300	cmpq	$0, %rax
2301	jz	2f
2302
2303	/*
2304	 * reload args for the copyop
2305	 */
2306	movq	(%rsp), %rdi
2307	movq	0x8(%rsp), %rsi
2308	movq	0x10(%rsp), %rdx
2309	movq	0x18(%rsp), %rcx
2310	leave
2311	jmp	*CP_COPYOUTSTR(%rax)
2312
23132:	movl	$EFAULT, %eax		/* return EFAULT */
2314	leave
2315	ret
2316	SET_SIZE(copyoutstr)
2317
2318#elif defined(__i386)
2319
2320#define	ARG_KADDR	4
2321#define	ARG_UADDR	8
2322
2323	ENTRY(copyoutstr)
2324	movl	kernelbase, %ecx
2325#ifdef DEBUG
2326	cmpl	%ecx, ARG_KADDR(%esp)
2327	jnb	1f
2328	pushl	%ebp
2329	movl	%esp, %ebp
2330	pushl	$.copyoutstr_panic_msg
2331	call	panic
23321:
2333#endif
2334	lea	_copyoutstr_error, %eax
2335	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
2336	jb	do_copystr
2337	movl	%gs:CPU_THREAD, %edx
2338	jmp	3f
2339
2340_copyoutstr_error:
2341	popl	%edi
2342	movl	%gs:CPU_THREAD, %edx
2343	movl	%edi, T_LOFAULT(%edx)	/* restore the original lofault */
2344
2345	popl	%edi
2346	popl	%ebx
2347	popl	%ebp
23483:
2349	movl	T_COPYOPS(%edx), %eax
2350	cmpl	$0, %eax
2351	jz	2f
2352	jmp	*CP_COPYOUTSTR(%eax)
2353
23542:	movl	$EFAULT, %eax		/* return EFAULT */
2355	ret
2356	SET_SIZE(copyoutstr)
2357
2358#undef	ARG_KADDR
2359#undef	ARG_UADDR
2360
2361#endif	/* __i386 */
2362#endif	/* __lint */
2363
2364/*
2365 * Since all of the fuword() variants are so similar, we have a macro to spit
2366 * them out.  This allows us to create DTrace-unobservable functions easily.
2367 */
2368
2369#if defined(__lint)
2370
2371#if defined(__amd64)
2372
2373/* ARGSUSED */
2374int
2375fuword64(const void *addr, uint64_t *dst)
2376{ return (0); }
2377
2378#endif
2379
2380/* ARGSUSED */
2381int
2382fuword32(const void *addr, uint32_t *dst)
2383{ return (0); }
2384
2385/* ARGSUSED */
2386int
2387fuword16(const void *addr, uint16_t *dst)
2388{ return (0); }
2389
2390/* ARGSUSED */
2391int
2392fuword8(const void *addr, uint8_t *dst)
2393{ return (0); }
2394
2395#else	/* __lint */
2396
2397#if defined(__amd64)
2398
2399/*
2400 * (Note that we don't save and reload the arguments here
2401 * because their values are not altered in the copy path)
2402 */
2403
/*
 * FUWORD(NAME, INSTR, REG, COPYOP) emits the function NAME(addr, dst)
 * with %rdi = user address to read and %rsi = where to store the value.
 *
 * If addr is below kernelbase, _flt_NAME is installed as the thread's
 * lofault handler, one value is loaded from (%rdi) with INSTR into REG,
 * lofault is cleared again, the value is stored to (%rsi) and 0 is
 * returned.  If addr is not below kernelbase, or _flt_NAME is reached,
 * the call is passed to the thread's COPYOP copyop when copyops are
 * installed; otherwise -1 is returned.
 *
 * The handler address is taken %rip-relative, like every other leaq of
 * a code label in this file.
 */
#define	FUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movq	%gs:CPU_THREAD, %r9;		\
	cmpq	kernelbase(%rip), %rdi;		\
	jae	1f;				\
	leaq	_flt_/**/NAME(%rip), %rdx;	\
	movq	%rdx, T_LOFAULT(%r9);		\
	INSTR	(%rdi), REG;			\
	movq	$0, T_LOFAULT(%r9);		\
	INSTR	REG, (%rsi);			\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movq	$0, T_LOFAULT(%r9);		\
1:						\
	movq	T_COPYOPS(%r9), %rax;		\
	testq	%rax, %rax;			\
	jz	2f;				\
	jmp	*COPYOP(%rax);			\
2:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD(fuword64, movq, %rax, CP_FUWORD64)
	FUWORD(fuword32, movl, %eax, CP_FUWORD32)
	FUWORD(fuword16, movw, %ax, CP_FUWORD16)
	FUWORD(fuword8, movb, %al, CP_FUWORD8)
2432
2433#elif defined(__i386)
2434
/*
 * i386 FUWORD(NAME, INSTR, REG, COPYOP) emits the function
 * NAME(addr, dst) with addr at 4(%esp) and dst at 8(%esp).
 *
 * If addr is below kernelbase, _flt_NAME is installed as the thread's
 * lofault handler, one value is loaded from addr with INSTR into REG,
 * lofault is cleared again, the value is stored through dst and 0 is
 * returned.  If addr is not below kernelbase, or _flt_NAME is reached,
 * the call is passed to the thread's COPYOP copyop when copyops are
 * installed; otherwise -1 is returned.  %ecx holds the thread pointer
 * throughout.
 */
#define	FUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movl	kernelbase, %eax;		\
	movl	%gs:CPU_THREAD, %ecx;		\
	cmpl	%eax, 4(%esp);			\
	jae	1f;				\
	lea	_flt_/**/NAME, %eax;		\
	movl	%eax, T_LOFAULT(%ecx);		\
	movl	8(%esp), %edx;			\
	movl	4(%esp), %eax;			\
	INSTR	(%eax), REG;			\
	movl	$0, T_LOFAULT(%ecx);		\
	INSTR	REG, (%edx);			\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movl	$0, T_LOFAULT(%ecx);		\
1:						\
	movl	T_COPYOPS(%ecx), %eax;		\
	testl	%eax, %eax;			\
	jz	2f;				\
	jmp	*COPYOP(%eax);			\
2:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD(fuword32, movl, %eax, CP_FUWORD32)
	FUWORD(fuword16, movw, %ax, CP_FUWORD16)
	FUWORD(fuword8, movb, %al, CP_FUWORD8)
2465
2466#endif	/* __i386 */
2467
2468#undef	FUWORD
2469
2470#endif	/* __lint */
2471
2472/*
2473 * Set user word.
2474 */
2475
2476#if defined(__lint)
2477
2478#if defined(__amd64)
2479
2480/* ARGSUSED */
2481int
2482suword64(void *addr, uint64_t value)
2483{ return (0); }
2484
2485#endif
2486
2487/* ARGSUSED */
2488int
2489suword32(void *addr, uint32_t value)
2490{ return (0); }
2491
2492/* ARGSUSED */
2493int
2494suword16(void *addr, uint16_t value)
2495{ return (0); }
2496
2497/* ARGSUSED */
2498int
2499suword8(void *addr, uint8_t value)
2500{ return (0); }
2501
2502#else	/* lint */
2503
2504#if defined(__amd64)
2505
2506/*
2507 * (Note that we don't save and reload the arguments here
2508 * because their values are not altered in the copy path)
2509 */
2510
/*
 * SUWORD(NAME, INSTR, REG, COPYOP) emits the function NAME(addr, value)
 * with %rdi = user address to write; REG is the part of %rsi that holds
 * the value.
 *
 * If addr is below kernelbase, _flt_NAME is installed as the thread's
 * lofault handler, REG is stored to (%rdi) with INSTR, lofault is
 * cleared again and 0 is returned.  If addr is not below kernelbase,
 * or _flt_NAME is reached, the call is passed to the thread's COPYOP
 * copyop when copyops are installed; otherwise -1 is returned.
 *
 * The handler address is taken %rip-relative, like every other leaq of
 * a code label in this file.
 */
#define	SUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movq	%gs:CPU_THREAD, %r9;		\
	cmpq	kernelbase(%rip), %rdi;		\
	jae	1f;				\
	leaq	_flt_/**/NAME(%rip), %rdx;	\
	movq	%rdx, T_LOFAULT(%r9);		\
	INSTR	REG, (%rdi);			\
	movq	$0, T_LOFAULT(%r9);		\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movq	$0, T_LOFAULT(%r9);		\
1:						\
	movq	T_COPYOPS(%r9), %rax;		\
	testq	%rax, %rax;			\
	jz	3f;				\
	jmp	*COPYOP(%rax);			\
3:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	SUWORD(suword64, movq, %rsi, CP_SUWORD64)
	SUWORD(suword32, movl, %esi, CP_SUWORD32)
	SUWORD(suword16, movw, %si, CP_SUWORD16)
	SUWORD(suword8, movb, %sil, CP_SUWORD8)
2538
2539#elif defined(__i386)
2540
/*
 * i386 SUWORD(NAME, INSTR, REG, COPYOP) emits the function
 * NAME(addr, value) with addr at 4(%esp) and value at 8(%esp); REG is
 * the part of %edx that holds the value.
 *
 * If addr is below kernelbase, _flt_NAME is installed as the thread's
 * lofault handler, REG is stored to addr with INSTR, lofault is
 * cleared again and 0 is returned.  If addr is not below kernelbase,
 * or _flt_NAME is reached, the call is passed to the thread's COPYOP
 * copyop when copyops are installed; otherwise -1 is returned.
 */
#define	SUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movl	kernelbase, %eax;		\
	movl	%gs:CPU_THREAD, %ecx;		\
	cmpl	%eax, 4(%esp);			\
	jae	1f;				\
	lea	_flt_/**/NAME, %eax;		\
	movl	%eax, T_LOFAULT(%ecx);		\
	movl	8(%esp), %edx;			\
	movl	4(%esp), %eax;			\
	INSTR	REG, (%eax);			\
	movl	$0, T_LOFAULT(%ecx);		\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movl	$0, T_LOFAULT(%ecx);		\
1:						\
	movl	T_COPYOPS(%ecx), %eax;		\
	testl	%eax, %eax;			\
	jz	3f;				\
	jmp	*COPYOP(%eax);			\
3:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	SUWORD(suword32, movl, %edx, CP_SUWORD32)
	SUWORD(suword16, movw, %dx, CP_SUWORD16)
	SUWORD(suword8, movb, %dl, CP_SUWORD8)
2571
2572#endif	/* __i386 */
2573
2574#undef	SUWORD
2575
2576#endif	/* __lint */
2577
2578#if defined(__lint)
2579
2580#if defined(__amd64)
2581
2582/*ARGSUSED*/
2583void
2584fuword64_noerr(const void *addr, uint64_t *dst)
2585{}
2586
2587#endif
2588
2589/*ARGSUSED*/
2590void
2591fuword32_noerr(const void *addr, uint32_t *dst)
2592{}
2593
2594/*ARGSUSED*/
2595void
2596fuword8_noerr(const void *addr, uint8_t *dst)
2597{}
2598
2599/*ARGSUSED*/
2600void
2601fuword16_noerr(const void *addr, uint16_t *dst)
2602{}
2603
2604#else   /* __lint */
2605
2606#if defined(__amd64)
2607
/*
 * FUWORD_NOERR(NAME, INSTR, REG) emits the function NAME(addr, dst)
 * with %rdi = address to read and %rsi = where to store the value.
 * No lofault handler is installed and nothing is returned.  An addr
 * that is not below kernelbase is replaced by kernelbase itself
 * before the load.  %rdx is used as scratch.
 */
#define	FUWORD_NOERR(NAME, INSTR, REG)		\
	ENTRY(NAME)				\
	movq	kernelbase(%rip), %rdx;		\
	cmpq	%rdx, %rdi;			\
	cmovnbq	%rdx, %rdi;			\
	INSTR	(%rdi), REG;			\
	INSTR	REG, (%rsi);			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD_NOERR(fuword64_noerr, movq, %rax)
	FUWORD_NOERR(fuword32_noerr, movl, %eax)
	FUWORD_NOERR(fuword16_noerr, movw, %ax)
	FUWORD_NOERR(fuword8_noerr, movb, %al)
2621
2622#elif defined(__i386)
2623
/*
 * i386 FUWORD_NOERR(NAME, INSTR, REG) emits the function
 * NAME(addr, dst) with addr at 4(%esp) and dst at 8(%esp).
 * No lofault handler is installed and nothing is returned.  An addr
 * that is not below kernelbase is replaced by kernelbase itself
 * before the load.
 */
#define	FUWORD_NOERR(NAME, INSTR, REG)		\
	ENTRY(NAME)				\
	movl	8(%esp), %edx;			\
	movl	4(%esp), %eax;			\
	cmpl	kernelbase, %eax;		\
	jb	1f;				\
	movl	kernelbase, %eax;		\
1:	INSTR	(%eax), REG;			\
	INSTR	REG, (%edx);			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD_NOERR(fuword32_noerr, movl, %ecx)
	FUWORD_NOERR(fuword16_noerr, movw, %cx)
	FUWORD_NOERR(fuword8_noerr, movb, %cl)
2639
2640#endif	/* __i386 */
2641
2642#undef	FUWORD_NOERR
2643
2644#endif	/* __lint */
2645
2646#if defined(__lint)
2647
/*
 * lint-only stand-ins for the suwordN_noerr() family.  Each one takes the
 * address to store to and the value to store.  The bodies are empty on
 * purpose: the working versions are generated from the SUWORD_NOERR
 * assembly template in the non-lint half of this conditional.
 */

#if defined(__amd64)

/*ARGSUSED*/
void
suword64_noerr(void *addr, uint64_t value)
{}

#endif	/* __amd64 */

/*ARGSUSED*/
void
suword32_noerr(void *addr, uint32_t value)
{}

/*ARGSUSED*/
void
suword16_noerr(void *addr, uint16_t value)
{}

/*ARGSUSED*/
void
suword8_noerr(void *addr, uint8_t value)
{}
2671
2672#else	/* lint */
2673
2674#if defined(__amd64)
2675
/*
 * SUWORD_NOERR(NAME, INSTR, REG) -- amd64 template for suwordN_noerr().
 *
 *	NAME	function to emit
 *	INSTR	mov variant of the transfer width
 *	REG	the view of %rsi that matches that width
 *
 * In:	%rdi = address to store to, %rsi = value
 *
 * An address at or above kernelbase is replaced by kernelbase itself
 * before the store.  The routine does not install a fault handler of
 * its own.
 */
#define	SUWORD_NOERR(NAME, INSTR, REG)		\
	ENTRY(NAME)				\
	cmpq	kernelbase(%rip), %rdi;		\
	cmovaeq	kernelbase(%rip), %rdi;		\
	INSTR	REG, (%rdi);			\
	ret;					\
	SET_SIZE(NAME)

	/* one instance per access width */
	SUWORD_NOERR(suword64_noerr, movq, %rsi)
	SUWORD_NOERR(suword32_noerr, movl, %esi)
	SUWORD_NOERR(suword16_noerr, movw, %si)
	SUWORD_NOERR(suword8_noerr, movb, %sil)
2688
2689#elif defined(__i386)
2690
/*
 * SUWORD_NOERR(NAME, INSTR, REG) -- i386 template for suwordN_noerr().
 *
 *	NAME	function to emit
 *	INSTR	mov variant of the transfer width
 *	REG	the view of %edx that matches that width (the value is
 *		always loaded into %edx as a full 32-bit word)
 *
 * Stack on entry: 4(%esp) = address to store to, 8(%esp) = value.
 *
 * A target address at or above kernelbase is replaced by kernelbase
 * before the store.  The routine does not install a fault handler of
 * its own.
 */
#define	SUWORD_NOERR(NAME, INSTR, REG)		\
	ENTRY(NAME)				\
	movl	8(%esp), %edx;			\
	movl	4(%esp), %eax;			\
	cmpl	kernelbase, %eax;		\
	jb	2f;				\
	movl	kernelbase, %eax;		\
2:						\
	INSTR	REG, (%eax);			\
	ret;					\
	SET_SIZE(NAME)

	/* one instance per access width */
	SUWORD_NOERR(suword32_noerr, movl, %edx)
	SUWORD_NOERR(suword16_noerr, movw, %dx)
	SUWORD_NOERR(suword8_noerr, movb, %dl)
2706
2707#endif	/* __i386 */
2708
2709#undef	SUWORD_NOERR
2710
2711#endif	/* lint */
2712
2713
2714#if defined(__lint)
2715
2716/*ARGSUSED*/
2717int
2718subyte(void *addr, uchar_t value)
2719{ return (0); }
2720
2721/*ARGSUSED*/
2722void
2723subyte_noerr(void *addr, uchar_t value)
2724{}
2725
2726/*ARGSUSED*/
2727int
2728fulword(const void *addr, ulong_t *valuep)
2729{ return (0); }
2730
2731/*ARGSUSED*/
2732void
2733fulword_noerr(const void *addr, ulong_t *valuep)
2734{}
2735
2736/*ARGSUSED*/
2737int
2738sulword(void *addr, ulong_t valuep)
2739{ return (0); }
2740
2741/*ARGSUSED*/
2742void
2743sulword_noerr(void *addr, ulong_t valuep)
2744{}
2745
2746#else
2747
/*
 * Width-generic entry points.  Each name is emitted as a weak symbol and
 * equated to the fixed-width routine of matching size, so no additional
 * code is generated for it.
 */
#define	COPY_WEAK_ALIAS(sym, target)	\
	.weak	sym;			\
	sym = target

	COPY_WEAK_ALIAS(subyte, suword8)
	COPY_WEAK_ALIAS(subyte_noerr, suword8_noerr)

#if defined(__amd64)

	/* the "long" routines resolve to the 64-bit versions */
	COPY_WEAK_ALIAS(fulword, fuword64)
	COPY_WEAK_ALIAS(fulword_noerr, fuword64_noerr)
	COPY_WEAK_ALIAS(sulword, suword64)
	COPY_WEAK_ALIAS(sulword_noerr, suword64_noerr)

#elif defined(__i386)

	/* the "long" routines resolve to the 32-bit versions */
	COPY_WEAK_ALIAS(fulword, fuword32)
	COPY_WEAK_ALIAS(fulword_noerr, fuword32_noerr)
	COPY_WEAK_ALIAS(sulword, suword32)
	COPY_WEAK_ALIAS(sulword_noerr, suword32_noerr)

#endif /* __i386 */

#undef	COPY_WEAK_ALIAS
2775#endif /* __i386 */
2776
2777#endif /* __lint */
2778
2779#if defined(__lint)
2780
/*
 * lint-only stand-ins for the *_noerr block routines and the user-to-user
 * helpers.  The working versions are the assembly entry points in the
 * non-lint half of this conditional.
 */

/*
 * Copy a block of storage between kernel and user space.  The two regions
 * must not overlap (from + len <= to).  These routines install no fault
 * handler; they are meant to be called under on_fault().
 */

/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}

/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}

/*
 * Zero "count" bytes of user space starting at "addr".
 */

/* ARGSUSED */
void
uzero(void *addr, size_t count)
{}

/*
 * Copy "ulength" bytes from one user-space address to another.
 */

/* ARGSUSED */
void
ucopy(const void *ufrom, void *uto, size_t ulength)
{}

/*
 * Copy a string from one user-space address to another.
 */

/* ARGSUSED */
void
ucopystr(const char *ufrom, char *uto, size_t umaxlength, size_t *lencopied)
{}
2822
2823#else /* __lint */
2824
2825#if defined(__amd64)
2826
/*
 * void copyin_noerr(const void *ufrom, void *kto, size_t count)	[amd64]
 *
 * In:	%rdi = ufrom, %rsi = kto; the remaining argument is passed through
 *	to do_copy untouched.
 * Uses: %rax (kernelbase)
 *
 * A ufrom at or above kernelbase is clamped to kernelbase so that the
 * copy faults there, then control transfers to do_copy.  No fault
 * handler is installed here.  DEBUG kernels panic when kto lies below
 * kernelbase.
 */
	ENTRY(copyin_noerr)
	movq	kernelbase(%rip), %rax
#ifdef DEBUG
	cmpq	%rax, %rsi		/* kto must be a kernel address */
	jae	1f
	leaq	.cpyin_ne_pmsg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
1:
#endif
	cmpq	%rax, %rdi		/* ufrom < kernelbase? */
	cmovaeq	%rax, %rdi		/* no: force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(copyin_noerr)
2841
/*
 * void copyout_noerr(const void *kfrom, void *uto, size_t count)	[amd64]
 *
 * In:	%rdi = kfrom, %rsi = uto; the remaining argument is passed through
 *	to do_copy untouched.
 * Uses: %rax (kernelbase)
 *
 * A uto at or above kernelbase is clamped to kernelbase so that the copy
 * faults there, then control transfers to do_copy.  No fault handler is
 * installed here.  DEBUG kernels panic when kfrom lies below kernelbase.
 */
	ENTRY(copyout_noerr)
	movq	kernelbase(%rip), %rax
#ifdef DEBUG
	cmpq	%rax, %rdi		/* kfrom must be a kernel address */
	jae	1f
	leaq	.cpyout_ne_pmsg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
1:
#endif
	cmpq	%rax, %rsi		/* uto < kernelbase? */
	cmovaeq	%rax, %rsi		/* no: force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(copyout_noerr)
2856
/*
 * void uzero(void *addr, size_t count)				[amd64]
 *
 * In:	%rdi = addr; the remaining argument is passed through to do_zero
 *	untouched.
 * Uses: %rax (kernelbase)
 *
 * An addr at or above kernelbase is clamped to kernelbase so that the
 * zeroing faults there, then control transfers to do_zero.
 */
	ENTRY(uzero)
	movq	kernelbase(%rip), %rax
	cmpq	%rax, %rdi		/* addr < kernelbase? */
	cmovaeq	%rax, %rdi		/* no: force fault at kernelbase */
	jmp	do_zero
	SET_SIZE(uzero)
2864
/*
 * void ucopy(const void *ufrom, void *uto, size_t ulength)	[amd64]
 *
 * In:	%rdi = ufrom, %rsi = uto; the remaining argument is passed through
 *	to do_copy untouched.
 * Uses: %rax (kernelbase)
 *
 * Both addresses are expected to be user addresses.  Either one that is
 * at or above kernelbase is clamped to kernelbase so that the copy
 * faults there, then control transfers to do_copy.
 */
	ENTRY(ucopy)
	movq	kernelbase(%rip), %rax

	cmpq	%rax, %rdi		/* source */
	cmovaeq	%rax, %rdi		/* force fault at kernelbase */

	cmpq	%rax, %rsi		/* destination */
	cmovaeq	%rax, %rsi		/* force fault at kernelbase */

	jmp	do_copy
	SET_SIZE(ucopy)
2873
/*
 * void ucopystr(const char *ufrom, char *uto, size_t umaxlength,
 *     size_t *lencopied)					[amd64]
 *
 * In:	%rdi = ufrom, %rsi = uto; the remaining arguments are passed
 *	through to do_copystr untouched.
 * Uses: %rax (kernelbase), %r8 (current thread's t_lofault value)
 *
 * Either address that is at or above kernelbase is clamped to kernelbase
 * so that the copy faults there, then control transfers to do_copystr.
 */
	ENTRY(ucopystr)
	/* do_copystr expects lofault address in %r8 */
	movq	%gs:CPU_THREAD, %r8
	movq	T_LOFAULT(%r8), %r8

	movq	kernelbase(%rip), %rax
	cmpq	%rax, %rdi		/* source */
	cmovaeq	%rax, %rdi		/* force fault at kernelbase */
	cmpq	%rax, %rsi		/* destination */
	cmovaeq	%rax, %rsi		/* force fault at kernelbase */
	jmp	do_copystr
	SET_SIZE(ucopystr)
2885
2886#elif defined(__i386)
2887
/*
 * void copyin_noerr(const void *ufrom, void *kto, size_t count)	[i386]
 *
 * Stack on entry: 4(%esp) = ufrom, 8(%esp) = kto.
 * Uses: %eax (kernelbase)
 *
 * A ufrom at or above kernelbase is overwritten in its stack slot with
 * kernelbase so that the copy faults there; control then transfers to
 * do_copy, which sees the caller's argument frame.  No fault handler is
 * installed here.  DEBUG kernels call panic() when kto lies below
 * kernelbase.
 */
	ENTRY(copyin_noerr)
	movl	kernelbase, %eax
#ifdef DEBUG
	cmpl	%eax, 8(%esp)		/* kto must be a kernel address */
	jae	1f
	pushl	$.cpyin_ne_pmsg
	call	panic
1:
#endif
	cmpl	%eax, 4(%esp)		/* ufrom < kernelbase? */
	jb	do_copy
	movl	%eax, 4(%esp)		/* no: force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(copyin_noerr)
2902
/*
 * void copyout_noerr(const void *kfrom, void *uto, size_t count)	[i386]
 *
 * Stack on entry: 4(%esp) = kfrom, 8(%esp) = uto.
 * Uses: %eax (kernelbase)
 *
 * A uto at or above kernelbase is overwritten in its stack slot with
 * kernelbase so that the copy faults there; control then transfers to
 * do_copy, which sees the caller's argument frame.  No fault handler is
 * installed here.  DEBUG kernels call panic() when kfrom lies below
 * kernelbase.
 */
	ENTRY(copyout_noerr)
	movl	kernelbase, %eax
#ifdef DEBUG
	cmpl	%eax, 4(%esp)		/* kfrom must be a kernel address */
	jae	1f
	pushl	$.cpyout_ne_pmsg
	call	panic
1:
#endif
	cmpl	%eax, 8(%esp)		/* uto < kernelbase? */
	jb	do_copy
	movl	%eax, 8(%esp)		/* no: force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(copyout_noerr)
2917
/*
 * void uzero(void *addr, size_t count)				[i386]
 *
 * Stack on entry: 4(%esp) = addr.
 * Uses: %eax (kernelbase)
 *
 * An addr at or above kernelbase is overwritten in its stack slot with
 * kernelbase so that the zeroing faults there; control then transfers
 * to do_zero, which sees the caller's argument frame.
 */
	ENTRY(uzero)
	movl	kernelbase, %eax
	cmpl	%eax, 4(%esp)		/* addr < kernelbase? */
	jb	do_zero
	movl	%eax, 4(%esp)		/* no: force fault at kernelbase */
	jmp	do_zero
	SET_SIZE(uzero)
2925
/*
 * void ucopy(const void *ufrom, void *uto, size_t ulength)	[i386]
 *
 * Stack on entry: 4(%esp) = ufrom, 8(%esp) = uto.
 * Uses: %eax (kernelbase)
 *
 * Both addresses are expected to be user addresses.  Either one that is
 * at or above kernelbase is overwritten in its stack slot with
 * kernelbase so that the copy faults there; control then transfers to
 * do_copy, which sees the caller's argument frame.
 */
	ENTRY(ucopy)
	movl	kernelbase, %eax

	cmpl	%eax, 4(%esp)		/* source */
	jb	1f
	movl	%eax, 4(%esp)		/* force fault at kernelbase */
1:
	cmpl	%eax, 8(%esp)		/* destination */
	jb	do_copy
	movl	%eax, 8(%esp)		/* force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(ucopy)
2937
/*
 * void ucopystr(const char *ufrom, char *uto, size_t umaxlength,
 *     size_t *lencopied)					[i386]
 *
 * Stack on entry: 4(%esp) = ufrom, 8(%esp) = uto.
 * Uses: %eax (kernelbase first, then the current thread's t_lofault value)
 *
 * Either address that is at or above kernelbase is overwritten in its
 * stack slot with kernelbase so that the copy faults there; control then
 * transfers to do_copystr, which sees the caller's argument frame.
 */
	ENTRY(ucopystr)
	movl	kernelbase, %eax

	cmpl	%eax, 4(%esp)		/* source */
	jb	1f
	movl	%eax, 4(%esp)		/* force fault at kernelbase */
1:
	cmpl	%eax, 8(%esp)		/* destination */
	jb	2f
	movl	%eax, 8(%esp)		/* force fault at kernelbase */
2:
	/* do_copystr expects the lofault address in %eax */
	movl	%gs:CPU_THREAD, %eax
	movl	T_LOFAULT(%eax), %eax
	jmp	do_copystr
	SET_SIZE(ucopystr)
2953
2954#endif	/* __i386 */
2955
#ifdef DEBUG
/*
 * Panic strings for the DEBUG-only argument checks in this file, one per
 * routine, placed in the data section.
 *
 * NOTE(review): nothing here switches back to the text section after
 * .data; confirm that nothing following in the file relies on being
 * in .text.
 */
	.data

	/* kernel-to-kernel copy and zero routines */
.kcopy_panic_msg:
	.string	"kcopy: arguments below kernelbase"
.bcopy_panic_msg:
	.string	"bcopy: arguments below kernelbase"
.kzero_panic_msg:
	.string	"kzero: arguments below kernelbase"
.bzero_panic_msg:
	.string	"bzero: arguments below kernelbase"

	/* user/kernel block copies */
.copyin_panic_msg:
	.string	"copyin: kaddr argument below kernelbase"
.xcopyin_panic_msg:
	.string	"xcopyin: kaddr argument below kernelbase"
.copyout_panic_msg:
	.string	"copyout: kaddr argument below kernelbase"
.xcopyout_panic_msg:
	.string	"xcopyout: kaddr argument below kernelbase"

	/* string copies */
.copystr_panic_msg:
	.string	"copystr: arguments in user space"
.copyinstr_panic_msg:
	.string	"copyinstr: kaddr argument not in kernel address space"
.copyoutstr_panic_msg:
	.string	"copyoutstr: kaddr argument not in kernel address space"

	/* copyin_noerr / copyout_noerr */
.cpyin_ne_pmsg:
	.string	"copyin_noerr: argument not in kernel address space"
.cpyout_ne_pmsg:
	.string	"copyout_noerr: argument not in kernel address space"
#endif	/* DEBUG */
2985
2986#endif	/* __lint */
2987