xref: /titanic_50/usr/src/uts/intel/ia32/ml/copy.s (revision 87aafc05e247a75cc8434e694c3b98c74a1287f0)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright (c) 2009, Intel Corporation
28 * All rights reserved.
29 */
30
31/*       Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
32/*       Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
33/*         All Rights Reserved						*/
34
35/*       Copyright (c) 1987, 1988 Microsoft Corporation			*/
36/*         All Rights Reserved						*/
37
38/*
39 * Copyright 2016 Joyent, Inc.
40 */
41
42#include <sys/errno.h>
43#include <sys/asm_linkage.h>
44
45#if defined(__lint)
46#include <sys/types.h>
47#include <sys/systm.h>
48#else	/* __lint */
49#include "assym.h"
50#endif	/* __lint */
51
/*
 * Size threshold for the non-temporal copy path: the amd64 kcopy_nta falls
 * back to the ordinary cached copy when count is below KCOPY_MIN_SIZE.
 * NOTE(review): XCOPY_MIN_SIZE is not referenced in the part of the file
 * reviewed here; presumably it plays the same role for the xcopy routines.
 */
52#define	KCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
53#define	XCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
54/*
55 * Non-temporal access (NTA) alignment requirement.  kcopy_nta tests
 * (from | to) against NTA_ALIGN_MASK and count against COUNT_ALIGN_MASK
 * before it uses the non-temporal loop.
56 */
57#define	NTA_ALIGN_SIZE	4	/* Must be at least 4-byte aligned */
58#define	NTA_ALIGN_MASK	_CONST(NTA_ALIGN_SIZE-1)
59#define	COUNT_ALIGN_SIZE	16	/* Must be at least 16-byte aligned */
60#define	COUNT_ALIGN_MASK	_CONST(COUNT_ALIGN_SIZE-1)
61
62/*
63 * The optimal 64-bit bcopy and kcopy for modern x86 processors uses
64 * "rep smovq" for large sizes. Performance data shows that many calls to
65 * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance for
66 * these small sizes unrolled code is used. For medium sizes loops writing
67 * 64-bytes per loop are used. Transition points were determined experimentally.
 *
 * BCOPY_DFLT_REP is the count at which the amd64 bcopy_ck_size switches to
 * rep smovq.  BCOPY_NHM_REP is the immediate used by the compare that sits
 * between bcopy_patch_start and bcopy_patch_end.
68 */
69#define BZERO_USE_REP	(1024)
70#define BCOPY_DFLT_REP	(128)
71#define	BCOPY_NHM_REP	(768)
72
73/*
74 * Copy a block of storage, returning an error code if `from' or
75 * `to' takes a kernel pagefault which cannot be resolved.
76 * Returns errno value on pagefault error, 0 if all ok
77 */
78
79#if defined(__lint)
80
/*
 * Lint-only stand-in for kcopy.  It is compiled only when __lint is defined
 * and always reports success; the real routine is the assembly that follows.
 */
81/* ARGSUSED */
82int
83kcopy(const void *from, void *to, size_t count)
84{ return (0); }
85
86#else	/* __lint */
87
88	.globl	kernelbase
89	.globl	postbootkernelbase
90
91#if defined(__amd64)
92
/*
 * kcopy (amd64): copy count bytes from `from' to `to' with fault recovery.
 *
 * In:	%rdi = from, %rsi = to, %rdx = count
 * Out:	%eax = 0 on success.  After a fault, execution resumes at
 *	_kcopy_copyerr with the errno value already in %rax.
 *
 * The bytes are moved by bcopy_altentry while the thread's T_LOFAULT slot
 * points at the recovery label.  The previous lofault value is held in %r11
 * and the thread pointer in %r9 for the whole call, so bcopy_altentry must
 * not modify either register.
 *
 * do_copy_fault is also a jump target for kcopy_nta, which arrives with its
 * frame already pushed, %r9 = thread and %rcx = the lofault value to install.
 */
93	ENTRY(kcopy)
94	pushq	%rbp
95	movq	%rsp, %rbp
96#ifdef DEBUG
	/* Panic unless both from and to are at or above postbootkernelbase. */
97	cmpq	postbootkernelbase(%rip), %rdi 		/* %rdi = from */
98	jb	0f
99	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
100	jnb	1f
1010:	leaq	.kcopy_panic_msg(%rip), %rdi
102	xorl	%eax, %eax
103	call	panic
1041:
105#endif
106	/*
107	 * pass lofault value as 4th argument to do_copy_fault
108	 */
109	leaq	_kcopy_copyerr(%rip), %rcx
110	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
111
112do_copy_fault:
113	movq	T_LOFAULT(%r9), %r11	/* save the current lofault */
114	movq	%rcx, T_LOFAULT(%r9)	/* new lofault */
115	call	bcopy_altentry
116	xorl	%eax, %eax		/* return 0 (success) */
117
118	/*
119	 * A fault during do_copy_fault is indicated through an errno value
120	 * in %rax and we iretq from the trap handler to here.
	 * The success path falls through to the same label with %eax = 0.
	 * `leave' resets %rsp from %rbp, which also discards the return
	 * address left by the interrupted call to bcopy_altentry.
121	 */
122_kcopy_copyerr:
123	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
124	leave
125	ret
126	SET_SIZE(kcopy)
127
128#elif defined(__i386)
129
130#define	ARG_FROM	8
131#define	ARG_TO		12
132#define	ARG_COUNT	16
133
/*
 * kcopy (i386): copy ARG_COUNT bytes from ARG_FROM to ARG_TO with fault
 * recovery.  Arguments are on the stack; the ARG_* offsets are relative to
 * %ebp once a frame has been pushed.
 *
 * Out:	%eax = 0 on success.  After a fault, execution resumes at
 *	_kcopy_copyerr with the errno value already in %eax.
 *
 * do_copy_fault expects %eax = lofault value to install and %edx = current
 * thread.  The stack below the frame holds, in push order, %esi, %edi and
 * the previous lofault value; _kcopy_copyerr unwinds them in reverse.
 */
134	ENTRY(kcopy)
135#ifdef DEBUG
	/*
	 * Push a temporary frame so the ARG_* offsets are valid, panic unless
	 * both addresses are at or above postbootkernelbase, then pop the
	 * frame again at 1: because do_copy_fault pushes its own.
	 */
136	pushl	%ebp
137	movl	%esp, %ebp
138	movl	postbootkernelbase, %eax
139	cmpl	%eax, ARG_FROM(%ebp)
140	jb	0f
141	cmpl	%eax, ARG_TO(%ebp)
142	jnb	1f
1430:	pushl	$.kcopy_panic_msg
144	call	panic
1451:	popl	%ebp
146#endif
147	lea	_kcopy_copyerr, %eax	/* lofault value */
148	movl	%gs:CPU_THREAD, %edx
149
150do_copy_fault:
151	pushl	%ebp
152	movl	%esp, %ebp		/* setup stack frame */
153	pushl	%esi
154	pushl	%edi			/* save registers */
155
156	movl	T_LOFAULT(%edx), %edi
157	pushl	%edi			/* save the current lofault */
158	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
159
160	movl	ARG_COUNT(%ebp), %ecx
161	movl	ARG_FROM(%ebp), %esi
162	movl	ARG_TO(%ebp), %edi
163	shrl	$2, %ecx		/* word count */
164	rep
165	  smovl
166	movl	ARG_COUNT(%ebp), %ecx
167	andl	$3, %ecx		/* bytes left over */
168	rep
169	  smovb
170	xorl	%eax, %eax
171
172	/*
173	 * A fault during do_copy_fault is indicated through an errno value
174	 * in %eax and we iret from the trap handler to here.
	 * The success path falls through to the same label with %eax = 0.
175	 */
176_kcopy_copyerr:
177	popl	%ecx
178	popl	%edi
179	movl	%ecx, T_LOFAULT(%edx)	/* restore the original lofault */
180	popl	%esi
181	popl	%ebp
182	ret
183	SET_SIZE(kcopy)
184
185#undef	ARG_FROM
186#undef	ARG_TO
187#undef	ARG_COUNT
188
189#endif	/* __i386 */
190#endif	/* __lint */
191
192#if defined(__lint)
193
194/*
195 * Copy a block of storage.  Similar to kcopy but uses non-temporal
196 * instructions.
197 */
198
/*
 * Lint-only stand-in for kcopy_nta.  It is compiled only when __lint is
 * defined and always reports success; the real routine is the assembly that
 * follows.
 */
199/* ARGSUSED */
200int
201kcopy_nta(const void *from, void *to, size_t count, int copy_cached)
202{ return (0); }
203
204#else	/* __lint */
205
206#if defined(__amd64)
207
/*
 * COPY_LOOP_INIT advances src and dst to the end of their buffers and turns
 * cnt into a negated count of 8-byte words, so that the loop body can index
 * upward toward zero.
 */
208#define	COPY_LOOP_INIT(src, dst, cnt)	\
209	addq	cnt, src;			\
210	addq	cnt, dst;			\
211	shrq	$3, cnt;			\
212	neg	cnt
213
	/*
	 * Loads two quadwords, stores them with movnti and prefetches 0x100
	 * bytes ahead of the current source position.  The final addq leaves
	 * ZF set once cnt reaches zero; the jnz after each expansion of this
	 * macro relies on that.
	 */
214	/* Copy 16 bytes per loop.  Uses %rax and %r8 */
215#define	COPY_LOOP_BODY(src, dst, cnt)	\
216	prefetchnta	0x100(src, cnt, 8);	\
217	movq	(src, cnt, 8), %rax;		\
218	movq	0x8(src, cnt, 8), %r8;		\
219	movnti	%rax, (dst, cnt, 8);		\
220	movnti	%r8, 0x8(dst, cnt, 8);		\
221	addq	$2, cnt
222
	/*
	 * kcopy_nta (amd64): kcopy variant that can use non-temporal stores.
	 *
	 * In:	%rdi = from, %rsi = to, %rdx = count, %rcx = copy_cached
	 * Out:	%eax = 0 on success.  After a fault, execution resumes at
	 *	_kcopy_nta_copyerr (errno presumably in %rax, as for kcopy).
	 *
	 * The request is handed to kcopy's do_copy_fault (ordinary copy) when
	 * copy_cached is non-zero, when count < KCOPY_MIN_SIZE, or when the
	 * alignment test on from/to/count fails.  Otherwise the movnti loop at
	 * do_copy_fault_nta is used.
	 */
223	ENTRY(kcopy_nta)
224	pushq	%rbp
225	movq	%rsp, %rbp
226#ifdef DEBUG
	/* Panic unless both from and to are at or above postbootkernelbase. */
227	cmpq	postbootkernelbase(%rip), %rdi 		/* %rdi = from */
228	jb	0f
229	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
230	jnb	1f
2310:	leaq	.kcopy_panic_msg(%rip), %rdi
232	xorl	%eax, %eax
233	call	panic
2341:
235#endif
236
237	movq	%gs:CPU_THREAD, %r9
238	cmpq	$0, %rcx		/* No non-temporal access? */
239	/*
240	 * pass lofault value as 4th argument to do_copy_fault
	 * (the flags from the cmpq above are consumed by the jnz below,
	 * after %rcx has been overwritten)
241	 */
242	leaq	_kcopy_nta_copyerr(%rip), %rcx	/* doesn't set rflags */
243	jnz	do_copy_fault		/* use regular access */
244	/*
245	 * Make sure cnt is >= KCOPY_MIN_SIZE
246	 */
247	cmpq	$KCOPY_MIN_SIZE, %rdx
248	jb	do_copy_fault
249
250	/*
251	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
252	 * count is COUNT_ALIGN_SIZE aligned.
253	 */
254	movq	%rdi, %r10
255	orq	%rsi, %r10
256	andq	$NTA_ALIGN_MASK, %r10
257	orq	%rdx, %r10
258	andq	$COUNT_ALIGN_MASK, %r10
259	jnz	do_copy_fault
260
	/*
	 * Alternate entry point.  NOTE(review): callers that enter here are
	 * not visible in this part of the file.  The code below expects
	 * %rcx = lofault value, a frame that `leave' can unwind, and a count
	 * in %rdx that is a non-zero multiple of 16 (the loop only stops when
	 * the index lands exactly on zero).
	 */
261	ALTENTRY(do_copy_fault_nta)
262	movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
263	movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
264	movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
265
266	/*
267	 * COPY_LOOP_BODY uses %rax and %r8
268	 */
269	COPY_LOOP_INIT(%rdi, %rsi, %rdx)
2702:	COPY_LOOP_BODY(%rdi, %rsi, %rdx)
271	jnz	2b
272
273	mfence
274	xorl	%eax, %eax		/* return 0 (success) */
275
276_kcopy_nta_copyerr:
277	movq	%r11, T_LOFAULT(%r9)    /* restore original lofault */
278	leave
279	ret
280	SET_SIZE(do_copy_fault_nta)
281	SET_SIZE(kcopy_nta)
282
283#elif defined(__i386)
284
285#define	ARG_FROM	8
286#define	ARG_TO		12
287#define	ARG_COUNT	16
288
/*
 * COPY_LOOP_INIT advances src and dst to the end of their buffers and turns
 * cnt into a negated count of 8-byte units, so that the loop body can index
 * upward toward zero.
 */
289#define	COPY_LOOP_INIT(src, dst, cnt)	\
290	addl	cnt, src;			\
291	addl	cnt, dst;			\
292	shrl	$3, cnt;			\
293	neg	cnt
294
/*
 * COPY_LOOP_BODY copies 16 bytes per expansion as four 4-byte movnti stores,
 * using %esi as the scratch register, and prefetches 0x100 bytes ahead of
 * the current source position.  The final addl leaves ZF set once cnt
 * reaches zero; the jnz after the expansion relies on that.
 */
295#define	COPY_LOOP_BODY(src, dst, cnt)	\
296	prefetchnta	0x100(src, cnt, 8);	\
297	movl	(src, cnt, 8), %esi;		\
298	movnti	%esi, (dst, cnt, 8);		\
299	movl	0x4(src, cnt, 8), %esi;		\
300	movnti	%esi, 0x4(dst, cnt, 8);		\
301	movl	0x8(src, cnt, 8), %esi;		\
302	movnti	%esi, 0x8(dst, cnt, 8);		\
303	movl	0xc(src, cnt, 8), %esi;		\
304	movnti	%esi, 0xc(dst, cnt, 8);		\
305	addl	$2, cnt
306
307	/*
308	 * kcopy_nta is not implemented for 32-bit as no performance
309	 * improvement was shown.  We simply jump directly to kcopy
310	 * and ignore the 4th argument (copy_cached).
311	 */
312	ENTRY(kcopy_nta)
313	jmp	kcopy
314
	/*
	 * NOTE(review): the lea below sits between an unconditional jmp and
	 * the do_copy_fault_nta label, so nothing in this part of the file
	 * reaches it.  Code entering at do_copy_fault_nta therefore has to
	 * arrive with the lofault value already in %eax.
	 */
315	lea	_kcopy_nta_copyerr, %eax	/* lofault value */
316	ALTENTRY(do_copy_fault_nta)
317	pushl	%ebp
318	movl	%esp, %ebp		/* setup stack frame */
319	pushl	%esi
320	pushl	%edi
321
322	movl	%gs:CPU_THREAD, %edx
323	movl	T_LOFAULT(%edx), %edi
324	pushl	%edi			/* save the current lofault */
325	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
326
327	/* COPY_LOOP_BODY needs to use %esi */
328	movl	ARG_COUNT(%ebp), %ecx
329	movl	ARG_FROM(%ebp), %edi
330	movl	ARG_TO(%ebp), %eax
331	COPY_LOOP_INIT(%edi, %eax, %ecx)
3321:	COPY_LOOP_BODY(%edi, %eax, %ecx)
333	jnz	1b
334	mfence
335
336	xorl	%eax, %eax
	/*
	 * Common exit: pop the saved lofault value, then %edi and %esi, in
	 * the reverse of the order in which they were pushed above.
	 */
337_kcopy_nta_copyerr:
338	popl	%ecx
339	popl	%edi
340	movl	%ecx, T_LOFAULT(%edx)	/* restore the original lofault */
341	popl	%esi
342	leave
343	ret
344	SET_SIZE(do_copy_fault_nta)
345	SET_SIZE(kcopy_nta)
346
347#undef	ARG_FROM
348#undef	ARG_TO
349#undef	ARG_COUNT
350
351#endif	/* __i386 */
352#endif	/* __lint */
353
354#if defined(__lint)
355
/*
 * Lint-only stand-in for bcopy.  It is compiled only when __lint is defined
 * and does nothing; the real routine is the assembly that follows.
 */
356/* ARGSUSED */
357void
358bcopy(const void *from, void *to, size_t count)
359{}
360
361#else	/* __lint */
362
363#if defined(__amd64)
364
/*
 * bcopy (amd64): copy count bytes from `from' to `to'.
 *
 * In:	%rdi = from, %rsi = to, %rdx = count
 * The copy paths modify %rdi, %rsi and %rdx and use %rcx, %r8 and %r10 as
 * scratch registers.
 *
 * Counts below 0x50 are dispatched through the fwdPxQx offset table to
 * straight-line code; larger counts go to bcopy_ck_size.
 */
365	ENTRY(bcopy)
366#ifdef DEBUG
	/*
	 * A zero count skips the check.  Otherwise panic unless both from
	 * and to are at or above postbootkernelbase.
	 */
367	orq	%rdx, %rdx		/* %rdx = count */
368	jz	1f
369	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
370	jb	0f
371	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
372	jnb	1f
3730:	leaq	.bcopy_panic_msg(%rip), %rdi
374	jmp	call_panic		/* setup stack and call panic */
3751:
376#endif
377	/*
378	 * bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
379	 * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
380	 * uses these registers in future they must be saved and restored.
381	 */
382	ALTENTRY(bcopy_altentry)
383do_copy:
384#define	L(s) .bcopy/**/s
385	cmpq	$0x50, %rdx		/* 80 */
386	jae	bcopy_ck_size
387
388	/*
389	 * Performance data shows many callers copy small buffers. So for
390	 * best perf for these sizes unrolled code is used. Store data without
391	 * worrying about alignment.
	 *
	 * Both pointers are advanced to the end of their buffers because the
	 * unrolled code addresses backwards from there.  The table entry for
	 * this count is a signed 32-bit offset from the table's own address.
392	 */
393	leaq	L(fwdPxQx)(%rip), %r10
394	addq	%rdx, %rdi
395	addq	%rdx, %rsi
396	movslq	(%r10,%rdx,4), %rcx
397	leaq	(%rcx,%r10,1), %r10
398	jmpq	*%r10
399
/*
 * Offset table for the unrolled copies, indexed by byte count (0 to 79).
 * Entry n holds the 32-bit offset, relative to the table itself, of the
 * label L(PxQy) with x = n modulo 8 (trailing bytes) and y = n / 8
 * (quadwords).
 */
400	.p2align 4
401L(fwdPxQx):
402	.int       L(P0Q0)-L(fwdPxQx)	/* 0 */
403	.int       L(P1Q0)-L(fwdPxQx)
404	.int       L(P2Q0)-L(fwdPxQx)
405	.int       L(P3Q0)-L(fwdPxQx)
406	.int       L(P4Q0)-L(fwdPxQx)
407	.int       L(P5Q0)-L(fwdPxQx)
408	.int       L(P6Q0)-L(fwdPxQx)
409	.int       L(P7Q0)-L(fwdPxQx)
410
411	.int       L(P0Q1)-L(fwdPxQx)	/* 8 */
412	.int       L(P1Q1)-L(fwdPxQx)
413	.int       L(P2Q1)-L(fwdPxQx)
414	.int       L(P3Q1)-L(fwdPxQx)
415	.int       L(P4Q1)-L(fwdPxQx)
416	.int       L(P5Q1)-L(fwdPxQx)
417	.int       L(P6Q1)-L(fwdPxQx)
418	.int       L(P7Q1)-L(fwdPxQx)
419
420	.int       L(P0Q2)-L(fwdPxQx)	/* 16 */
421	.int       L(P1Q2)-L(fwdPxQx)
422	.int       L(P2Q2)-L(fwdPxQx)
423	.int       L(P3Q2)-L(fwdPxQx)
424	.int       L(P4Q2)-L(fwdPxQx)
425	.int       L(P5Q2)-L(fwdPxQx)
426	.int       L(P6Q2)-L(fwdPxQx)
427	.int       L(P7Q2)-L(fwdPxQx)
428
429	.int       L(P0Q3)-L(fwdPxQx)	/* 24 */
430	.int       L(P1Q3)-L(fwdPxQx)
431	.int       L(P2Q3)-L(fwdPxQx)
432	.int       L(P3Q3)-L(fwdPxQx)
433	.int       L(P4Q3)-L(fwdPxQx)
434	.int       L(P5Q3)-L(fwdPxQx)
435	.int       L(P6Q3)-L(fwdPxQx)
436	.int       L(P7Q3)-L(fwdPxQx)
437
438	.int       L(P0Q4)-L(fwdPxQx)	/* 32 */
439	.int       L(P1Q4)-L(fwdPxQx)
440	.int       L(P2Q4)-L(fwdPxQx)
441	.int       L(P3Q4)-L(fwdPxQx)
442	.int       L(P4Q4)-L(fwdPxQx)
443	.int       L(P5Q4)-L(fwdPxQx)
444	.int       L(P6Q4)-L(fwdPxQx)
445	.int       L(P7Q4)-L(fwdPxQx)
446
447	.int       L(P0Q5)-L(fwdPxQx)	/* 40 */
448	.int       L(P1Q5)-L(fwdPxQx)
449	.int       L(P2Q5)-L(fwdPxQx)
450	.int       L(P3Q5)-L(fwdPxQx)
451	.int       L(P4Q5)-L(fwdPxQx)
452	.int       L(P5Q5)-L(fwdPxQx)
453	.int       L(P6Q5)-L(fwdPxQx)
454	.int       L(P7Q5)-L(fwdPxQx)
455
456	.int       L(P0Q6)-L(fwdPxQx)	/* 48 */
457	.int       L(P1Q6)-L(fwdPxQx)
458	.int       L(P2Q6)-L(fwdPxQx)
459	.int       L(P3Q6)-L(fwdPxQx)
460	.int       L(P4Q6)-L(fwdPxQx)
461	.int       L(P5Q6)-L(fwdPxQx)
462	.int       L(P6Q6)-L(fwdPxQx)
463	.int       L(P7Q6)-L(fwdPxQx)
464
465	.int       L(P0Q7)-L(fwdPxQx)	/* 56 */
466	.int       L(P1Q7)-L(fwdPxQx)
467	.int       L(P2Q7)-L(fwdPxQx)
468	.int       L(P3Q7)-L(fwdPxQx)
469	.int       L(P4Q7)-L(fwdPxQx)
470	.int       L(P5Q7)-L(fwdPxQx)
471	.int       L(P6Q7)-L(fwdPxQx)
472	.int       L(P7Q7)-L(fwdPxQx)
473
474	.int       L(P0Q8)-L(fwdPxQx)	/* 64 */
475	.int       L(P1Q8)-L(fwdPxQx)
476	.int       L(P2Q8)-L(fwdPxQx)
477	.int       L(P3Q8)-L(fwdPxQx)
478	.int       L(P4Q8)-L(fwdPxQx)
479	.int       L(P5Q8)-L(fwdPxQx)
480	.int       L(P6Q8)-L(fwdPxQx)
481	.int       L(P7Q8)-L(fwdPxQx)
482
483	.int       L(P0Q9)-L(fwdPxQx)	/* 72 */
484	.int       L(P1Q9)-L(fwdPxQx)
485	.int       L(P2Q9)-L(fwdPxQx)
486	.int       L(P3Q9)-L(fwdPxQx)
487	.int       L(P4Q9)-L(fwdPxQx)
488	.int       L(P5Q9)-L(fwdPxQx)
489	.int       L(P6Q9)-L(fwdPxQx)
490	.int       L(P7Q9)-L(fwdPxQx)	/* 79 */
491
/*
 * Unrolled copy ladders, one per trailing-byte count x (0 to 7).
 *
 * On entry %rdi and %rsi point just past the end of the source and
 * destination, so every access uses a negative displacement.  Entering ladder
 * x at L(PxQy) copies y quadwords by falling through the rungs below it and
 * then copies the last x bytes at L(PxQ0), for 8*y + x bytes in total.
 * %rcx, %r8 and %r10 are the only scratch registers used.
 */
492	.p2align 4
493L(P0Q9):
494	mov    -0x48(%rdi), %rcx
495	mov    %rcx, -0x48(%rsi)
496L(P0Q8):
497	mov    -0x40(%rdi), %r10
498	mov    %r10, -0x40(%rsi)
499L(P0Q7):
500	mov    -0x38(%rdi), %r8
501	mov    %r8, -0x38(%rsi)
502L(P0Q6):
503	mov    -0x30(%rdi), %rcx
504	mov    %rcx, -0x30(%rsi)
505L(P0Q5):
506	mov    -0x28(%rdi), %r10
507	mov    %r10, -0x28(%rsi)
508L(P0Q4):
509	mov    -0x20(%rdi), %r8
510	mov    %r8, -0x20(%rsi)
511L(P0Q3):
512	mov    -0x18(%rdi), %rcx
513	mov    %rcx, -0x18(%rsi)
514L(P0Q2):
515	mov    -0x10(%rdi), %r10
516	mov    %r10, -0x10(%rsi)
517L(P0Q1):
518	mov    -0x8(%rdi), %r8
519	mov    %r8, -0x8(%rsi)
520L(P0Q0):
521	ret
522
523	.p2align 4
524L(P1Q9):
525	mov    -0x49(%rdi), %r8
526	mov    %r8, -0x49(%rsi)
527L(P1Q8):
528	mov    -0x41(%rdi), %rcx
529	mov    %rcx, -0x41(%rsi)
530L(P1Q7):
531	mov    -0x39(%rdi), %r10
532	mov    %r10, -0x39(%rsi)
533L(P1Q6):
534	mov    -0x31(%rdi), %r8
535	mov    %r8, -0x31(%rsi)
536L(P1Q5):
537	mov    -0x29(%rdi), %rcx
538	mov    %rcx, -0x29(%rsi)
539L(P1Q4):
540	mov    -0x21(%rdi), %r10
541	mov    %r10, -0x21(%rsi)
542L(P1Q3):
543	mov    -0x19(%rdi), %r8
544	mov    %r8, -0x19(%rsi)
545L(P1Q2):
546	mov    -0x11(%rdi), %rcx
547	mov    %rcx, -0x11(%rsi)
548L(P1Q1):
549	mov    -0x9(%rdi), %r10
550	mov    %r10, -0x9(%rsi)
551L(P1Q0):
552	movzbq -0x1(%rdi), %r8
553	mov    %r8b, -0x1(%rsi)
554	ret
555
556	.p2align 4
557L(P2Q9):
558	mov    -0x4a(%rdi), %r8
559	mov    %r8, -0x4a(%rsi)
560L(P2Q8):
561	mov    -0x42(%rdi), %rcx
562	mov    %rcx, -0x42(%rsi)
563L(P2Q7):
564	mov    -0x3a(%rdi), %r10
565	mov    %r10, -0x3a(%rsi)
566L(P2Q6):
567	mov    -0x32(%rdi), %r8
568	mov    %r8, -0x32(%rsi)
569L(P2Q5):
570	mov    -0x2a(%rdi), %rcx
571	mov    %rcx, -0x2a(%rsi)
572L(P2Q4):
573	mov    -0x22(%rdi), %r10
574	mov    %r10, -0x22(%rsi)
575L(P2Q3):
576	mov    -0x1a(%rdi), %r8
577	mov    %r8, -0x1a(%rsi)
578L(P2Q2):
579	mov    -0x12(%rdi), %rcx
580	mov    %rcx, -0x12(%rsi)
581L(P2Q1):
582	mov    -0xa(%rdi), %r10
583	mov    %r10, -0xa(%rsi)
584L(P2Q0):
585	movzwq -0x2(%rdi), %r8
586	mov    %r8w, -0x2(%rsi)
587	ret
588
589	.p2align 4
590L(P3Q9):
591	mov    -0x4b(%rdi), %r8
592	mov    %r8, -0x4b(%rsi)
593L(P3Q8):
594	mov    -0x43(%rdi), %rcx
595	mov    %rcx, -0x43(%rsi)
596L(P3Q7):
597	mov    -0x3b(%rdi), %r10
598	mov    %r10, -0x3b(%rsi)
599L(P3Q6):
600	mov    -0x33(%rdi), %r8
601	mov    %r8, -0x33(%rsi)
602L(P3Q5):
603	mov    -0x2b(%rdi), %rcx
604	mov    %rcx, -0x2b(%rsi)
605L(P3Q4):
606	mov    -0x23(%rdi), %r10
607	mov    %r10, -0x23(%rsi)
608L(P3Q3):
609	mov    -0x1b(%rdi), %r8
610	mov    %r8, -0x1b(%rsi)
611L(P3Q2):
612	mov    -0x13(%rdi), %rcx
613	mov    %rcx, -0x13(%rsi)
614L(P3Q1):
615	mov    -0xb(%rdi), %r10
616	mov    %r10, -0xb(%rsi)
617	/*
618	 * These trailing loads/stores have to do all their loads 1st,
619	 * then do the stores.
	 * The P5Q0, P6Q0 and P7Q0 tails further down follow the same
	 * loads-then-stores order.
620	 */
621L(P3Q0):
622	movzwq -0x3(%rdi), %r8
623	movzbq -0x1(%rdi), %r10
624	mov    %r8w, -0x3(%rsi)
625	mov    %r10b, -0x1(%rsi)
626	ret
627
628	.p2align 4
629L(P4Q9):
630	mov    -0x4c(%rdi), %r8
631	mov    %r8, -0x4c(%rsi)
632L(P4Q8):
633	mov    -0x44(%rdi), %rcx
634	mov    %rcx, -0x44(%rsi)
635L(P4Q7):
636	mov    -0x3c(%rdi), %r10
637	mov    %r10, -0x3c(%rsi)
638L(P4Q6):
639	mov    -0x34(%rdi), %r8
640	mov    %r8, -0x34(%rsi)
641L(P4Q5):
642	mov    -0x2c(%rdi), %rcx
643	mov    %rcx, -0x2c(%rsi)
644L(P4Q4):
645	mov    -0x24(%rdi), %r10
646	mov    %r10, -0x24(%rsi)
647L(P4Q3):
648	mov    -0x1c(%rdi), %r8
649	mov    %r8, -0x1c(%rsi)
650L(P4Q2):
651	mov    -0x14(%rdi), %rcx
652	mov    %rcx, -0x14(%rsi)
653L(P4Q1):
654	mov    -0xc(%rdi), %r10
655	mov    %r10, -0xc(%rsi)
656L(P4Q0):
657	mov    -0x4(%rdi), %r8d
658	mov    %r8d, -0x4(%rsi)
659	ret
660
661	.p2align 4
662L(P5Q9):
663	mov    -0x4d(%rdi), %r8
664	mov    %r8, -0x4d(%rsi)
665L(P5Q8):
666	mov    -0x45(%rdi), %rcx
667	mov    %rcx, -0x45(%rsi)
668L(P5Q7):
669	mov    -0x3d(%rdi), %r10
670	mov    %r10, -0x3d(%rsi)
671L(P5Q6):
672	mov    -0x35(%rdi), %r8
673	mov    %r8, -0x35(%rsi)
674L(P5Q5):
675	mov    -0x2d(%rdi), %rcx
676	mov    %rcx, -0x2d(%rsi)
677L(P5Q4):
678	mov    -0x25(%rdi), %r10
679	mov    %r10, -0x25(%rsi)
680L(P5Q3):
681	mov    -0x1d(%rdi), %r8
682	mov    %r8, -0x1d(%rsi)
683L(P5Q2):
684	mov    -0x15(%rdi), %rcx
685	mov    %rcx, -0x15(%rsi)
686L(P5Q1):
687	mov    -0xd(%rdi), %r10
688	mov    %r10, -0xd(%rsi)
689L(P5Q0):
690	mov    -0x5(%rdi), %r8d
691	movzbq -0x1(%rdi), %r10
692	mov    %r8d, -0x5(%rsi)
693	mov    %r10b, -0x1(%rsi)
694	ret
695
696	.p2align 4
697L(P6Q9):
698	mov    -0x4e(%rdi), %r8
699	mov    %r8, -0x4e(%rsi)
700L(P6Q8):
701	mov    -0x46(%rdi), %rcx
702	mov    %rcx, -0x46(%rsi)
703L(P6Q7):
704	mov    -0x3e(%rdi), %r10
705	mov    %r10, -0x3e(%rsi)
706L(P6Q6):
707	mov    -0x36(%rdi), %r8
708	mov    %r8, -0x36(%rsi)
709L(P6Q5):
710	mov    -0x2e(%rdi), %rcx
711	mov    %rcx, -0x2e(%rsi)
712L(P6Q4):
713	mov    -0x26(%rdi), %r10
714	mov    %r10, -0x26(%rsi)
715L(P6Q3):
716	mov    -0x1e(%rdi), %r8
717	mov    %r8, -0x1e(%rsi)
718L(P6Q2):
719	mov    -0x16(%rdi), %rcx
720	mov    %rcx, -0x16(%rsi)
721L(P6Q1):
722	mov    -0xe(%rdi), %r10
723	mov    %r10, -0xe(%rsi)
724L(P6Q0):
725	mov    -0x6(%rdi), %r8d
726	movzwq -0x2(%rdi), %r10
727	mov    %r8d, -0x6(%rsi)
728	mov    %r10w, -0x2(%rsi)
729	ret
730
731	.p2align 4
732L(P7Q9):
733	mov    -0x4f(%rdi), %r8
734	mov    %r8, -0x4f(%rsi)
735L(P7Q8):
736	mov    -0x47(%rdi), %rcx
737	mov    %rcx, -0x47(%rsi)
738L(P7Q7):
739	mov    -0x3f(%rdi), %r10
740	mov    %r10, -0x3f(%rsi)
741L(P7Q6):
742	mov    -0x37(%rdi), %r8
743	mov    %r8, -0x37(%rsi)
744L(P7Q5):
745	mov    -0x2f(%rdi), %rcx
746	mov    %rcx, -0x2f(%rsi)
747L(P7Q4):
748	mov    -0x27(%rdi), %r10
749	mov    %r10, -0x27(%rsi)
750L(P7Q3):
751	mov    -0x1f(%rdi), %r8
752	mov    %r8, -0x1f(%rsi)
753L(P7Q2):
754	mov    -0x17(%rdi), %rcx
755	mov    %rcx, -0x17(%rsi)
756L(P7Q1):
757	mov    -0xf(%rdi), %r10
758	mov    %r10, -0xf(%rsi)
759L(P7Q0):
760	mov    -0x7(%rdi), %r8d
761	movzwq -0x3(%rdi), %r10
762	movzbq -0x1(%rdi), %rcx
763	mov    %r8d, -0x7(%rsi)
764	mov    %r10w, -0x3(%rsi)
765	mov    %cl, -0x1(%rsi)
766	ret
767
768	/*
769	 * For large sizes rep smovq is fastest.
770	 * Transition point determined experimentally as measured on
771	 * Intel Xeon processors (incl. Nehalem and previous generations) and
772	 * AMD Opteron. The transition value is patched at boot time to avoid
773	 * memory reference hit.
	 *
	 * NOTE(review): nothing in this part of the file branches to
	 * bcopy_patch_start, and it follows a ret, so the cmpq between the
	 * two patch labels appears to serve only as the instruction image
	 * that the boot-time patching copies over the compare at
	 * bcopy_ck_size.  Confirm against the patching code.
774	 */
775	.globl bcopy_patch_start
776bcopy_patch_start:
777	cmpq	$BCOPY_NHM_REP, %rdx
778	.globl bcopy_patch_end
779bcopy_patch_end:
780
	/*
	 * Entered from do_copy with %rdx (count) >= 0x50.  Counts of at
	 * least BCOPY_DFLT_REP use rep smovq; smaller ones use the 64-byte
	 * loop below.
	 */
781	.p2align 4
782	.globl bcopy_ck_size
783bcopy_ck_size:
784	cmpq	$BCOPY_DFLT_REP, %rdx
785	jae	L(use_rep)
786
787	/*
788	 * Align the destination (%rsi) to an 8-byte boundary. Avoids
789	 * penalties from unaligned stores as well as from stores spanning
	 * cachelines.  At most 1 + 2 + 4 = 7 bytes are copied here, so at
	 * least 0x49 bytes remain and the loop below runs at least once.
790	 */
791	test	$0x7, %rsi
792	jz	L(aligned_loop)
793	test	$0x1, %rsi
794	jz	2f
795	movzbq	(%rdi), %r8
796	dec	%rdx
797	inc	%rdi
798	mov	%r8b, (%rsi)
799	inc	%rsi
8002:
801	test	$0x2, %rsi
802	jz	4f
803	movzwq	(%rdi), %r8
804	sub	$0x2, %rdx
805	add	$0x2, %rdi
806	mov	%r8w, (%rsi)
807	add	$0x2, %rsi
8084:
809	test	$0x4, %rsi
810	jz	L(aligned_loop)
811	mov	(%rdi), %r8d
812	sub	$0x4, %rdx
813	add	$0x4, %rdi
814	mov	%r8d, (%rsi)
815	add	$0x4, %rsi
816
817	/*
818	 * Copy 64-bytes per loop
	 * %rdx is reduced by 0x40 with lea, then compared against 0x40 in
	 * the middle of the body.  Only mov and lea follow that cmp, so its
	 * flags are still valid for the jae at the bottom of the loop.
819	 */
820	.p2align 4
821L(aligned_loop):
822	mov	(%rdi), %r8
823	mov	0x8(%rdi), %r10
824	lea	-0x40(%rdx), %rdx
825	mov	%r8, (%rsi)
826	mov	%r10, 0x8(%rsi)
827	mov	0x10(%rdi), %rcx
828	mov	0x18(%rdi), %r8
829	mov	%rcx, 0x10(%rsi)
830	mov	%r8, 0x18(%rsi)
831
832	cmp	$0x40, %rdx
833	mov	0x20(%rdi), %r10
834	mov	0x28(%rdi), %rcx
835	mov	%r10, 0x20(%rsi)
836	mov	%rcx, 0x28(%rsi)
837	mov	0x30(%rdi), %r8
838	mov	0x38(%rdi), %r10
839	lea	0x40(%rdi), %rdi
840	mov	%r8, 0x30(%rsi)
841	mov	%r10, 0x38(%rsi)
842	lea	0x40(%rsi), %rsi
843	jae	L(aligned_loop)
844
845	/*
846	 * Copy remaining bytes (0-63)
	 * Same table dispatch as the small-size path in do_copy: advance
	 * both pointers to the end and jump to the matching unrolled code.
847	 */
848L(do_remainder):
849	leaq	L(fwdPxQx)(%rip), %r10
850	addq	%rdx, %rdi
851	addq	%rdx, %rsi
852	movslq	(%r10,%rdx,4), %rcx
853	leaq	(%rcx,%r10,1), %r10
854	jmpq	*%r10
855
856	/*
857	 * Use rep smovq. Copy remainder via unrolled code
	 * The string instruction takes its source in %rsi and destination in
	 * %rdi, the reverse of bcopy's argument order, hence the xchgq pair.
858	 */
859	.p2align 4
860L(use_rep):
861	xchgq	%rdi, %rsi		/* %rsi = source, %rdi = destination */
862	movq	%rdx, %rcx		/* %rcx = count */
863	shrq	$3, %rcx		/* 8-byte word count */
864	rep
865	  smovq
866
867	xchgq	%rsi, %rdi		/* %rdi = src, %rsi = destination */
868	andq	$7, %rdx		/* remainder */
869	jnz	L(do_remainder)
870	ret
871#undef	L
872
/*
 * call_panic: shared DEBUG-only tail used by the amd64 bcopy, kzero and
 * bzero argument checks.  Callers jump here with %rdi = format string.
 * No ret follows the call to panic.
 */
873#ifdef DEBUG
874	/*
875	 * Setup frame on the run-time stack. The end of the input argument
876	 * area must be aligned on a 16 byte boundary. The stack pointer %rsp,
877	 * always points to the end of the latest allocated stack frame.
878	 * panic(const char *format, ...) is a varargs function. When a
879	 * function taking variable arguments is called, %rax must be set
880	 * to eight times the number of floating point parameters passed
881	 * to the function in SSE registers.
882	 */
883call_panic:
884	pushq	%rbp			/* align stack properly */
885	movq	%rsp, %rbp
886	xorl	%eax, %eax		/* no variable arguments */
887	call	panic			/* %rdi = format string */
888#endif
889	SET_SIZE(bcopy_altentry)
890	SET_SIZE(bcopy)
891
892#elif defined(__i386)
893
894#define	ARG_FROM	4
895#define	ARG_TO		8
896#define	ARG_COUNT	12
897
/*
 * bcopy (i386): copy ARG_COUNT bytes from ARG_FROM to ARG_TO.
 *
 * No frame is built, so the ARG_* offsets here are relative to %esp.  %esi
 * and %edi are parked in %eax and %edx for the duration of the copy instead
 * of being pushed, which keeps those offsets valid throughout.
 */
898	ENTRY(bcopy)
899#ifdef DEBUG
	/*
	 * A zero count skips the check.  Otherwise panic unless both
	 * addresses are at or above postbootkernelbase.
	 */
900	movl	ARG_COUNT(%esp), %eax
901	orl	%eax, %eax
902	jz	1f
903	movl	postbootkernelbase, %eax
904	cmpl	%eax, ARG_FROM(%esp)
905	jb	0f
906	cmpl	%eax, ARG_TO(%esp)
907	jnb	1f
9080:	pushl	%ebp
909	movl	%esp, %ebp
910	pushl	$.bcopy_panic_msg
911	call	panic
9121:
913#endif
914do_copy:
915	movl	%esi, %eax		/* save registers */
916	movl	%edi, %edx
917	movl	ARG_COUNT(%esp), %ecx
918	movl	ARG_FROM(%esp), %esi
919	movl	ARG_TO(%esp), %edi
920
921	shrl	$2, %ecx		/* word count */
922	rep
923	  smovl
924	movl	ARG_COUNT(%esp), %ecx
925	andl	$3, %ecx		/* bytes left over */
926	rep
927	  smovb
928	movl	%eax, %esi		/* restore registers */
929	movl	%edx, %edi
930	ret
931	SET_SIZE(bcopy)
932
933#undef	ARG_COUNT
934#undef	ARG_FROM
935#undef	ARG_TO
936
937#endif	/* __i386 */
938#endif	/* __lint */
939
940
941/*
942 * Zero a block of storage, returning an error code if we
943 * take a kernel pagefault which cannot be resolved.
944 * Returns errno value on pagefault error, 0 if all ok
945 */
946
947#if defined(__lint)
948
/*
 * Lint-only stand-in for kzero.  It is compiled only when __lint is defined
 * and always reports success; the real routine is the assembly that follows.
 */
949/* ARGSUSED */
950int
951kzero(void *addr, size_t count)
952{ return (0); }
953
954#else	/* __lint */
955
956#if defined(__amd64)
957
/*
 * kzero (amd64): zero count bytes at addr with fault recovery.
 *
 * In:	%rdi = addr, %rsi = count (both passed through to bzero_altentry)
 * Out:	%eax = 0 on success.  After a fault, execution resumes at _kzeroerr
 *	with the errno value already in %rax.
 *
 * No frame is built here.  The previous lofault value is held in %r11 and
 * the thread pointer in %r9 across the call to bzero_altentry.
 */
958	ENTRY(kzero)
959#ifdef DEBUG
	/* Panic (via call_panic) if addr is below postbootkernelbase. */
960        cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
961        jnb	0f
962        leaq	.kzero_panic_msg(%rip), %rdi
963	jmp	call_panic		/* setup stack and call panic */
9640:
965#endif
966	/*
967	 * pass lofault value as 3rd argument for fault return
968	 */
969	leaq	_kzeroerr(%rip), %rdx
970
971	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
972	movq	T_LOFAULT(%r9), %r11	/* save the current lofault */
973	movq	%rdx, T_LOFAULT(%r9)	/* new lofault */
974	call	bzero_altentry
975	xorl	%eax, %eax
976	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
977	ret
978	/*
979	 * A fault during bzero is indicated through an errno value
980	 * in %rax when we iretq to here.
	 * The return address pushed by the call above is still on the stack
	 * at that point and has to be dropped before returning to kzero's
	 * own caller.
981	 */
982_kzeroerr:
983	addq	$8, %rsp		/* pop bzero_altentry call ret addr */
984	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
985	ret
986	SET_SIZE(kzero)
987
988#elif defined(__i386)
989
990#define	ARG_ADDR	8
991#define	ARG_COUNT	12
992
/*
 * kzero (i386): zero ARG_COUNT bytes at ARG_ADDR with fault recovery.
 * Arguments are on the stack; the ARG_* offsets are relative to %ebp once a
 * frame has been pushed.
 *
 * Out:	%eax = 0 on success.  After a fault, execution resumes at _kzeroerr
 *	with the errno value already in %eax.
 */
993	ENTRY(kzero)
994#ifdef DEBUG
	/*
	 * Push a temporary frame so ARG_ADDR is valid, panic if the address
	 * is below postbootkernelbase, then pop the frame again at 0:.
	 */
995	pushl	%ebp
996	movl	%esp, %ebp
997	movl	postbootkernelbase, %eax
998        cmpl	%eax, ARG_ADDR(%ebp)
999        jnb	0f
1000        pushl   $.kzero_panic_msg
1001        call    panic
10020:	popl	%ebp
1003#endif
1004	lea	_kzeroerr, %eax		/* kzeroerr is lofault value */
1005
1006	pushl	%ebp			/* save stack base */
1007	movl	%esp, %ebp		/* set new stack base */
1008	pushl	%edi			/* save %edi */
1009
1010	mov	%gs:CPU_THREAD, %edx
1011	movl	T_LOFAULT(%edx), %edi
1012	pushl	%edi			/* save the current lofault */
1013	movl	%eax, T_LOFAULT(%edx)	/* new lofault */
1014
1015	movl	ARG_COUNT(%ebp), %ecx	/* get size in bytes */
1016	movl	ARG_ADDR(%ebp), %edi	/* %edi <- address of bytes to clear */
1017	shrl	$2, %ecx		/* Count of double words to zero */
1018	xorl	%eax, %eax		/* sstol val */
1019	rep
1020	  sstol			/* %ecx contains words to clear (%eax=0) */
1021
1022	movl	ARG_COUNT(%ebp), %ecx	/* get size in bytes */
1023	andl	$3, %ecx		/* do mod 4 */
1024	rep
1025	  sstob			/* %ecx contains residual bytes to clear */
1026
1027	/*
1028	 * A fault during kzero is indicated through an errno value
1029	 * in %eax when we iret to here.
	 * The success path falls through to the same label; %eax is still
	 * zero from the xorl above, which gives the 0 return value.
1030	 */
1031_kzeroerr:
1032	popl	%edi
1033	movl	%edi, T_LOFAULT(%edx)	/* restore the original lofault */
1034	popl	%edi
1035	popl	%ebp
1036	ret
1037	SET_SIZE(kzero)
1038
1039#undef	ARG_ADDR
1040#undef	ARG_COUNT
1041
1042#endif	/* __i386 */
1043#endif	/* __lint */
1044
1045/*
1046 * Zero a block of storage.
1047 */
1048
1049#if defined(__lint)
1050
/*
 * Lint-only stand-in for bzero.  It is compiled only when __lint is defined
 * and does nothing; the real routine is the assembly that follows.
 */
1051/* ARGSUSED */
1052void
1053bzero(void *addr, size_t count)
1054{}
1055
1056#else	/* __lint */
1057
1058#if defined(__amd64)
1059
1060	ENTRY(bzero)
1061#ifdef DEBUG
1062	cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
1063	jnb	0f
1064	leaq	.bzero_panic_msg(%rip), %rdi
1065	jmp	call_panic		/* setup stack and call panic */
10660:
1067#endif
1068	ALTENTRY(bzero_altentry)
1069do_zero:
1070#define	L(s) .bzero/**/s
1071	xorl	%eax, %eax
1072
1073	cmpq	$0x50, %rsi		/* 80 */
1074	jae	L(ck_align)
1075
1076	/*
1077	 * Performance data shows many caller's are zeroing small buffers. So
1078	 * for best perf for these sizes unrolled code is used. Store zeros
1079	 * without worrying about alignment.
1080	 */
1081	leaq	L(setPxQx)(%rip), %r10
1082	addq	%rsi, %rdi
1083	movslq	(%r10,%rsi,4), %rcx
1084	leaq	(%rcx,%r10,1), %r10
1085	jmpq	*%r10
1086
1087	.p2align 4
1088L(setPxQx):
1089	.int       L(P0Q0)-L(setPxQx)	/* 0 */
1090	.int       L(P1Q0)-L(setPxQx)
1091	.int       L(P2Q0)-L(setPxQx)
1092	.int       L(P3Q0)-L(setPxQx)
1093	.int       L(P4Q0)-L(setPxQx)
1094	.int       L(P5Q0)-L(setPxQx)
1095	.int       L(P6Q0)-L(setPxQx)
1096	.int       L(P7Q0)-L(setPxQx)
1097
1098	.int       L(P0Q1)-L(setPxQx)	/* 8 */
1099	.int       L(P1Q1)-L(setPxQx)
1100	.int       L(P2Q1)-L(setPxQx)
1101	.int       L(P3Q1)-L(setPxQx)
1102	.int       L(P4Q1)-L(setPxQx)
1103	.int       L(P5Q1)-L(setPxQx)
1104	.int       L(P6Q1)-L(setPxQx)
1105	.int       L(P7Q1)-L(setPxQx)
1106
1107	.int       L(P0Q2)-L(setPxQx)	/* 16 */
1108	.int       L(P1Q2)-L(setPxQx)
1109	.int       L(P2Q2)-L(setPxQx)
1110	.int       L(P3Q2)-L(setPxQx)
1111	.int       L(P4Q2)-L(setPxQx)
1112	.int       L(P5Q2)-L(setPxQx)
1113	.int       L(P6Q2)-L(setPxQx)
1114	.int       L(P7Q2)-L(setPxQx)
1115
1116	.int       L(P0Q3)-L(setPxQx)	/* 24 */
1117	.int       L(P1Q3)-L(setPxQx)
1118	.int       L(P2Q3)-L(setPxQx)
1119	.int       L(P3Q3)-L(setPxQx)
1120	.int       L(P4Q3)-L(setPxQx)
1121	.int       L(P5Q3)-L(setPxQx)
1122	.int       L(P6Q3)-L(setPxQx)
1123	.int       L(P7Q3)-L(setPxQx)
1124
1125	.int       L(P0Q4)-L(setPxQx)	/* 32 */
1126	.int       L(P1Q4)-L(setPxQx)
1127	.int       L(P2Q4)-L(setPxQx)
1128	.int       L(P3Q4)-L(setPxQx)
1129	.int       L(P4Q4)-L(setPxQx)
1130	.int       L(P5Q4)-L(setPxQx)
1131	.int       L(P6Q4)-L(setPxQx)
1132	.int       L(P7Q4)-L(setPxQx)
1133
1134	.int       L(P0Q5)-L(setPxQx)	/* 40 */
1135	.int       L(P1Q5)-L(setPxQx)
1136	.int       L(P2Q5)-L(setPxQx)
1137	.int       L(P3Q5)-L(setPxQx)
1138	.int       L(P4Q5)-L(setPxQx)
1139	.int       L(P5Q5)-L(setPxQx)
1140	.int       L(P6Q5)-L(setPxQx)
1141	.int       L(P7Q5)-L(setPxQx)
1142
1143	.int       L(P0Q6)-L(setPxQx)	/* 48 */
1144	.int       L(P1Q6)-L(setPxQx)
1145	.int       L(P2Q6)-L(setPxQx)
1146	.int       L(P3Q6)-L(setPxQx)
1147	.int       L(P4Q6)-L(setPxQx)
1148	.int       L(P5Q6)-L(setPxQx)
1149	.int       L(P6Q6)-L(setPxQx)
1150	.int       L(P7Q6)-L(setPxQx)
1151
1152	.int       L(P0Q7)-L(setPxQx)	/* 56 */
1153	.int       L(P1Q7)-L(setPxQx)
1154	.int       L(P2Q7)-L(setPxQx)
1155	.int       L(P3Q7)-L(setPxQx)
1156	.int       L(P4Q7)-L(setPxQx)
1157	.int       L(P5Q7)-L(setPxQx)
1158	.int       L(P6Q7)-L(setPxQx)
1159	.int       L(P7Q7)-L(setPxQx)
1160
1161	.int       L(P0Q8)-L(setPxQx)	/* 64 */
1162	.int       L(P1Q8)-L(setPxQx)
1163	.int       L(P2Q8)-L(setPxQx)
1164	.int       L(P3Q8)-L(setPxQx)
1165	.int       L(P4Q8)-L(setPxQx)
1166	.int       L(P5Q8)-L(setPxQx)
1167	.int       L(P6Q8)-L(setPxQx)
1168	.int       L(P7Q8)-L(setPxQx)
1169
1170	.int       L(P0Q9)-L(setPxQx)	/* 72 */
1171	.int       L(P1Q9)-L(setPxQx)
1172	.int       L(P2Q9)-L(setPxQx)
1173	.int       L(P3Q9)-L(setPxQx)
1174	.int       L(P4Q9)-L(setPxQx)
1175	.int       L(P5Q9)-L(setPxQx)
1176	.int       L(P6Q9)-L(setPxQx)
1177	.int       L(P7Q9)-L(setPxQx)	/* 79 */
1178
1179	.p2align 4
1180L(P0Q9): mov    %rax, -0x48(%rdi)
1181L(P0Q8): mov    %rax, -0x40(%rdi)
1182L(P0Q7): mov    %rax, -0x38(%rdi)
1183L(P0Q6): mov    %rax, -0x30(%rdi)
1184L(P0Q5): mov    %rax, -0x28(%rdi)
1185L(P0Q4): mov    %rax, -0x20(%rdi)
1186L(P0Q3): mov    %rax, -0x18(%rdi)
1187L(P0Q2): mov    %rax, -0x10(%rdi)
1188L(P0Q1): mov    %rax, -0x8(%rdi)
1189L(P0Q0):
1190	 ret
1191
1192	.p2align 4
1193L(P1Q9): mov    %rax, -0x49(%rdi)
1194L(P1Q8): mov    %rax, -0x41(%rdi)
1195L(P1Q7): mov    %rax, -0x39(%rdi)
1196L(P1Q6): mov    %rax, -0x31(%rdi)
1197L(P1Q5): mov    %rax, -0x29(%rdi)
1198L(P1Q4): mov    %rax, -0x21(%rdi)
1199L(P1Q3): mov    %rax, -0x19(%rdi)
1200L(P1Q2): mov    %rax, -0x11(%rdi)
1201L(P1Q1): mov    %rax, -0x9(%rdi)
1202L(P1Q0): mov    %al, -0x1(%rdi)
1203	 ret
1204
1205	.p2align 4
1206L(P2Q9): mov    %rax, -0x4a(%rdi)
1207L(P2Q8): mov    %rax, -0x42(%rdi)
1208L(P2Q7): mov    %rax, -0x3a(%rdi)
1209L(P2Q6): mov    %rax, -0x32(%rdi)
1210L(P2Q5): mov    %rax, -0x2a(%rdi)
1211L(P2Q4): mov    %rax, -0x22(%rdi)
1212L(P2Q3): mov    %rax, -0x1a(%rdi)
1213L(P2Q2): mov    %rax, -0x12(%rdi)
1214L(P2Q1): mov    %rax, -0xa(%rdi)
1215L(P2Q0): mov    %ax, -0x2(%rdi)
1216	 ret
1217
1218	.p2align 4
1219L(P3Q9): mov    %rax, -0x4b(%rdi)
1220L(P3Q8): mov    %rax, -0x43(%rdi)
1221L(P3Q7): mov    %rax, -0x3b(%rdi)
1222L(P3Q6): mov    %rax, -0x33(%rdi)
1223L(P3Q5): mov    %rax, -0x2b(%rdi)
1224L(P3Q4): mov    %rax, -0x23(%rdi)
1225L(P3Q3): mov    %rax, -0x1b(%rdi)
1226L(P3Q2): mov    %rax, -0x13(%rdi)
1227L(P3Q1): mov    %rax, -0xb(%rdi)
1228L(P3Q0): mov    %ax, -0x3(%rdi)
1229	 mov    %al, -0x1(%rdi)
1230	 ret
1231
1232	.p2align 4
1233L(P4Q9): mov    %rax, -0x4c(%rdi)
1234L(P4Q8): mov    %rax, -0x44(%rdi)
1235L(P4Q7): mov    %rax, -0x3c(%rdi)
1236L(P4Q6): mov    %rax, -0x34(%rdi)
1237L(P4Q5): mov    %rax, -0x2c(%rdi)
1238L(P4Q4): mov    %rax, -0x24(%rdi)
1239L(P4Q3): mov    %rax, -0x1c(%rdi)
1240L(P4Q2): mov    %rax, -0x14(%rdi)
1241L(P4Q1): mov    %rax, -0xc(%rdi)
1242L(P4Q0): mov    %eax, -0x4(%rdi)
1243	 ret
1244
1245	.p2align 4
1246L(P5Q9): mov    %rax, -0x4d(%rdi)
1247L(P5Q8): mov    %rax, -0x45(%rdi)
1248L(P5Q7): mov    %rax, -0x3d(%rdi)
1249L(P5Q6): mov    %rax, -0x35(%rdi)
1250L(P5Q5): mov    %rax, -0x2d(%rdi)
1251L(P5Q4): mov    %rax, -0x25(%rdi)
1252L(P5Q3): mov    %rax, -0x1d(%rdi)
1253L(P5Q2): mov    %rax, -0x15(%rdi)
1254L(P5Q1): mov    %rax, -0xd(%rdi)
1255L(P5Q0): mov    %eax, -0x5(%rdi)
1256	 mov    %al, -0x1(%rdi)
1257	 ret
1258
1259	.p2align 4
1260L(P6Q9): mov    %rax, -0x4e(%rdi)
1261L(P6Q8): mov    %rax, -0x46(%rdi)
1262L(P6Q7): mov    %rax, -0x3e(%rdi)
1263L(P6Q6): mov    %rax, -0x36(%rdi)
1264L(P6Q5): mov    %rax, -0x2e(%rdi)
1265L(P6Q4): mov    %rax, -0x26(%rdi)
1266L(P6Q3): mov    %rax, -0x1e(%rdi)
1267L(P6Q2): mov    %rax, -0x16(%rdi)
1268L(P6Q1): mov    %rax, -0xe(%rdi)
1269L(P6Q0): mov    %eax, -0x6(%rdi)
1270	 mov    %ax, -0x2(%rdi)
1271	 ret
1272
1273	.p2align 4
1274L(P7Q9): mov    %rax, -0x4f(%rdi)
1275L(P7Q8): mov    %rax, -0x47(%rdi)
1276L(P7Q7): mov    %rax, -0x3f(%rdi)
1277L(P7Q6): mov    %rax, -0x37(%rdi)
1278L(P7Q5): mov    %rax, -0x2f(%rdi)
1279L(P7Q4): mov    %rax, -0x27(%rdi)
1280L(P7Q3): mov    %rax, -0x1f(%rdi)
1281L(P7Q2): mov    %rax, -0x17(%rdi)
1282L(P7Q1): mov    %rax, -0xf(%rdi)
1283L(P7Q0): mov    %eax, -0x7(%rdi)
1284	 mov    %ax, -0x3(%rdi)
1285	 mov    %al, -0x1(%rdi)
1286	 ret
1287
1288	/*
1289	 * Align to a 16-byte boundary. Avoids penalties from unaligned stores
1290	 * as well as from stores spanning cachelines. Note 16-byte alignment
1291	 * is better in case where rep sstosq is used.
1292	 */
1293	.p2align 4
1294L(ck_align):
1295	test	$0xf, %rdi
1296	jz	L(aligned_now)
1297	test	$1, %rdi
1298	jz	2f
1299	mov	%al, (%rdi)
1300	dec	%rsi
1301	lea	1(%rdi),%rdi
13022:
1303	test	$2, %rdi
1304	jz	4f
1305	mov	%ax, (%rdi)
1306	sub	$2, %rsi
1307	lea	2(%rdi),%rdi
13084:
1309	test	$4, %rdi
1310	jz	8f
1311	mov	%eax, (%rdi)
1312	sub	$4, %rsi
1313	lea	4(%rdi),%rdi
13148:
1315	test	$8, %rdi
1316	jz	L(aligned_now)
1317	mov	%rax, (%rdi)
1318	sub	$8, %rsi
1319	lea	8(%rdi),%rdi
1320
1321	/*
1322	 * For large sizes rep sstoq is fastest.
1323	 * Transition point determined experimentally as measured on
1324	 * Intel Xeon processors (incl. Nehalem) and AMD Opteron.
1325	 */
1326L(aligned_now):
1327	cmp	$BZERO_USE_REP, %rsi
1328	ja	L(use_rep)
1329
1330	/*
1331	 * zero 64-bytes per loop
1332	 */
1333	.p2align 4
1334L(bzero_loop):
1335	leaq	-0x40(%rsi), %rsi
1336	cmpq	$0x40, %rsi
1337	movq	%rax, (%rdi)
1338	movq	%rax, 0x8(%rdi)
1339	movq	%rax, 0x10(%rdi)
1340	movq	%rax, 0x18(%rdi)
1341	movq	%rax, 0x20(%rdi)
1342	movq	%rax, 0x28(%rdi)
1343	movq	%rax, 0x30(%rdi)
1344	movq	%rax, 0x38(%rdi)
1345	leaq	0x40(%rdi), %rdi
1346	jae	L(bzero_loop)
1347
1348	/*
1349	 * Clear any remaining bytes..
1350	 */
13519:
1352	leaq	L(setPxQx)(%rip), %r10
1353	addq	%rsi, %rdi
1354	movslq	(%r10,%rsi,4), %rcx
1355	leaq	(%rcx,%r10,1), %r10
1356	jmpq	*%r10
1357
1358	/*
1359	 * Use rep sstoq. Clear any remainder via unrolled code
1360	 */
1361	.p2align 4
1362L(use_rep):
1363	movq	%rsi, %rcx		/* get size in bytes */
1364	shrq	$3, %rcx		/* count of 8-byte words to zero */
1365	rep
1366	  sstoq				/* %rcx = words to clear (%rax=0) */
1367	andq	$7, %rsi		/* remaining bytes */
1368	jnz	9b
1369	ret
1370#undef	L
1371	SET_SIZE(bzero_altentry)
1372	SET_SIZE(bzero)
1373
1374#elif defined(__i386)
1375
1376#define	ARG_ADDR	4
1377#define	ARG_COUNT	8
1378
1379	ENTRY(bzero)
1380#ifdef DEBUG
1381	movl	postbootkernelbase, %eax
1382	cmpl	%eax, ARG_ADDR(%esp)
1383	jnb	0f
1384	pushl	%ebp
1385	movl	%esp, %ebp
1386	pushl	$.bzero_panic_msg
1387	call	panic
13880:
1389#endif
1390do_zero:
1391	movl	%edi, %edx
1392	movl	ARG_COUNT(%esp), %ecx
1393	movl	ARG_ADDR(%esp), %edi
1394	shrl	$2, %ecx
1395	xorl	%eax, %eax
1396	rep
1397	  sstol
1398	movl	ARG_COUNT(%esp), %ecx
1399	andl	$3, %ecx
1400	rep
1401	  sstob
1402	movl	%edx, %edi
1403	ret
1404	SET_SIZE(bzero)
1405
1406#undef	ARG_ADDR
1407#undef	ARG_COUNT
1408
1409#endif	/* __i386 */
1410#endif	/* __lint */
1411
1412/*
1413 * Transfer data to and from user space -
1414 * Note that these routines can cause faults
1415 * It is assumed that the kernel has nothing at
1416 * less than KERNELBASE in the virtual address space.
1417 *
1418 * Note that copyin(9F) and copyout(9F) are part of the
1419 * DDI/DKI which specifies that they return '-1' on "errors."
1420 *
1421 * Sigh.
1422 *
1423 * So there's two extremely similar routines - xcopyin_nta() and
1424 * xcopyout_nta() which return the errno that we've faithfully computed.
1425 * This allows other callers (e.g. uiomove(9F)) to work correctly.
1426 * Given that these are used pretty heavily, we expand the calling
1427 * sequences inline for all flavours (rather than making wrappers).
1428 */
1429
1430/*
1431 * Copy user data to kernel space.
1432 */
1433
1434#if defined(__lint)
1435
/*
 * Lint-only stand-in for copyin(); the real routine is the assembly below.
 */
/* ARGSUSED */
int
copyin(const void *uaddr, void *kaddr, size_t count)
{
	return (0);
}
1440
1441#else	/* lint */
1442
1443#if defined(__amd64)
1444
1445	ENTRY(copyin)
1446	pushq	%rbp
1447	movq	%rsp, %rbp
1448	subq	$24, %rsp
1449
1450	/*
1451	 * save args in case we trap and need to rerun as a copyop
1452	 */
1453	movq	%rdi, (%rsp)
1454	movq	%rsi, 0x8(%rsp)
1455	movq	%rdx, 0x10(%rsp)
1456
1457	movq	kernelbase(%rip), %rax
1458#ifdef DEBUG
1459	cmpq	%rax, %rsi		/* %rsi = kaddr */
1460	jnb	1f
1461	leaq	.copyin_panic_msg(%rip), %rdi
1462	xorl	%eax, %eax
1463	call	panic
14641:
1465#endif
1466	/*
1467	 * pass lofault value as 4th argument to do_copy_fault
1468	 */
1469	leaq	_copyin_err(%rip), %rcx
1470
1471	movq	%gs:CPU_THREAD, %r9
1472	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1473	jb	do_copy_fault
1474	jmp	3f
1475
1476_copyin_err:
1477	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
1478	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
14793:
1480	movq	T_COPYOPS(%r9), %rax
1481	cmpq	$0, %rax
1482	jz	2f
1483	/*
1484	 * reload args for the copyop
1485	 */
1486	movq	(%rsp), %rdi
1487	movq	0x8(%rsp), %rsi
1488	movq	0x10(%rsp), %rdx
1489	leave
1490	jmp	*CP_COPYIN(%rax)
1491
14922:	movl	$-1, %eax
1493	leave
1494	ret
1495	SET_SIZE(copyin)
1496
1497#elif defined(__i386)
1498
1499#define	ARG_UADDR	4
1500#define	ARG_KADDR	8
1501
1502	ENTRY(copyin)
1503	movl	kernelbase, %ecx
1504#ifdef DEBUG
1505	cmpl	%ecx, ARG_KADDR(%esp)
1506	jnb	1f
1507	pushl	%ebp
1508	movl	%esp, %ebp
1509	pushl	$.copyin_panic_msg
1510	call	panic
15111:
1512#endif
1513	lea	_copyin_err, %eax
1514
1515	movl	%gs:CPU_THREAD, %edx
1516	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1517	jb	do_copy_fault
1518	jmp	3f
1519
1520_copyin_err:
1521	popl	%ecx
1522	popl	%edi
1523	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1524	popl	%esi
1525	popl	%ebp
15263:
1527	movl	T_COPYOPS(%edx), %eax
1528	cmpl	$0, %eax
1529	jz	2f
1530	jmp	*CP_COPYIN(%eax)
1531
15322:	movl	$-1, %eax
1533	ret
1534	SET_SIZE(copyin)
1535
1536#undef	ARG_UADDR
1537#undef	ARG_KADDR
1538
1539#endif	/* __i386 */
1540#endif	/* __lint */
1541
1542#if defined(__lint)
1543
/*
 * Lint-only stand-in for xcopyin_nta(); the real routine is the assembly
 * below.
 */
/* ARGSUSED */
int
xcopyin_nta(const void *uaddr, void *kaddr, size_t count, int copy_cached)
{
	return (0);
}
1548
1549#else	/* __lint */
1550
1551#if defined(__amd64)
1552
1553	ENTRY(xcopyin_nta)
1554	pushq	%rbp
1555	movq	%rsp, %rbp
1556	subq	$24, %rsp
1557
1558	/*
1559	 * save args in case we trap and need to rerun as a copyop
1560	 * %rcx is consumed in this routine so we don't need to save
1561	 * it.
1562	 */
1563	movq	%rdi, (%rsp)
1564	movq	%rsi, 0x8(%rsp)
1565	movq	%rdx, 0x10(%rsp)
1566
1567	movq	kernelbase(%rip), %rax
1568#ifdef DEBUG
1569	cmpq	%rax, %rsi		/* %rsi = kaddr */
1570	jnb	1f
1571	leaq	.xcopyin_panic_msg(%rip), %rdi
1572	xorl	%eax, %eax
1573	call	panic
15741:
1575#endif
1576	movq	%gs:CPU_THREAD, %r9
1577	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1578	jae	4f
1579	cmpq	$0, %rcx		/* No non-temporal access? */
1580	/*
1581	 * pass lofault value as 4th argument to do_copy_fault
1582	 */
1583	leaq	_xcopyin_err(%rip), %rcx	/* doesn't set rflags */
1584	jnz	do_copy_fault		/* use regular access */
1585	/*
1586	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1587	 */
1588	cmpq	$XCOPY_MIN_SIZE, %rdx
1589	jb	do_copy_fault
1590
1591	/*
1592	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1593	 * count is COUNT_ALIGN_SIZE aligned.
1594	 */
1595	movq	%rdi, %r10
1596	orq	%rsi, %r10
1597	andq	$NTA_ALIGN_MASK, %r10
1598	orq	%rdx, %r10
1599	andq	$COUNT_ALIGN_MASK, %r10
1600	jnz	do_copy_fault
1601	leaq	_xcopyin_nta_err(%rip), %rcx	/* doesn't set rflags */
1602	jmp	do_copy_fault_nta	/* use non-temporal access */
1603
16044:
1605	movl	$EFAULT, %eax
1606	jmp	3f
1607
1608	/*
1609	 * A fault during do_copy_fault or do_copy_fault_nta is
1610	 * indicated through an errno value in %rax and we iret from the
1611	 * trap handler to here.
1612	 */
1613_xcopyin_err:
1614	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
1615_xcopyin_nta_err:
1616	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
16173:
1618	movq	T_COPYOPS(%r9), %r8
1619	cmpq	$0, %r8
1620	jz	2f
1621
1622	/*
1623	 * reload args for the copyop
1624	 */
1625	movq	(%rsp), %rdi
1626	movq	0x8(%rsp), %rsi
1627	movq	0x10(%rsp), %rdx
1628	leave
1629	jmp	*CP_XCOPYIN(%r8)
1630
16312:	leave
1632	ret
1633	SET_SIZE(xcopyin_nta)
1634
1635#elif defined(__i386)
1636
1637#define	ARG_UADDR	4
1638#define	ARG_KADDR	8
1639#define	ARG_COUNT	12
1640#define	ARG_CACHED	16
1641
1642	.globl	use_sse_copy
1643
1644	ENTRY(xcopyin_nta)
1645	movl	kernelbase, %ecx
1646	lea	_xcopyin_err, %eax
1647	movl	%gs:CPU_THREAD, %edx
1648	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1649	jae	4f
1650
1651	cmpl	$0, use_sse_copy	/* no sse support */
1652	jz	do_copy_fault
1653
1654	cmpl	$0, ARG_CACHED(%esp)	/* copy_cached hint set? */
1655	jnz	do_copy_fault
1656
1657	/*
1658	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1659	 */
1660	cmpl	$XCOPY_MIN_SIZE, ARG_COUNT(%esp)
1661	jb	do_copy_fault
1662
1663	/*
1664	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1665	 * count is COUNT_ALIGN_SIZE aligned.
1666	 */
1667	movl	ARG_UADDR(%esp), %ecx
1668	orl	ARG_KADDR(%esp), %ecx
1669	andl	$NTA_ALIGN_MASK, %ecx
1670	orl	ARG_COUNT(%esp), %ecx
1671	andl	$COUNT_ALIGN_MASK, %ecx
1672	jnz	do_copy_fault
1673
1674	jmp	do_copy_fault_nta	/* use regular access */
1675
16764:
1677	movl	$EFAULT, %eax
1678	jmp	3f
1679
1680	/*
1681	 * A fault during do_copy_fault or do_copy_fault_nta is
1682	 * indicated through an errno value in %eax and we iret from the
1683	 * trap handler to here.
1684	 */
1685_xcopyin_err:
1686	popl	%ecx
1687	popl	%edi
1688	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1689	popl	%esi
1690	popl	%ebp
16913:
1692	cmpl	$0, T_COPYOPS(%edx)
1693	jz	2f
1694	movl	T_COPYOPS(%edx), %eax
1695	jmp	*CP_XCOPYIN(%eax)
1696
16972:	rep; 	ret	/* use 2 byte return instruction when branch target */
1698			/* AMD Software Optimization Guide - Section 6.2 */
1699	SET_SIZE(xcopyin_nta)
1700
1701#undef	ARG_UADDR
1702#undef	ARG_KADDR
1703#undef	ARG_COUNT
1704#undef	ARG_CACHED
1705
1706#endif	/* __i386 */
1707#endif	/* __lint */
1708
1709/*
1710 * Copy kernel data to user space.
1711 */
1712
1713#if defined(__lint)
1714
/*
 * Lint-only stand-in for copyout(); the real routine is the assembly below.
 */
/* ARGSUSED */
int
copyout(const void *kaddr, void *uaddr, size_t count)
{
	return (0);
}
1719
1720#else	/* __lint */
1721
1722#if defined(__amd64)
1723
1724	ENTRY(copyout)
1725	pushq	%rbp
1726	movq	%rsp, %rbp
1727	subq	$24, %rsp
1728
1729	/*
1730	 * save args in case we trap and need to rerun as a copyop
1731	 */
1732	movq	%rdi, (%rsp)
1733	movq	%rsi, 0x8(%rsp)
1734	movq	%rdx, 0x10(%rsp)
1735
1736	movq	kernelbase(%rip), %rax
1737#ifdef DEBUG
1738	cmpq	%rax, %rdi		/* %rdi = kaddr */
1739	jnb	1f
1740	leaq	.copyout_panic_msg(%rip), %rdi
1741	xorl	%eax, %eax
1742	call	panic
17431:
1744#endif
1745	/*
1746	 * pass lofault value as 4th argument to do_copy_fault
1747	 */
1748	leaq	_copyout_err(%rip), %rcx
1749
1750	movq	%gs:CPU_THREAD, %r9
1751	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1752	jb	do_copy_fault
1753	jmp	3f
1754
1755_copyout_err:
1756	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
1757	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
17583:
1759	movq	T_COPYOPS(%r9), %rax
1760	cmpq	$0, %rax
1761	jz	2f
1762
1763	/*
1764	 * reload args for the copyop
1765	 */
1766	movq	(%rsp), %rdi
1767	movq	0x8(%rsp), %rsi
1768	movq	0x10(%rsp), %rdx
1769	leave
1770	jmp	*CP_COPYOUT(%rax)
1771
17722:	movl	$-1, %eax
1773	leave
1774	ret
1775	SET_SIZE(copyout)
1776
1777#elif defined(__i386)
1778
1779#define	ARG_KADDR	4
1780#define	ARG_UADDR	8
1781
1782	ENTRY(copyout)
1783	movl	kernelbase, %ecx
1784#ifdef DEBUG
1785	cmpl	%ecx, ARG_KADDR(%esp)
1786	jnb	1f
1787	pushl	%ebp
1788	movl	%esp, %ebp
1789	pushl	$.copyout_panic_msg
1790	call	panic
17911:
1792#endif
1793	lea	_copyout_err, %eax
1794	movl	%gs:CPU_THREAD, %edx
1795	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1796	jb	do_copy_fault
1797	jmp	3f
1798
1799_copyout_err:
1800	popl	%ecx
1801	popl	%edi
1802	movl	%ecx, T_LOFAULT(%edx)	/* restore original lofault */
1803	popl	%esi
1804	popl	%ebp
18053:
1806	movl	T_COPYOPS(%edx), %eax
1807	cmpl	$0, %eax
1808	jz	2f
1809	jmp	*CP_COPYOUT(%eax)
1810
18112:	movl	$-1, %eax
1812	ret
1813	SET_SIZE(copyout)
1814
1815#undef	ARG_UADDR
1816#undef	ARG_KADDR
1817
1818#endif	/* __i386 */
1819#endif	/* __lint */
1820
1821#if defined(__lint)
1822
/*
 * Lint-only stand-in for xcopyout_nta(); the real routine is the assembly
 * below.
 */
/* ARGSUSED */
int
xcopyout_nta(const void *kaddr, void *uaddr, size_t count, int copy_cached)
{
	return (0);
}
1827
1828#else	/* __lint */
1829
1830#if defined(__amd64)
1831
1832	ENTRY(xcopyout_nta)
1833	pushq	%rbp
1834	movq	%rsp, %rbp
1835	subq	$24, %rsp
1836
1837	/*
1838	 * save args in case we trap and need to rerun as a copyop
1839	 */
1840	movq	%rdi, (%rsp)
1841	movq	%rsi, 0x8(%rsp)
1842	movq	%rdx, 0x10(%rsp)
1843
1844	movq	kernelbase(%rip), %rax
1845#ifdef DEBUG
1846	cmpq	%rax, %rdi		/* %rdi = kaddr */
1847	jnb	1f
1848	leaq	.xcopyout_panic_msg(%rip), %rdi
1849	xorl	%eax, %eax
1850	call	panic
18511:
1852#endif
1853	movq	%gs:CPU_THREAD, %r9
1854	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1855	jae	4f
1856
1857	cmpq	$0, %rcx		/* No non-temporal access? */
1858	/*
1859	 * pass lofault value as 4th argument to do_copy_fault
1860	 */
1861	leaq	_xcopyout_err(%rip), %rcx
1862	jnz	do_copy_fault
1863	/*
1864	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1865	 */
1866	cmpq	$XCOPY_MIN_SIZE, %rdx
1867	jb	do_copy_fault
1868
1869	/*
1870	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1871	 * count is COUNT_ALIGN_SIZE aligned.
1872	 */
1873	movq	%rdi, %r10
1874	orq	%rsi, %r10
1875	andq	$NTA_ALIGN_MASK, %r10
1876	orq	%rdx, %r10
1877	andq	$COUNT_ALIGN_MASK, %r10
1878	jnz	do_copy_fault
1879	leaq	_xcopyout_nta_err(%rip), %rcx
1880	jmp	do_copy_fault_nta
1881
18824:
1883	movl	$EFAULT, %eax
1884	jmp	3f
1885
1886	/*
1887	 * A fault during do_copy_fault or do_copy_fault_nta is
1888	 * indicated through an errno value in %rax and we iret from the
1889	 * trap handler to here.
1890	 */
1891_xcopyout_err:
1892	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
1893_xcopyout_nta_err:
1894	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
18953:
1896	movq	T_COPYOPS(%r9), %r8
1897	cmpq	$0, %r8
1898	jz	2f
1899
1900	/*
1901	 * reload args for the copyop
1902	 */
1903	movq	(%rsp), %rdi
1904	movq	0x8(%rsp), %rsi
1905	movq	0x10(%rsp), %rdx
1906	leave
1907	jmp	*CP_XCOPYOUT(%r8)
1908
19092:	leave
1910	ret
1911	SET_SIZE(xcopyout_nta)
1912
1913#elif defined(__i386)
1914
1915#define	ARG_KADDR	4
1916#define	ARG_UADDR	8
1917#define	ARG_COUNT	12
1918#define	ARG_CACHED	16
1919
1920	ENTRY(xcopyout_nta)
1921	movl	kernelbase, %ecx
1922	lea	_xcopyout_err, %eax
1923	movl	%gs:CPU_THREAD, %edx
1924	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
1925	jae	4f
1926
1927	cmpl	$0, use_sse_copy	/* no sse support */
1928	jz	do_copy_fault
1929
1930	cmpl	$0, ARG_CACHED(%esp)	/* copy_cached hint set? */
1931	jnz	do_copy_fault
1932
1933	/*
1934	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1935	 */
1936	cmpl	$XCOPY_MIN_SIZE, %edx
1937	jb	do_copy_fault
1938
1939	/*
1940	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1941	 * count is COUNT_ALIGN_SIZE aligned.
1942	 */
1943	movl	ARG_UADDR(%esp), %ecx
1944	orl	ARG_KADDR(%esp), %ecx
1945	andl	$NTA_ALIGN_MASK, %ecx
1946	orl	ARG_COUNT(%esp), %ecx
1947	andl	$COUNT_ALIGN_MASK, %ecx
1948	jnz	do_copy_fault
1949	jmp	do_copy_fault_nta
1950
19514:
1952	movl	$EFAULT, %eax
1953	jmp	3f
1954
1955	/*
1956	 * A fault during do_copy_fault or do_copy_fault_nta is
1957	 * indicated through an errno value in %eax and we iret from the
1958	 * trap handler to here.
1959	 */
1960_xcopyout_err:
1961	/ restore the original lofault
1962	popl	%ecx
1963	popl	%edi
1964	movl	%ecx, T_LOFAULT(%edx)	/ original lofault
1965	popl	%esi
1966	popl	%ebp
19673:
1968	cmpl	$0, T_COPYOPS(%edx)
1969	jz	2f
1970	movl	T_COPYOPS(%edx), %eax
1971	jmp	*CP_XCOPYOUT(%eax)
1972
19732:	rep;	ret	/* use 2 byte return instruction when branch target */
1974			/* AMD Software Optimization Guide - Section 6.2 */
1975	SET_SIZE(xcopyout_nta)
1976
1977#undef	ARG_UADDR
1978#undef	ARG_KADDR
1979#undef	ARG_COUNT
1980#undef	ARG_CACHED
1981
1982#endif	/* __i386 */
1983#endif	/* __lint */
1984
1985/*
1986 * Copy a null terminated string from one point to another in
1987 * the kernel address space.
1988 */
1989
1990#if defined(__lint)
1991
/*
 * Lint-only stand-in for copystr(); the real routine is the assembly below.
 */
/* ARGSUSED */
int
copystr(const char *from, char *to, size_t maxlength, size_t *lencopied)
{
	return (0);
}
1996
1997#else	/* __lint */
1998
1999#if defined(__amd64)
2000
2001	ENTRY(copystr)
2002	pushq	%rbp
2003	movq	%rsp, %rbp
2004#ifdef DEBUG
2005	movq	kernelbase(%rip), %rax
2006	cmpq	%rax, %rdi		/* %rdi = from */
2007	jb	0f
2008	cmpq	%rax, %rsi		/* %rsi = to */
2009	jnb	1f
20100:	leaq	.copystr_panic_msg(%rip), %rdi
2011	xorl	%eax, %eax
2012	call	panic
20131:
2014#endif
2015	movq	%gs:CPU_THREAD, %r9
2016	movq	T_LOFAULT(%r9), %r8	/* pass current lofault value as */
2017					/* 5th argument to do_copystr */
2018do_copystr:
2019	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
2020	movq    T_LOFAULT(%r9), %r11	/* save the current lofault */
2021	movq	%r8, T_LOFAULT(%r9)	/* new lofault */
2022
2023	movq	%rdx, %r8		/* save maxlength */
2024
2025	cmpq	$0, %rdx		/* %rdx = maxlength */
2026	je	copystr_enametoolong	/* maxlength == 0 */
2027
2028copystr_loop:
2029	decq	%r8
2030	movb	(%rdi), %al
2031	incq	%rdi
2032	movb	%al, (%rsi)
2033	incq	%rsi
2034	cmpb	$0, %al
2035	je	copystr_null		/* null char */
2036	cmpq	$0, %r8
2037	jne	copystr_loop
2038
2039copystr_enametoolong:
2040	movl	$ENAMETOOLONG, %eax
2041	jmp	copystr_out
2042
2043copystr_null:
2044	xorl	%eax, %eax		/* no error */
2045
2046copystr_out:
2047	cmpq	$0, %rcx		/* want length? */
2048	je	copystr_done		/* no */
2049	subq	%r8, %rdx		/* compute length and store it */
2050	movq	%rdx, (%rcx)
2051
2052copystr_done:
2053	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
2054	leave
2055	ret
2056	SET_SIZE(copystr)
2057
2058#elif defined(__i386)
2059
2060#define	ARG_FROM	8
2061#define	ARG_TO		12
2062#define	ARG_MAXLEN	16
2063#define	ARG_LENCOPIED	20
2064
2065	ENTRY(copystr)
2066#ifdef DEBUG
2067	pushl	%ebp
2068	movl	%esp, %ebp
2069	movl	kernelbase, %eax
2070	cmpl	%eax, ARG_FROM(%esp)
2071	jb	0f
2072	cmpl	%eax, ARG_TO(%esp)
2073	jnb	1f
20740:	pushl	$.copystr_panic_msg
2075	call	panic
20761:	popl	%ebp
2077#endif
2078	/* get the current lofault address */
2079	movl	%gs:CPU_THREAD, %eax
2080	movl	T_LOFAULT(%eax), %eax
2081do_copystr:
2082	pushl	%ebp			/* setup stack frame */
2083	movl	%esp, %ebp
2084	pushl	%ebx			/* save registers */
2085	pushl	%edi
2086
2087	movl	%gs:CPU_THREAD, %ebx
2088	movl	T_LOFAULT(%ebx), %edi
2089	pushl	%edi			/* save the current lofault */
2090	movl	%eax, T_LOFAULT(%ebx)	/* new lofault */
2091
2092	movl	ARG_MAXLEN(%ebp), %ecx
2093	cmpl	$0, %ecx
2094	je	copystr_enametoolong	/* maxlength == 0 */
2095
2096	movl	ARG_FROM(%ebp), %ebx	/* source address */
2097	movl	ARG_TO(%ebp), %edx	/* destination address */
2098
2099copystr_loop:
2100	decl	%ecx
2101	movb	(%ebx), %al
2102	incl	%ebx
2103	movb	%al, (%edx)
2104	incl	%edx
2105	cmpb	$0, %al
2106	je	copystr_null		/* null char */
2107	cmpl	$0, %ecx
2108	jne	copystr_loop
2109
2110copystr_enametoolong:
2111	movl	$ENAMETOOLONG, %eax
2112	jmp	copystr_out
2113
2114copystr_null:
2115	xorl	%eax, %eax		/* no error */
2116
2117copystr_out:
2118	cmpl	$0, ARG_LENCOPIED(%ebp)	/* want length? */
2119	je	copystr_done		/* no */
2120	movl	ARG_MAXLEN(%ebp), %edx
2121	subl	%ecx, %edx		/* compute length and store it */
2122	movl	ARG_LENCOPIED(%ebp), %ecx
2123	movl	%edx, (%ecx)
2124
2125copystr_done:
2126	popl	%edi
2127	movl	%gs:CPU_THREAD, %ebx
2128	movl	%edi, T_LOFAULT(%ebx)	/* restore the original lofault */
2129
2130	popl	%edi
2131	popl	%ebx
2132	popl	%ebp
2133	ret
2134	SET_SIZE(copystr)
2135
2136#undef	ARG_FROM
2137#undef	ARG_TO
2138#undef	ARG_MAXLEN
2139#undef	ARG_LENCOPIED
2140
2141#endif	/* __i386 */
2142#endif	/* __lint */
2143
2144/*
2145 * Copy a null terminated string from the user address space into
2146 * the kernel address space.
2147 */
2148
2149#if defined(__lint)
2150
/*
 * Lint-only stand-in for copyinstr(); the real routine is the assembly
 * below.
 */
/* ARGSUSED */
int
copyinstr(const char *uaddr, char *kaddr, size_t maxlength,
    size_t *lencopied)
{
	return (0);
}
2156
2157#else	/* __lint */
2158
2159#if defined(__amd64)
2160
2161	ENTRY(copyinstr)
2162	pushq	%rbp
2163	movq	%rsp, %rbp
2164	subq	$32, %rsp
2165
2166	/*
2167	 * save args in case we trap and need to rerun as a copyop
2168	 */
2169	movq	%rdi, (%rsp)
2170	movq	%rsi, 0x8(%rsp)
2171	movq	%rdx, 0x10(%rsp)
2172	movq	%rcx, 0x18(%rsp)
2173
2174	movq	kernelbase(%rip), %rax
2175#ifdef DEBUG
2176	cmpq	%rax, %rsi		/* %rsi = kaddr */
2177	jnb	1f
2178	leaq	.copyinstr_panic_msg(%rip), %rdi
2179	xorl	%eax, %eax
2180	call	panic
21811:
2182#endif
2183	/*
2184	 * pass lofault value as 5th argument to do_copystr
2185	 */
2186	leaq	_copyinstr_error(%rip), %r8
2187
2188	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
2189	jb	do_copystr
2190	movq	%gs:CPU_THREAD, %r9
2191	jmp	3f
2192
2193_copyinstr_error:
2194	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
21953:
2196	movq	T_COPYOPS(%r9), %rax
2197	cmpq	$0, %rax
2198	jz	2f
2199
2200	/*
2201	 * reload args for the copyop
2202	 */
2203	movq	(%rsp), %rdi
2204	movq	0x8(%rsp), %rsi
2205	movq	0x10(%rsp), %rdx
2206	movq	0x18(%rsp), %rcx
2207	leave
2208	jmp	*CP_COPYINSTR(%rax)
2209
22102:	movl	$EFAULT, %eax		/* return EFAULT */
2211	leave
2212	ret
2213	SET_SIZE(copyinstr)
2214
2215#elif defined(__i386)
2216
2217#define	ARG_UADDR	4
2218#define	ARG_KADDR	8
2219
2220	ENTRY(copyinstr)
2221	movl	kernelbase, %ecx
2222#ifdef DEBUG
2223	cmpl	%ecx, ARG_KADDR(%esp)
2224	jnb	1f
2225	pushl	%ebp
2226	movl	%esp, %ebp
2227	pushl	$.copyinstr_panic_msg
2228	call	panic
22291:
2230#endif
2231	lea	_copyinstr_error, %eax
2232	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
2233	jb	do_copystr
2234	movl	%gs:CPU_THREAD, %edx
2235	jmp	3f
2236
2237_copyinstr_error:
2238	popl	%edi
2239	movl	%gs:CPU_THREAD, %edx
2240	movl	%edi, T_LOFAULT(%edx)	/* original lofault */
2241
2242	popl	%edi
2243	popl	%ebx
2244	popl	%ebp
22453:
2246	movl	T_COPYOPS(%edx), %eax
2247	cmpl	$0, %eax
2248	jz	2f
2249	jmp	*CP_COPYINSTR(%eax)
2250
22512:	movl	$EFAULT, %eax		/* return EFAULT */
2252	ret
2253	SET_SIZE(copyinstr)
2254
2255#undef	ARG_UADDR
2256#undef	ARG_KADDR
2257
2258#endif	/* __i386 */
2259#endif	/* __lint */
2260
2261/*
2262 * Copy a null terminated string from the kernel
2263 * address space to the user address space.
2264 */
2265
2266#if defined(__lint)
2267
/*
 * Lint-only stand-in for copyoutstr(); the real routine is the assembly
 * below.
 */
/* ARGSUSED */
int
copyoutstr(const char *kaddr, char *uaddr, size_t maxlength,
    size_t *lencopied)
{
	return (0);
}
2273
2274#else	/* __lint */
2275
2276#if defined(__amd64)
2277
2278	ENTRY(copyoutstr)
2279	pushq	%rbp
2280	movq	%rsp, %rbp
2281	subq	$32, %rsp
2282
2283	/*
2284	 * save args in case we trap and need to rerun as a copyop
2285	 */
2286	movq	%rdi, (%rsp)
2287	movq	%rsi, 0x8(%rsp)
2288	movq	%rdx, 0x10(%rsp)
2289	movq	%rcx, 0x18(%rsp)
2290
2291	movq	kernelbase(%rip), %rax
2292#ifdef DEBUG
2293	cmpq	%rax, %rdi		/* %rdi = kaddr */
2294	jnb	1f
2295	leaq	.copyoutstr_panic_msg(%rip), %rdi
2296	jmp	call_panic		/* setup stack and call panic */
22971:
2298#endif
2299	/*
2300	 * pass lofault value as 5th argument to do_copystr
2301	 */
2302	leaq	_copyoutstr_error(%rip), %r8
2303
2304	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
2305	jb	do_copystr
2306	movq	%gs:CPU_THREAD, %r9
2307	jmp	3f
2308
2309_copyoutstr_error:
2310	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
23113:
2312	movq	T_COPYOPS(%r9), %rax
2313	cmpq	$0, %rax
2314	jz	2f
2315
2316	/*
2317	 * reload args for the copyop
2318	 */
2319	movq	(%rsp), %rdi
2320	movq	0x8(%rsp), %rsi
2321	movq	0x10(%rsp), %rdx
2322	movq	0x18(%rsp), %rcx
2323	leave
2324	jmp	*CP_COPYOUTSTR(%rax)
2325
23262:	movl	$EFAULT, %eax		/* return EFAULT */
2327	leave
2328	ret
2329	SET_SIZE(copyoutstr)
2330
2331#elif defined(__i386)
2332
2333#define	ARG_KADDR	4
2334#define	ARG_UADDR	8
2335
2336	ENTRY(copyoutstr)
2337	movl	kernelbase, %ecx
2338#ifdef DEBUG
2339	cmpl	%ecx, ARG_KADDR(%esp)
2340	jnb	1f
2341	pushl	%ebp
2342	movl	%esp, %ebp
2343	pushl	$.copyoutstr_panic_msg
2344	call	panic
23451:
2346#endif
2347	lea	_copyoutstr_error, %eax
2348	cmpl	%ecx, ARG_UADDR(%esp)	/* test uaddr < kernelbase */
2349	jb	do_copystr
2350	movl	%gs:CPU_THREAD, %edx
2351	jmp	3f
2352
2353_copyoutstr_error:
2354	popl	%edi
2355	movl	%gs:CPU_THREAD, %edx
2356	movl	%edi, T_LOFAULT(%edx)	/* restore the original lofault */
2357
2358	popl	%edi
2359	popl	%ebx
2360	popl	%ebp
23613:
2362	movl	T_COPYOPS(%edx), %eax
2363	cmpl	$0, %eax
2364	jz	2f
2365	jmp	*CP_COPYOUTSTR(%eax)
2366
23672:	movl	$EFAULT, %eax		/* return EFAULT */
2368	ret
2369	SET_SIZE(copyoutstr)
2370
2371#undef	ARG_KADDR
2372#undef	ARG_UADDR
2373
2374#endif	/* __i386 */
2375#endif	/* __lint */
2376
2377/*
2378 * Since all of the fuword() variants are so similar, we have a macro to spit
2379 * them out.  This allows us to create DTrace-unobservable functions easily.
2380 */
2381
2382#if defined(__lint)
2383
/*
 * Lint-only stand-ins for the fuword family; each one simply reports
 * success.  fuword64() exists only on amd64.
 */
#if defined(__amd64)

/* ARGSUSED */
int
fuword64(const void *addr, uint64_t *dst)
{
	return (0);
}

#endif

/* ARGSUSED */
int
fuword32(const void *addr, uint32_t *dst)
{
	return (0);
}

/* ARGSUSED */
int
fuword16(const void *addr, uint16_t *dst)
{
	return (0);
}

/* ARGSUSED */
int
fuword8(const void *addr, uint8_t *dst)
{
	return (0);
}
2407
2408#else	/* __lint */
2409
2410#if defined(__amd64)
2411
2412/*
2413 * (Note that we don't save and reload the arguments here
2414 * because their values are not altered in the copy path)
2415 */
2416
/*
 * FUWORD(NAME, INSTR, REG, COPYOP) -- 64-bit version.
 *
 * Emits NAME(addr = %rdi, dst = %rsi).  If addr is below kernelbase,
 * _flt_NAME is installed as the thread's lofault handler, one item is
 * loaded from (%rdi) into REG with INSTR, lofault is cleared, the item is
 * stored to (%rsi) and 0 is returned.  If addr is not below kernelbase,
 * or the load faults, the COPYOP entry of the thread's copyops vector is
 * tail-called when a vector is installed; otherwise -1 is returned.
 */
#define	FUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movq	%gs:CPU_THREAD, %r9;		\
	cmpq	kernelbase(%rip), %rdi;		\
	jnb	1f;				\
	leaq	_flt_/**/NAME, %rdx;		\
	movq	%rdx, T_LOFAULT(%r9);		\
	INSTR	(%rdi), REG;			\
	movq	$0, T_LOFAULT(%r9);		\
	INSTR	REG, (%rsi);			\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movq	$0, T_LOFAULT(%r9);		\
1:						\
	movq	T_COPYOPS(%r9), %rax;		\
	testq	%rax, %rax;			\
	je	2f;				\
	jmp	*COPYOP(%rax);			\
2:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD(fuword64, movq, %rax, CP_FUWORD64)
	FUWORD(fuword32, movl, %eax, CP_FUWORD32)
	FUWORD(fuword16, movw, %ax, CP_FUWORD16)
	FUWORD(fuword8, movb, %al, CP_FUWORD8)
2445
2446#elif defined(__i386)
2447
/*
 * FUWORD(NAME, INSTR, REG, COPYOP) -- 32-bit version.
 *
 * Emits NAME(addr at 4(%esp), dst at 8(%esp)).  If addr is below
 * kernelbase, _flt_NAME is installed as the thread's lofault handler, one
 * item is loaded from addr into REG with INSTR, lofault is cleared, the
 * item is stored to dst and 0 is returned.  If addr is not below
 * kernelbase, or the load faults, the COPYOP entry of the thread's copyops
 * vector is tail-called when a vector is installed; otherwise -1 is
 * returned.
 */
#define	FUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movl	%gs:CPU_THREAD, %ecx;		\
	movl	kernelbase, %eax;		\
	cmpl	%eax, 4(%esp);			\
	jnb	1f;				\
	leal	_flt_/**/NAME, %edx;		\
	movl	%edx, T_LOFAULT(%ecx);		\
	movl	8(%esp), %edx;			\
	movl	4(%esp), %eax;			\
	INSTR	(%eax), REG;			\
	movl	$0, T_LOFAULT(%ecx);		\
	INSTR	REG, (%edx);			\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movl	$0, T_LOFAULT(%ecx);		\
1:						\
	movl	T_COPYOPS(%ecx), %eax;		\
	testl	%eax, %eax;			\
	je	2f;				\
	jmp	*COPYOP(%eax);			\
2:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD(fuword32, movl, %eax, CP_FUWORD32)
	FUWORD(fuword16, movw, %ax, CP_FUWORD16)
	FUWORD(fuword8, movb, %al, CP_FUWORD8)
2478
2479#endif	/* __i386 */
2480
2481#undef	FUWORD
2482
2483#endif	/* __lint */
2484
2485/*
2486 * Set user word.
2487 */
2488
2489#if defined(__lint)
2490
/*
 * Lint-only stand-ins for the suword family; each one simply reports
 * success.  suword64() exists only on amd64.
 */
#if defined(__amd64)

/* ARGSUSED */
int
suword64(void *addr, uint64_t value)
{
	return (0);
}

#endif

/* ARGSUSED */
int
suword32(void *addr, uint32_t value)
{
	return (0);
}

/* ARGSUSED */
int
suword16(void *addr, uint16_t value)
{
	return (0);
}

/* ARGSUSED */
int
suword8(void *addr, uint8_t value)
{
	return (0);
}
2514
2515#else	/* lint */
2516
2517#if defined(__amd64)
2518
2519/*
2520 * (Note that we don't save and reload the arguments here
2521 * because their values are not altered in the copy path)
2522 */
2523
/*
 * SUWORD(NAME, INSTR, REG, COPYOP) -- 64-bit version.
 *
 * Emits NAME(addr = %rdi, value in REG).  If addr is below kernelbase,
 * _flt_NAME is installed as the thread's lofault handler, REG is stored
 * to (%rdi) with INSTR, lofault is cleared and 0 is returned.  If addr is
 * not below kernelbase, or the store faults, the COPYOP entry of the
 * thread's copyops vector is tail-called when a vector is installed;
 * otherwise -1 is returned.
 */
#define	SUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movq	%gs:CPU_THREAD, %r9;		\
	cmpq	kernelbase(%rip), %rdi;		\
	jnb	1f;				\
	leaq	_flt_/**/NAME, %rdx;		\
	movq	%rdx, T_LOFAULT(%r9);		\
	INSTR	REG, (%rdi);			\
	movq	$0, T_LOFAULT(%r9);		\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movq	$0, T_LOFAULT(%r9);		\
1:						\
	movq	T_COPYOPS(%r9), %rax;		\
	testq	%rax, %rax;			\
	je	3f;				\
	jmp	*COPYOP(%rax);			\
3:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	SUWORD(suword64, movq, %rsi, CP_SUWORD64)
	SUWORD(suword32, movl, %esi, CP_SUWORD32)
	SUWORD(suword16, movw, %si, CP_SUWORD16)
	SUWORD(suword8, movb, %sil, CP_SUWORD8)
2551
2552#elif defined(__i386)
2553
/*
 * SUWORD(NAME, INSTR, REG, COPYOP) -- 32-bit version.
 *
 * Emits NAME(addr at 4(%esp), value at 8(%esp)).  If addr is below
 * kernelbase, _flt_NAME is installed as the thread's lofault handler, the
 * value is loaded into %edx and REG is stored to addr with INSTR, lofault
 * is cleared and 0 is returned.  If addr is not below kernelbase, or the
 * store faults, the COPYOP entry of the thread's copyops vector is fetched
 * into %ecx and jumped to when a vector is installed; otherwise -1 is
 * returned.
 */
#define	SUWORD(NAME, INSTR, REG, COPYOP)	\
	ENTRY(NAME)				\
	movl	%gs:CPU_THREAD, %ecx;		\
	movl	kernelbase, %eax;		\
	cmpl	%eax, 4(%esp);			\
	jnb	1f;				\
	leal	_flt_/**/NAME, %edx;		\
	movl	%edx, T_LOFAULT(%ecx);		\
	movl	8(%esp), %edx;			\
	movl	4(%esp), %eax;			\
	INSTR	REG, (%eax);			\
	movl	$0, T_LOFAULT(%ecx);		\
	xorl	%eax, %eax;			\
	ret;					\
_flt_/**/NAME:					\
	movl	$0, T_LOFAULT(%ecx);		\
1:						\
	movl	T_COPYOPS(%ecx), %eax;		\
	testl	%eax, %eax;			\
	je	3f;				\
	movl	COPYOP(%eax), %ecx;		\
	jmp	*%ecx;				\
3:						\
	movl	$-1, %eax;			\
	ret;					\
	SET_SIZE(NAME)

	SUWORD(suword32, movl, %edx, CP_SUWORD32)
	SUWORD(suword16, movw, %dx, CP_SUWORD16)
	SUWORD(suword8, movb, %dl, CP_SUWORD8)
2584
2585#endif	/* __i386 */
2586
2587#undef	SUWORD
2588
2589#endif	/* __lint */
2590
2591#if defined(__lint)
2592
/*
 * Lint-only stand-ins for the fuword*_noerr family; they do nothing.
 * fuword64_noerr() exists only on amd64.
 */
#if defined(__amd64)

/*ARGSUSED*/
void
fuword64_noerr(const void *addr, uint64_t *dst)
{
}

#endif

/*ARGSUSED*/
void
fuword32_noerr(const void *addr, uint32_t *dst)
{
}

/*ARGSUSED*/
void
fuword8_noerr(const void *addr, uint8_t *dst)
{
}

/*ARGSUSED*/
void
fuword16_noerr(const void *addr, uint16_t *dst)
{
}
2616
2617#else   /* __lint */
2618
2619#if defined(__amd64)
2620
/*
 * FUWORD_NOERR(NAME, INSTR, REG) -- 64-bit version.
 *
 * Emits NAME(addr = %rdi, dst = %rsi): an address at or above kernelbase
 * is replaced by kernelbase itself, then one item is loaded from the
 * address into REG and stored to (%rsi).  No lofault handler is installed
 * here and nothing is returned.
 */
#define	FUWORD_NOERR(NAME, INSTR, REG)		\
	ENTRY(NAME)				\
	cmpq	kernelbase(%rip), %rdi;		\
	cmovaeq	kernelbase(%rip), %rdi;		\
	INSTR	(%rdi), REG;			\
	INSTR	REG, (%rsi);			\
	ret;					\
	SET_SIZE(NAME)

	FUWORD_NOERR(fuword64_noerr, movq, %rax)
	FUWORD_NOERR(fuword32_noerr, movl, %eax)
	FUWORD_NOERR(fuword16_noerr, movw, %ax)
	FUWORD_NOERR(fuword8_noerr, movb, %al)
2634
2635#elif defined(__i386)
2636
/*
 * FUWORD_NOERR(FN, MOV, TMP) - i386 fetch from a user address.
 *
 *	void FN(const void *addr, value *dst)
 *	4(%esp) = addr, 8(%esp) = dst
 *
 * Both arguments are loaded first (%eax = addr, %edx = dst).  An addr
 * that is not below kernelbase is replaced by kernelbase, then the value
 * is read through TMP with MOV and written to *dst.  Nothing is returned
 * and no lofault handler is installed by this routine.
 */
#define	FUWORD_NOERR(FN, MOV, TMP)		\
	ENTRY(FN)				\
	movl	4(%esp), %eax;			\
	movl	8(%esp), %edx;			\
	cmpl	kernelbase, %eax;		\
	jb	1f;				\
	movl	kernelbase, %eax;		\
1:						\
	MOV	(%eax), TMP;			\
	MOV	TMP, (%edx);			\
	ret;					\
	SET_SIZE(FN)

	FUWORD_NOERR(fuword32_noerr, movl, %ecx)
	FUWORD_NOERR(fuword16_noerr, movw, %cx)
	FUWORD_NOERR(fuword8_noerr, movb, %cl)
2652
2653#endif	/* __i386 */
2654
2655#undef	FUWORD_NOERR
2656
2657#endif	/* __lint */
2658
2659#if defined(__lint)
2660
/*
 * Lint-only definitions of the suwordN_noerr() family.  They exist so
 * that lint sees a prototype for each routine; the bodies are empty.
 * The 64-bit variant is provided on amd64 only.
 */

#if defined(__amd64)

/*ARGSUSED*/
void
suword64_noerr(void *addr, uint64_t value)
{
}

#endif	/* __amd64 */

/*ARGSUSED*/
void
suword32_noerr(void *addr, uint32_t value)
{
}

/*ARGSUSED*/
void
suword16_noerr(void *addr, uint16_t value)
{
}

/*ARGSUSED*/
void
suword8_noerr(void *addr, uint8_t value)
{
}
2684
2685#else	/* lint */
2686
2687#if defined(__amd64)
2688
/*
 * SUWORD_NOERR(FN, MOV, VAL) - amd64 store to a user address.
 *
 *	void FN(void *addr, value)
 *	%rdi = addr, VAL = the part of %rsi holding the value
 *
 * addr is compared with kernelbase and replaced by kernelbase when it is
 * not below it, so the store is never issued against a caller-supplied
 * kernel address.  Nothing is returned and no lofault handler is
 * installed by this routine.
 */
#define	SUWORD_NOERR(FN, MOV, VAL)		\
	ENTRY(FN)				\
	cmpq	kernelbase(%rip), %rdi;		\
	cmovaeq	kernelbase(%rip), %rdi;		\
	MOV	VAL, (%rdi);			\
	ret;					\
	SET_SIZE(FN)

	SUWORD_NOERR(suword64_noerr, movq, %rsi)
	SUWORD_NOERR(suword32_noerr, movl, %esi)
	SUWORD_NOERR(suword16_noerr, movw, %si)
	SUWORD_NOERR(suword8_noerr, movb, %sil)
2701
2702#elif defined(__i386)
2703
/*
 * SUWORD_NOERR(FN, MOV, VAL) - i386 store to a user address.
 *
 *	void FN(void *addr, value)
 *	4(%esp) = addr, 8(%esp) = value
 *
 * Both arguments are loaded first (%eax = addr, %edx = value; VAL names
 * the part of %edx that is stored).  An addr that is not below
 * kernelbase is replaced by kernelbase before the store.  Nothing is
 * returned and no lofault handler is installed by this routine.
 */
#define	SUWORD_NOERR(FN, MOV, VAL)		\
	ENTRY(FN)				\
	movl	4(%esp), %eax;			\
	movl	8(%esp), %edx;			\
	cmpl	kernelbase, %eax;		\
	jb	1f;				\
	movl	kernelbase, %eax;		\
1:						\
	MOV	VAL, (%eax);			\
	ret;					\
	SET_SIZE(FN)

	SUWORD_NOERR(suword32_noerr, movl, %edx)
	SUWORD_NOERR(suword16_noerr, movw, %dx)
	SUWORD_NOERR(suword8_noerr, movb, %dl)
2719
2720#endif	/* __i386 */
2721
2722#undef	SUWORD_NOERR
2723
2724#endif	/* lint */
2725
2726
2727#if defined(__lint)
2728
2729/*ARGSUSED*/
2730int
2731subyte(void *addr, uchar_t value)
2732{ return (0); }
2733
2734/*ARGSUSED*/
2735void
2736subyte_noerr(void *addr, uchar_t value)
2737{}
2738
2739/*ARGSUSED*/
2740int
2741fulword(const void *addr, ulong_t *valuep)
2742{ return (0); }
2743
2744/*ARGSUSED*/
2745void
2746fulword_noerr(const void *addr, ulong_t *valuep)
2747{}
2748
2749/*ARGSUSED*/
2750int
2751sulword(void *addr, ulong_t valuep)
2752{ return (0); }
2753
2754/*ARGSUSED*/
2755void
2756sulword_noerr(void *addr, ulong_t valuep)
2757{}
2758
2759#else
2760
/*
 * Weak aliases.  subyte and subyte_noerr always name the 8-bit store
 * routines.  The "long word" accessors name the 64-bit routines on amd64
 * and the 32-bit routines on i386.
 */
	.weak	subyte
	.weak	subyte_noerr
	subyte=suword8
	subyte_noerr=suword8_noerr

#if defined(__amd64)

	.weak	fulword
	.weak	fulword_noerr
	.weak	sulword
	.weak	sulword_noerr
	fulword=fuword64
	fulword_noerr=fuword64_noerr
	sulword=suword64
	sulword_noerr=suword64_noerr

#elif defined(__i386)

	.weak	fulword
	.weak	fulword_noerr
	.weak	sulword
	.weak	sulword_noerr
	fulword=fuword32
	fulword_noerr=fuword32_noerr
	sulword=suword32
	sulword_noerr=suword32_noerr

#endif /* __i386 */
2789
2790#endif /* __lint */
2791
2792#if defined(__lint)
2793
/*
 * Lint-only definitions of the block routines that install no fault
 * handler of their own.  All bodies are empty.
 */

/*
 * copyout_noerr, copyin_noerr: copy a block of storage.  The regions must
 * not overlap (from + len <= to).  No fault handler is installed, so
 * these are to be called under on_fault().
 */

/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{
}

/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{
}

/*
 * uzero: zero a block of storage in user space.
 */

/* ARGSUSED */
void
uzero(void *addr, size_t count)
{
}

/*
 * ucopy: copy a block of storage within user space.
 */

/* ARGSUSED */
void
ucopy(const void *ufrom, void *uto, size_t ulength)
{
}

/*
 * ucopystr: copy a string within user space.
 */

/* ARGSUSED */
void
ucopystr(const char *ufrom, char *uto, size_t umaxlength, size_t *lencopied)
{
}
2835
2836#else /* __lint */
2837
2838#if defined(__amd64)
2839
/*
 * void copyin_noerr(const void *ufrom, void *kto, size_t count)
 *
 *	%rdi = ufrom, %rsi = kto; %rax is overwritten with kernelbase
 *
 * A ufrom that is not below kernelbase is replaced by kernelbase (to
 * force a fault there) and control continues at do_copy.  No fault
 * handler is installed by this routine.  The clamp is branch-free, in the
 * same form ucopy and ucopystr use; registers and flags on arrival at
 * do_copy are the same as with a compare-and-branch sequence.
 *
 * DEBUG kernels panic through call_panic when kto is below kernelbase.
 */
	ENTRY(copyin_noerr)
	movq	kernelbase(%rip), %rax
#ifdef DEBUG
	cmpq	%rax, %rsi		/* %rsi = kto */
	jae	1f
	leaq	.cpyin_ne_pmsg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
1:
#endif
	cmpq	%rax, %rdi		/* ufrom >= kernelbase? */
	cmovaeq	%rax, %rdi		/* yes: force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(copyin_noerr)
2854
/*
 * void copyout_noerr(const void *kfrom, void *uto, size_t count)
 *
 *	%rdi = kfrom, %rsi = uto; %rax is overwritten with kernelbase
 *
 * A uto that is not below kernelbase is replaced by kernelbase (to force
 * a fault there) and control continues at do_copy.  No fault handler is
 * installed by this routine.  The clamp is branch-free, in the same form
 * ucopy and ucopystr use; registers and flags on arrival at do_copy are
 * the same as with a compare-and-branch sequence.
 *
 * DEBUG kernels panic through call_panic when kfrom is below kernelbase.
 */
	ENTRY(copyout_noerr)
	movq	kernelbase(%rip), %rax
#ifdef DEBUG
	cmpq	%rax, %rdi		/* %rdi = kfrom */
	jae	1f
	leaq	.cpyout_ne_pmsg(%rip), %rdi
	jmp	call_panic		/* setup stack and call panic */
1:
#endif
	cmpq	%rax, %rsi		/* uto >= kernelbase? */
	cmovaeq	%rax, %rsi		/* yes: force fault at kernelbase */
	jmp	do_copy
	SET_SIZE(copyout_noerr)
2869
/*
 * void uzero(void *addr, size_t count)
 *
 *	%rdi = addr; %rax is overwritten with kernelbase
 *
 * An addr that is not below kernelbase is replaced by kernelbase (to
 * force a fault there) and control continues at do_zero.  No fault
 * handler is installed by this routine.  The clamp is branch-free, in the
 * same form ucopy uses; registers and flags on arrival at do_zero are the
 * same as with a compare-and-branch sequence.
 */
	ENTRY(uzero)
	movq	kernelbase(%rip), %rax
	cmpq	%rax, %rdi		/* addr >= kernelbase? */
	cmovaeq	%rax, %rdi		/* yes: force fault at kernelbase */
	jmp	do_zero
	SET_SIZE(uzero)
2877
/*
 * void ucopy(const void *ufrom, void *uto, size_t ulength)
 *
 *	%rdi = ufrom, %rsi = uto; %rax is overwritten with kernelbase
 *
 * Either pointer that is not below kernelbase is replaced by kernelbase
 * before control transfers to do_copy.  No fault handler is installed
 * by this routine.
 */
2878	ENTRY(ucopy)
2879	movq	kernelbase(%rip), %rax
2880	cmpq	%rax, %rdi
2881	cmovaeq	%rax, %rdi	/* force fault at kernelbase */
2882	cmpq	%rax, %rsi
2883	cmovaeq	%rax, %rsi	/* force fault at kernelbase */
2884	jmp	do_copy
2885	SET_SIZE(ucopy)
2886
/*
 * void ucopystr(const char *ufrom, char *uto, size_t umaxlength,
 *     size_t *lencopied)
 *
 *	%rdi = ufrom, %rsi = uto; %rax is overwritten with kernelbase
 *
 * Either pointer that is not below kernelbase is replaced by kernelbase.
 * The current T_LOFAULT value of the thread is then loaded into %r8 and
 * control transfers to do_copystr; this routine does not change
 * T_LOFAULT itself.
 *
 * NOTE(review): a frame is pushed here before the jump, so do_copystr is
 * presumably entered past its own prologue and unwinds this frame on
 * exit - confirm against copystr.
 */
2887	ENTRY(ucopystr)
2888	pushq	%rbp
2889	movq	%rsp, %rbp
2890	movq	kernelbase(%rip), %rax
2891	cmpq	%rax, %rdi
2892	cmovaeq	%rax, %rdi	/* force fault at kernelbase */
2893	cmpq	%rax, %rsi
2894	cmovaeq	%rax, %rsi	/* force fault at kernelbase */
2895	/* do_copystr expects lofault address in %r8 */
2896	movq	%gs:CPU_THREAD, %r8
2897	movq	T_LOFAULT(%r8), %r8
2898	jmp	do_copystr
2899	SET_SIZE(ucopystr)
2900
2901#elif defined(__i386)
2902
/*
 * void copyin_noerr(const void *ufrom, void *kto, size_t count)
 *
 *	4(%esp) = ufrom, 8(%esp) = kto; %eax is overwritten with kernelbase
 *
 * A ufrom that is not below kernelbase is overwritten in its stack slot
 * with kernelbase before control transfers to do_copy.  No fault handler
 * is installed by this routine.
 *
 * DEBUG kernels call panic when kto is below kernelbase.
 */
2903	ENTRY(copyin_noerr)
2904	movl	kernelbase, %eax
2905#ifdef DEBUG
2906	cmpl	%eax, 8(%esp)	/* kto must not be below kernelbase */
2907	jae	1f
2908	pushl	$.cpyin_ne_pmsg
2909	call	panic
29101:
2910#endif
2911	cmpl	%eax, 4(%esp)	/* ufrom < kernelbase */
2912	jb	do_copy
2913	movl	%eax, 4(%esp)	/* force fault at kernelbase */
2914	jmp	do_copy
2915	SET_SIZE(copyin_noerr)
2917
/*
 * void copyout_noerr(const void *kfrom, void *uto, size_t count)
 *
 *	4(%esp) = kfrom, 8(%esp) = uto; %eax is overwritten with kernelbase
 *
 * A uto that is not below kernelbase is overwritten in its stack slot
 * with kernelbase before control transfers to do_copy.  No fault handler
 * is installed by this routine.
 *
 * DEBUG kernels call panic when kfrom is below kernelbase.
 */
2918	ENTRY(copyout_noerr)
2919	movl	kernelbase, %eax
2920#ifdef DEBUG
2921	cmpl	%eax, 4(%esp)	/* kfrom must not be below kernelbase */
2922	jae	1f
2923	pushl	$.cpyout_ne_pmsg
2924	call	panic
29251:
2926#endif
2927	cmpl	%eax, 8(%esp)	/* uto < kernelbase */
2928	jb	do_copy
2929	movl	%eax, 8(%esp)	/* force fault at kernelbase */
2930	jmp	do_copy
2931	SET_SIZE(copyout_noerr)
2932
/*
 * void uzero(void *addr, size_t count)
 *
 *	4(%esp) = addr; %eax is overwritten with kernelbase
 *
 * An addr that is not below kernelbase is overwritten in its stack slot
 * with kernelbase before control transfers to do_zero.  No fault handler
 * is installed by this routine.
 */
2933	ENTRY(uzero)
2934	movl	kernelbase, %eax
2935	cmpl	%eax, 4(%esp)	/* addr < kernelbase */
2936	jb	do_zero
2937	movl	%eax, 4(%esp)	/* force fault at kernelbase */
2938	jmp	do_zero
2939	SET_SIZE(uzero)
2940
/*
 * void ucopy(const void *ufrom, void *uto, size_t ulength)
 *
 *	4(%esp) = ufrom, 8(%esp) = uto; %eax is overwritten with kernelbase
 *
 * Either pointer that is not below kernelbase is overwritten in its
 * stack slot with kernelbase before control transfers to do_copy.  No
 * fault handler is installed by this routine.
 */
2941	ENTRY(ucopy)
2942	movl	kernelbase, %eax
2943	cmpl	%eax, 4(%esp)	/* ufrom < kernelbase */
2944	jb	1f
2945	movl	%eax, 4(%esp)	/* force fault at kernelbase */
29461:
2947	cmpl	%eax, 8(%esp)	/* uto < kernelbase */
2948	jb	do_copy
2949	movl	%eax, 8(%esp)	/* force fault at kernelbase */
2950	jmp	do_copy
2951	SET_SIZE(ucopy)
2952
/*
 * void ucopystr(const char *ufrom, char *uto, size_t umaxlength,
 *     size_t *lencopied)
 *
 *	4(%esp) = ufrom, 8(%esp) = uto
 *
 * Either pointer that is not below kernelbase is overwritten in its
 * stack slot with kernelbase.  %eax first holds kernelbase and is then
 * reloaded with the current T_LOFAULT value of the thread, which is what
 * do_copystr receives; this routine does not change T_LOFAULT itself.
 */
2953	ENTRY(ucopystr)
2954	movl	kernelbase, %eax
2955	cmpl	%eax, 4(%esp)	/* ufrom < kernelbase */
2956	jb	1f
2957	movl	%eax, 4(%esp)	/* force fault at kernelbase */
29581:
2959	cmpl	%eax, 8(%esp)	/* uto < kernelbase */
2960	jb	2f
2961	movl	%eax, 8(%esp)	/* force fault at kernelbase */
29622:
2963	/* do_copystr expects the lofault address in %eax */
2964	movl	%gs:CPU_THREAD, %eax
2965	movl	T_LOFAULT(%eax), %eax
2966	jmp	do_copystr
2967	SET_SIZE(ucopystr)
2968
2969#endif	/* __i386 */
2970
#ifdef DEBUG
/*
 * Panic message strings, assembled into DEBUG kernels only.  Each label
 * is local to this file.  .cpyin_ne_pmsg and .cpyout_ne_pmsg are used by
 * the DEBUG argument checks in copyin_noerr and copyout_noerr.
 */
	.data

.kcopy_panic_msg:
	.string	"kcopy: arguments below kernelbase"
.bcopy_panic_msg:
	.string	"bcopy: arguments below kernelbase"
.kzero_panic_msg:
	.string	"kzero: arguments below kernelbase"
.bzero_panic_msg:
	.string	"bzero: arguments below kernelbase"

.copyin_panic_msg:
	.string	"copyin: kaddr argument below kernelbase"
.xcopyin_panic_msg:
	.string	"xcopyin: kaddr argument below kernelbase"
.copyout_panic_msg:
	.string	"copyout: kaddr argument below kernelbase"
.xcopyout_panic_msg:
	.string	"xcopyout: kaddr argument below kernelbase"

.copystr_panic_msg:
	.string	"copystr: arguments in user space"
.copyinstr_panic_msg:
	.string	"copyinstr: kaddr argument not in kernel address space"
.copyoutstr_panic_msg:
	.string	"copyoutstr: kaddr argument not in kernel address space"

.cpyin_ne_pmsg:
	.string	"copyin_noerr: argument not in kernel address space"
.cpyout_ne_pmsg:
	.string	"copyout_noerr: argument not in kernel address space"
#endif	/* DEBUG */
3000
3001#endif	/* __lint */
3002