xref: /illumos-gate/usr/src/uts/intel/ml/copy.S (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright (c) 2009, Intel Corporation
28 * All rights reserved.
29 */
30
31/*       Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
32/*       Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
33/*         All Rights Reserved						*/
34
35/*       Copyright (c) 1987, 1988 Microsoft Corporation			*/
36/*         All Rights Reserved						*/
37
38/*
39 * Copyright 2020 Joyent, Inc.
40 */
41
42#include <sys/errno.h>
43#include <sys/asm_linkage.h>
44
45#include "assym.h"
46
47#define	KCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
48#define	XCOPY_MIN_SIZE	128	/* Must be >= 16 bytes */
49/*
50 * Non-temporal access (NTA) alignment requirement
51 */
52#define	NTA_ALIGN_SIZE	4	/* Must be at least 4-byte aligned */
53#define	NTA_ALIGN_MASK	_CONST(NTA_ALIGN_SIZE-1)
54#define	COUNT_ALIGN_SIZE	16	/* Must be at least 16-byte aligned */
55#define	COUNT_ALIGN_MASK	_CONST(COUNT_ALIGN_SIZE-1)
56
57/*
58 * With the introduction of Broadwell, Intel has introduced supervisor mode
59 * access protection -- SMAP. SMAP forces the kernel to set certain bits to
60 * enable access of user pages (AC in rflags, defined as PS_ACHK in
61 * <sys/psw.h>). One of the challenges is that many of the userland
62 * copy routines directly use the kernel ones. For example, copyin and
63 * copyout simply go and jump to the do_copy_fault label and traditionally let
64 * those deal with the return for them. In fact, changing that is a can of frame
65 * pointers.
66 *
67 * Rules and Constraints:
68 *
69 * 1. For anything that's not in copy.s, we have it do explicit smap_disable()
70 * or smap_enable() calls.  This is restricted to the following three places:
71 * DTrace, resume() in swtch.s and on_fault/no_fault. If you want to add it
72 * somewhere else, we should be thinking twice.
73 *
74 * 2. We try to keep the window in which SMAP is disabled as small as
75 * possible. This means that if we take a fault, or need to fall back to a
76 * copyop in copyin() or copyout() or any other function, we will always
77 * leave with SMAP enabled (the kernel cannot access user pages).
78 *
79 * 3. None of the *_noerr() or ucopy/uzero routines should toggle SMAP. They are
80 * explicitly only allowed to be called while in an on_fault()/no_fault() handler,
81 * which already takes care of ensuring that SMAP is enabled and disabled. Note
82 * this means that when under an on_fault()/no_fault() handler, one must not
83 * call the non-*_noerr() routines.
84 *
85 * 4. The first thing we should do after coming out of an lofault handler is to
86 * make sure that we call smap_enable() again to ensure that we are safely
87 * protected, as more often than not, we will have disabled smap to get there.
88 *
89 * 5. smap_enable() and smap_disable() don't exist as callable functions:
90 * calls to them generate runtime relocations that are then turned into the
91 * necessary clac/stac instructions via krtld's hotinlines and hotinline_smap().
92 *
93 * 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and
94 * SMAP_DISABLE_INSTR macro should be used. If the number of these is changed,
95 * you must update the constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below.
96 *
97 * 7. Generally this .s file is processed by a K&R style cpp. This means that it
98 * really has a lot of feelings about whitespace. In particular, if you have a
99 * macro FOO with the arguments FOO(1, 3), the second argument is in fact ' 3'.
100 *
101 * 8. In general, the kernel has its own value for rflags that gets used. This
102 * is maintained in a few different places which vary based on how the thread
103 * comes into existence and whether it's a user thread. In general, when the
104 * kernel takes a trap, it will always set rflags to a known set of flags,
105 * mainly as part of ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that
106 * PS_ACHK is cleared for us. In addition, when using the sysenter instruction,
107 * we mask PS_ACHK off via the AMD_SFMASK MSR. See init_cpu_syscall() for
108 * where that gets masked off.
109 */
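
/*
 * Illustrative sketch (an editorial addition, not from the original source):
 * rule 3 above means a *_noerr() routine is only legal under an on_fault()
 * handler, roughly:
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	(hypothetical error path)
 *	}
 *	copyin_noerr(uaddr, kaddr, len);
 *	no_fault();
 */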
110
111/*
112 * The optimal 64-bit bcopy and kcopy for modern x86 processors use
113 * "rep smovq" for large sizes. Performance data shows that many calls to
114 * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance at
115 * these small sizes, unrolled code is used. For medium sizes, loops writing
116 * 64 bytes per iteration are used. Transition points were determined experimentally.
117 */
118#define BZERO_USE_REP	(1024)
119#define BCOPY_DFLT_REP	(128)
120#define	BCOPY_NHM_REP	(768)
121
122/*
123 * Copy a block of storage, returning an error code if `from' or
124 * `to' takes a kernel pagefault which cannot be resolved.
125 * Returns errno value on pagefault error, 0 if all ok
126 */
127
128/*
129 * I'm sorry about these macros, but copy.s is unsurprisingly sensitive to
130 * additional call instructions.
131 */
132#define	SMAP_DISABLE_COUNT	16
133#define	SMAP_ENABLE_COUNT	26
134
135#define	SMAP_DISABLE_INSTR(ITER)		\
136	.globl	_smap_disable_patch_##ITER;	\
137	_smap_disable_patch_##ITER##:;	\
138	nop; nop; nop;
139
140#define	SMAP_ENABLE_INSTR(ITER)			\
141	.globl	_smap_enable_patch_##ITER;	\
142	_smap_enable_patch_##ITER##:;	\
143	nop; nop; nop;
144
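/*
 * Editorial note (partly an assumption): each SMAP_DISABLE_INSTR or
 * SMAP_ENABLE_INSTR site reserves a three-byte window of nops at a global
 * _smap_*_patch_N label.  On SMAP-capable CPUs, boot-time patching rewrites
 * these windows with the three-byte stac (disable sites) or clac (enable
 * sites) instruction; the _smap_enable_patch_count/_smap_disable_patch_count
 * objects at the end of this file publish how many such sites exist.
 */
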
145	.globl	kernelbase
146	.globl	postbootkernelbase
147
148	ENTRY(kcopy)
149	pushq	%rbp
150	movq	%rsp, %rbp
151#ifdef DEBUG
152	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
153	jb	0f
154	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
155	jnb	1f
1560:	leaq	.kcopy_panic_msg(%rip), %rdi
157	xorl	%eax, %eax
158	call	panic
1591:
160#endif
161	/*
162	 * pass lofault value as 4th argument to do_copy_fault
163	 */
164	leaq	_kcopy_copyerr(%rip), %rcx
165	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
166
167do_copy_fault:
168	movq	T_LOFAULT(%r9), %r11	/* save the current lofault */
169	movq	%rcx, T_LOFAULT(%r9)	/* new lofault */
170	call	bcopy_altentry
171	xorl	%eax, %eax		/* return 0 (success) */
172	SMAP_ENABLE_INSTR(0)
173
174	/*
175	 * A fault during do_copy_fault is indicated through an errno value
176	 * in %rax and we iretq from the trap handler to here.
177	 */
178_kcopy_copyerr:
179	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
180	leave
181	ret
182	SET_SIZE(kcopy)
183
184#undef	ARG_FROM
185#undef	ARG_TO
186#undef	ARG_COUNT
187
188#define	COPY_LOOP_INIT(src, dst, cnt)	\
189	addq	cnt, src;			\
190	addq	cnt, dst;			\
191	shrq	$3, cnt;			\
192	neg	cnt
193
194	/* Copy 16 bytes per loop.  Uses %rax and %r8 */
195#define	COPY_LOOP_BODY(src, dst, cnt)	\
196	prefetchnta	0x100(src, cnt, 8);	\
197	movq	(src, cnt, 8), %rax;		\
198	movq	0x8(src, cnt, 8), %r8;		\
199	movnti	%rax, (dst, cnt, 8);		\
200	movnti	%r8, 0x8(dst, cnt, 8);		\
201	addq	$2, cnt
202
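	/*
	 * Editorial note: COPY_LOOP_INIT advances src and dst past the end of
	 * the buffer, converts cnt from bytes to quadwords and negates it, so
	 * COPY_LOOP_BODY can index with (src, cnt, 8) from -N up toward zero.
	 * The trailing addq sets ZF when cnt reaches zero, ending the jnz
	 * loop.  For example, cnt = 128 bytes becomes cnt = -16; the body then
	 * runs 8 times, moving two quadwords (16 bytes) per iteration.
	 */
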
203	ENTRY(kcopy_nta)
204	pushq	%rbp
205	movq	%rsp, %rbp
206#ifdef DEBUG
207	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
208	jb	0f
209	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
210	jnb	1f
2110:	leaq	.kcopy_panic_msg(%rip), %rdi
212	xorl	%eax, %eax
213	call	panic
2141:
215#endif
216
217	movq	%gs:CPU_THREAD, %r9
218	cmpq	$0, %rcx		/* No non-temporal access? */
219	/*
220	 * pass lofault value as 4th argument to do_copy_fault
221	 */
222	leaq	_kcopy_nta_copyerr(%rip), %rcx	/* doesn't set rflags */
223	jnz	do_copy_fault		/* use regular access */
224	/*
225	 * Make sure cnt is >= KCOPY_MIN_SIZE
226	 */
227	cmpq	$KCOPY_MIN_SIZE, %rdx
228	jb	do_copy_fault
229
230	/*
231	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
232	 * count is COUNT_ALIGN_SIZE aligned.
233	 */
234	movq	%rdi, %r10
235	orq	%rsi, %r10
236	andq	$NTA_ALIGN_MASK, %r10
237	orq	%rdx, %r10
238	andq	$COUNT_ALIGN_MASK, %r10
239	jnz	do_copy_fault
240
241	ALTENTRY(do_copy_fault_nta)
242	movq    %gs:CPU_THREAD, %r9     /* %r9 = thread addr */
243	movq    T_LOFAULT(%r9), %r11    /* save the current lofault */
244	movq    %rcx, T_LOFAULT(%r9)    /* new lofault */
245
246	/*
247	 * COPY_LOOP_BODY uses %rax and %r8
248	 */
249	COPY_LOOP_INIT(%rdi, %rsi, %rdx)
2502:	COPY_LOOP_BODY(%rdi, %rsi, %rdx)
251	jnz	2b
252
253	mfence
254	xorl	%eax, %eax		/* return 0 (success) */
255	SMAP_ENABLE_INSTR(1)
256
257_kcopy_nta_copyerr:
258	movq	%r11, T_LOFAULT(%r9)    /* restore original lofault */
259	leave
260	ret
261	SET_SIZE(do_copy_fault_nta)
262	SET_SIZE(kcopy_nta)
263
264	ENTRY(bcopy)
265#ifdef DEBUG
266	orq	%rdx, %rdx		/* %rdx = count */
267	jz	1f
268	cmpq	postbootkernelbase(%rip), %rdi		/* %rdi = from */
269	jb	0f
270	cmpq	postbootkernelbase(%rip), %rsi		/* %rsi = to */
271	jnb	1f
2720:	leaq	.bcopy_panic_msg(%rip), %rdi
273	jmp	call_panic		/* setup stack and call panic */
2741:
275#endif
276	/*
277	 * bcopy_altentry() is called from kcopy, i.e., do_copy_fault.
278	 * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy
279	 * uses these registers in future they must be saved and restored.
280	 */
281	ALTENTRY(bcopy_altentry)
282do_copy:
283#define	L(s) .bcopy##s
284	cmpq	$0x50, %rdx		/* 80 */
285	jae	bcopy_ck_size
286
287	/*
288	 * Performance data shows that many callers copy small buffers. So for
289	 * best perf for these sizes unrolled code is used. Store data without
290	 * worrying about alignment.
291	 */
292	leaq	L(fwdPxQx)(%rip), %r10
293	addq	%rdx, %rdi
294	addq	%rdx, %rsi
295	movslq	(%r10,%rdx,4), %rcx
296	leaq	(%rcx,%r10,1), %r10
297	INDIRECT_JMP_REG(r10)
298
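	/*
	 * Editorial note on the dispatch above: %rdx is the remaining count
	 * (0..79).  Both src and dst have already been advanced by %rdx, so
	 * the L(PxQy) fragments below copy with negative offsets.  The table
	 * entry at index %rdx is a 32-bit offset from L(fwdPxQx) to the
	 * fragment that copies exactly %rdx bytes as y quadwords plus x
	 * trailing bytes (count = 8 * y + x).
	 */
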
299	.p2align 4
300L(fwdPxQx):
301	.int       L(P0Q0)-L(fwdPxQx)	/* 0 */
302	.int       L(P1Q0)-L(fwdPxQx)
303	.int       L(P2Q0)-L(fwdPxQx)
304	.int       L(P3Q0)-L(fwdPxQx)
305	.int       L(P4Q0)-L(fwdPxQx)
306	.int       L(P5Q0)-L(fwdPxQx)
307	.int       L(P6Q0)-L(fwdPxQx)
308	.int       L(P7Q0)-L(fwdPxQx)
309
310	.int       L(P0Q1)-L(fwdPxQx)	/* 8 */
311	.int       L(P1Q1)-L(fwdPxQx)
312	.int       L(P2Q1)-L(fwdPxQx)
313	.int       L(P3Q1)-L(fwdPxQx)
314	.int       L(P4Q1)-L(fwdPxQx)
315	.int       L(P5Q1)-L(fwdPxQx)
316	.int       L(P6Q1)-L(fwdPxQx)
317	.int       L(P7Q1)-L(fwdPxQx)
318
319	.int       L(P0Q2)-L(fwdPxQx)	/* 16 */
320	.int       L(P1Q2)-L(fwdPxQx)
321	.int       L(P2Q2)-L(fwdPxQx)
322	.int       L(P3Q2)-L(fwdPxQx)
323	.int       L(P4Q2)-L(fwdPxQx)
324	.int       L(P5Q2)-L(fwdPxQx)
325	.int       L(P6Q2)-L(fwdPxQx)
326	.int       L(P7Q2)-L(fwdPxQx)
327
328	.int       L(P0Q3)-L(fwdPxQx)	/* 24 */
329	.int       L(P1Q3)-L(fwdPxQx)
330	.int       L(P2Q3)-L(fwdPxQx)
331	.int       L(P3Q3)-L(fwdPxQx)
332	.int       L(P4Q3)-L(fwdPxQx)
333	.int       L(P5Q3)-L(fwdPxQx)
334	.int       L(P6Q3)-L(fwdPxQx)
335	.int       L(P7Q3)-L(fwdPxQx)
336
337	.int       L(P0Q4)-L(fwdPxQx)	/* 32 */
338	.int       L(P1Q4)-L(fwdPxQx)
339	.int       L(P2Q4)-L(fwdPxQx)
340	.int       L(P3Q4)-L(fwdPxQx)
341	.int       L(P4Q4)-L(fwdPxQx)
342	.int       L(P5Q4)-L(fwdPxQx)
343	.int       L(P6Q4)-L(fwdPxQx)
344	.int       L(P7Q4)-L(fwdPxQx)
345
346	.int       L(P0Q5)-L(fwdPxQx)	/* 40 */
347	.int       L(P1Q5)-L(fwdPxQx)
348	.int       L(P2Q5)-L(fwdPxQx)
349	.int       L(P3Q5)-L(fwdPxQx)
350	.int       L(P4Q5)-L(fwdPxQx)
351	.int       L(P5Q5)-L(fwdPxQx)
352	.int       L(P6Q5)-L(fwdPxQx)
353	.int       L(P7Q5)-L(fwdPxQx)
354
355	.int       L(P0Q6)-L(fwdPxQx)	/* 48 */
356	.int       L(P1Q6)-L(fwdPxQx)
357	.int       L(P2Q6)-L(fwdPxQx)
358	.int       L(P3Q6)-L(fwdPxQx)
359	.int       L(P4Q6)-L(fwdPxQx)
360	.int       L(P5Q6)-L(fwdPxQx)
361	.int       L(P6Q6)-L(fwdPxQx)
362	.int       L(P7Q6)-L(fwdPxQx)
363
364	.int       L(P0Q7)-L(fwdPxQx)	/* 56 */
365	.int       L(P1Q7)-L(fwdPxQx)
366	.int       L(P2Q7)-L(fwdPxQx)
367	.int       L(P3Q7)-L(fwdPxQx)
368	.int       L(P4Q7)-L(fwdPxQx)
369	.int       L(P5Q7)-L(fwdPxQx)
370	.int       L(P6Q7)-L(fwdPxQx)
371	.int       L(P7Q7)-L(fwdPxQx)
372
373	.int       L(P0Q8)-L(fwdPxQx)	/* 64 */
374	.int       L(P1Q8)-L(fwdPxQx)
375	.int       L(P2Q8)-L(fwdPxQx)
376	.int       L(P3Q8)-L(fwdPxQx)
377	.int       L(P4Q8)-L(fwdPxQx)
378	.int       L(P5Q8)-L(fwdPxQx)
379	.int       L(P6Q8)-L(fwdPxQx)
380	.int       L(P7Q8)-L(fwdPxQx)
381
382	.int       L(P0Q9)-L(fwdPxQx)	/* 72 */
383	.int       L(P1Q9)-L(fwdPxQx)
384	.int       L(P2Q9)-L(fwdPxQx)
385	.int       L(P3Q9)-L(fwdPxQx)
386	.int       L(P4Q9)-L(fwdPxQx)
387	.int       L(P5Q9)-L(fwdPxQx)
388	.int       L(P6Q9)-L(fwdPxQx)
389	.int       L(P7Q9)-L(fwdPxQx)	/* 79 */
390
391	.p2align 4
392L(P0Q9):
393	mov    -0x48(%rdi), %rcx
394	mov    %rcx, -0x48(%rsi)
395L(P0Q8):
396	mov    -0x40(%rdi), %r10
397	mov    %r10, -0x40(%rsi)
398L(P0Q7):
399	mov    -0x38(%rdi), %r8
400	mov    %r8, -0x38(%rsi)
401L(P0Q6):
402	mov    -0x30(%rdi), %rcx
403	mov    %rcx, -0x30(%rsi)
404L(P0Q5):
405	mov    -0x28(%rdi), %r10
406	mov    %r10, -0x28(%rsi)
407L(P0Q4):
408	mov    -0x20(%rdi), %r8
409	mov    %r8, -0x20(%rsi)
410L(P0Q3):
411	mov    -0x18(%rdi), %rcx
412	mov    %rcx, -0x18(%rsi)
413L(P0Q2):
414	mov    -0x10(%rdi), %r10
415	mov    %r10, -0x10(%rsi)
416L(P0Q1):
417	mov    -0x8(%rdi), %r8
418	mov    %r8, -0x8(%rsi)
419L(P0Q0):
420	ret
421
422	.p2align 4
423L(P1Q9):
424	mov    -0x49(%rdi), %r8
425	mov    %r8, -0x49(%rsi)
426L(P1Q8):
427	mov    -0x41(%rdi), %rcx
428	mov    %rcx, -0x41(%rsi)
429L(P1Q7):
430	mov    -0x39(%rdi), %r10
431	mov    %r10, -0x39(%rsi)
432L(P1Q6):
433	mov    -0x31(%rdi), %r8
434	mov    %r8, -0x31(%rsi)
435L(P1Q5):
436	mov    -0x29(%rdi), %rcx
437	mov    %rcx, -0x29(%rsi)
438L(P1Q4):
439	mov    -0x21(%rdi), %r10
440	mov    %r10, -0x21(%rsi)
441L(P1Q3):
442	mov    -0x19(%rdi), %r8
443	mov    %r8, -0x19(%rsi)
444L(P1Q2):
445	mov    -0x11(%rdi), %rcx
446	mov    %rcx, -0x11(%rsi)
447L(P1Q1):
448	mov    -0x9(%rdi), %r10
449	mov    %r10, -0x9(%rsi)
450L(P1Q0):
451	movzbq -0x1(%rdi), %r8
452	mov    %r8b, -0x1(%rsi)
453	ret
454
455	.p2align 4
456L(P2Q9):
457	mov    -0x4a(%rdi), %r8
458	mov    %r8, -0x4a(%rsi)
459L(P2Q8):
460	mov    -0x42(%rdi), %rcx
461	mov    %rcx, -0x42(%rsi)
462L(P2Q7):
463	mov    -0x3a(%rdi), %r10
464	mov    %r10, -0x3a(%rsi)
465L(P2Q6):
466	mov    -0x32(%rdi), %r8
467	mov    %r8, -0x32(%rsi)
468L(P2Q5):
469	mov    -0x2a(%rdi), %rcx
470	mov    %rcx, -0x2a(%rsi)
471L(P2Q4):
472	mov    -0x22(%rdi), %r10
473	mov    %r10, -0x22(%rsi)
474L(P2Q3):
475	mov    -0x1a(%rdi), %r8
476	mov    %r8, -0x1a(%rsi)
477L(P2Q2):
478	mov    -0x12(%rdi), %rcx
479	mov    %rcx, -0x12(%rsi)
480L(P2Q1):
481	mov    -0xa(%rdi), %r10
482	mov    %r10, -0xa(%rsi)
483L(P2Q0):
484	movzwq -0x2(%rdi), %r8
485	mov    %r8w, -0x2(%rsi)
486	ret
487
488	.p2align 4
489L(P3Q9):
490	mov    -0x4b(%rdi), %r8
491	mov    %r8, -0x4b(%rsi)
492L(P3Q8):
493	mov    -0x43(%rdi), %rcx
494	mov    %rcx, -0x43(%rsi)
495L(P3Q7):
496	mov    -0x3b(%rdi), %r10
497	mov    %r10, -0x3b(%rsi)
498L(P3Q6):
499	mov    -0x33(%rdi), %r8
500	mov    %r8, -0x33(%rsi)
501L(P3Q5):
502	mov    -0x2b(%rdi), %rcx
503	mov    %rcx, -0x2b(%rsi)
504L(P3Q4):
505	mov    -0x23(%rdi), %r10
506	mov    %r10, -0x23(%rsi)
507L(P3Q3):
508	mov    -0x1b(%rdi), %r8
509	mov    %r8, -0x1b(%rsi)
510L(P3Q2):
511	mov    -0x13(%rdi), %rcx
512	mov    %rcx, -0x13(%rsi)
513L(P3Q1):
514	mov    -0xb(%rdi), %r10
515	mov    %r10, -0xb(%rsi)
516	/*
517	 * These trailing loads/stores have to do all their loads 1st,
518	 * then do the stores.
519	 */
520L(P3Q0):
521	movzwq -0x3(%rdi), %r8
522	movzbq -0x1(%rdi), %r10
523	mov    %r8w, -0x3(%rsi)
524	mov    %r10b, -0x1(%rsi)
525	ret
526
527	.p2align 4
528L(P4Q9):
529	mov    -0x4c(%rdi), %r8
530	mov    %r8, -0x4c(%rsi)
531L(P4Q8):
532	mov    -0x44(%rdi), %rcx
533	mov    %rcx, -0x44(%rsi)
534L(P4Q7):
535	mov    -0x3c(%rdi), %r10
536	mov    %r10, -0x3c(%rsi)
537L(P4Q6):
538	mov    -0x34(%rdi), %r8
539	mov    %r8, -0x34(%rsi)
540L(P4Q5):
541	mov    -0x2c(%rdi), %rcx
542	mov    %rcx, -0x2c(%rsi)
543L(P4Q4):
544	mov    -0x24(%rdi), %r10
545	mov    %r10, -0x24(%rsi)
546L(P4Q3):
547	mov    -0x1c(%rdi), %r8
548	mov    %r8, -0x1c(%rsi)
549L(P4Q2):
550	mov    -0x14(%rdi), %rcx
551	mov    %rcx, -0x14(%rsi)
552L(P4Q1):
553	mov    -0xc(%rdi), %r10
554	mov    %r10, -0xc(%rsi)
555L(P4Q0):
556	mov    -0x4(%rdi), %r8d
557	mov    %r8d, -0x4(%rsi)
558	ret
559
560	.p2align 4
561L(P5Q9):
562	mov    -0x4d(%rdi), %r8
563	mov    %r8, -0x4d(%rsi)
564L(P5Q8):
565	mov    -0x45(%rdi), %rcx
566	mov    %rcx, -0x45(%rsi)
567L(P5Q7):
568	mov    -0x3d(%rdi), %r10
569	mov    %r10, -0x3d(%rsi)
570L(P5Q6):
571	mov    -0x35(%rdi), %r8
572	mov    %r8, -0x35(%rsi)
573L(P5Q5):
574	mov    -0x2d(%rdi), %rcx
575	mov    %rcx, -0x2d(%rsi)
576L(P5Q4):
577	mov    -0x25(%rdi), %r10
578	mov    %r10, -0x25(%rsi)
579L(P5Q3):
580	mov    -0x1d(%rdi), %r8
581	mov    %r8, -0x1d(%rsi)
582L(P5Q2):
583	mov    -0x15(%rdi), %rcx
584	mov    %rcx, -0x15(%rsi)
585L(P5Q1):
586	mov    -0xd(%rdi), %r10
587	mov    %r10, -0xd(%rsi)
588L(P5Q0):
589	mov    -0x5(%rdi), %r8d
590	movzbq -0x1(%rdi), %r10
591	mov    %r8d, -0x5(%rsi)
592	mov    %r10b, -0x1(%rsi)
593	ret
594
595	.p2align 4
596L(P6Q9):
597	mov    -0x4e(%rdi), %r8
598	mov    %r8, -0x4e(%rsi)
599L(P6Q8):
600	mov    -0x46(%rdi), %rcx
601	mov    %rcx, -0x46(%rsi)
602L(P6Q7):
603	mov    -0x3e(%rdi), %r10
604	mov    %r10, -0x3e(%rsi)
605L(P6Q6):
606	mov    -0x36(%rdi), %r8
607	mov    %r8, -0x36(%rsi)
608L(P6Q5):
609	mov    -0x2e(%rdi), %rcx
610	mov    %rcx, -0x2e(%rsi)
611L(P6Q4):
612	mov    -0x26(%rdi), %r10
613	mov    %r10, -0x26(%rsi)
614L(P6Q3):
615	mov    -0x1e(%rdi), %r8
616	mov    %r8, -0x1e(%rsi)
617L(P6Q2):
618	mov    -0x16(%rdi), %rcx
619	mov    %rcx, -0x16(%rsi)
620L(P6Q1):
621	mov    -0xe(%rdi), %r10
622	mov    %r10, -0xe(%rsi)
623L(P6Q0):
624	mov    -0x6(%rdi), %r8d
625	movzwq -0x2(%rdi), %r10
626	mov    %r8d, -0x6(%rsi)
627	mov    %r10w, -0x2(%rsi)
628	ret
629
630	.p2align 4
631L(P7Q9):
632	mov    -0x4f(%rdi), %r8
633	mov    %r8, -0x4f(%rsi)
634L(P7Q8):
635	mov    -0x47(%rdi), %rcx
636	mov    %rcx, -0x47(%rsi)
637L(P7Q7):
638	mov    -0x3f(%rdi), %r10
639	mov    %r10, -0x3f(%rsi)
640L(P7Q6):
641	mov    -0x37(%rdi), %r8
642	mov    %r8, -0x37(%rsi)
643L(P7Q5):
644	mov    -0x2f(%rdi), %rcx
645	mov    %rcx, -0x2f(%rsi)
646L(P7Q4):
647	mov    -0x27(%rdi), %r10
648	mov    %r10, -0x27(%rsi)
649L(P7Q3):
650	mov    -0x1f(%rdi), %r8
651	mov    %r8, -0x1f(%rsi)
652L(P7Q2):
653	mov    -0x17(%rdi), %rcx
654	mov    %rcx, -0x17(%rsi)
655L(P7Q1):
656	mov    -0xf(%rdi), %r10
657	mov    %r10, -0xf(%rsi)
658L(P7Q0):
659	mov    -0x7(%rdi), %r8d
660	movzwq -0x3(%rdi), %r10
661	movzbq -0x1(%rdi), %rcx
662	mov    %r8d, -0x7(%rsi)
663	mov    %r10w, -0x3(%rsi)
664	mov    %cl, -0x1(%rsi)
665	ret
666
667	/*
668	 * For large sizes rep smovq is fastest.
669	 * Transition point determined experimentally as measured on
670	 * Intel Xeon processors (incl. Nehalem and previous generations) and
671	 * AMD Opteron. The transition value is patched into the code at boot
672	 * time to avoid the cost of a memory reference.
673	 */
674	.globl bcopy_patch_start
675bcopy_patch_start:
676	cmpq	$BCOPY_NHM_REP, %rdx
677	.globl bcopy_patch_end
678bcopy_patch_end:
679
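	/*
	 * Editorial note (the patching mechanism is an assumption; only the
	 * labels and the comment above appear in this file): the bytes
	 * between bcopy_patch_start and bcopy_patch_end hold the alternative
	 * compare against BCOPY_NHM_REP, which boot code presumably copies
	 * over the default "cmpq $BCOPY_DFLT_REP, %rdx" at bcopy_ck_size on
	 * CPUs where the higher rep smovq threshold performs better.
	 */
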
680	.p2align 4
681	ALTENTRY(bcopy_ck_size)
682
683	cmpq	$BCOPY_DFLT_REP, %rdx
684	jae	L(use_rep)
685
686	/*
687	 * Align to an 8-byte boundary. Avoids penalties from unaligned stores
688	 * as well as from stores spanning cachelines.
689	 */
690	test	$0x7, %rsi
691	jz	L(aligned_loop)
692	test	$0x1, %rsi
693	jz	2f
694	movzbq	(%rdi), %r8
695	dec	%rdx
696	inc	%rdi
697	mov	%r8b, (%rsi)
698	inc	%rsi
6992:
700	test	$0x2, %rsi
701	jz	4f
702	movzwq	(%rdi), %r8
703	sub	$0x2, %rdx
704	add	$0x2, %rdi
705	mov	%r8w, (%rsi)
706	add	$0x2, %rsi
7074:
708	test	$0x4, %rsi
709	jz	L(aligned_loop)
710	mov	(%rdi), %r8d
711	sub	$0x4, %rdx
712	add	$0x4, %rdi
713	mov	%r8d, (%rsi)
714	add	$0x4, %rsi
715
716	/*
717	 * Copy 64-bytes per loop
718	 */
719	.p2align 4
720L(aligned_loop):
721	mov	(%rdi), %r8
722	mov	0x8(%rdi), %r10
723	lea	-0x40(%rdx), %rdx
724	mov	%r8, (%rsi)
725	mov	%r10, 0x8(%rsi)
726	mov	0x10(%rdi), %rcx
727	mov	0x18(%rdi), %r8
728	mov	%rcx, 0x10(%rsi)
729	mov	%r8, 0x18(%rsi)
730
731	cmp	$0x40, %rdx
732	mov	0x20(%rdi), %r10
733	mov	0x28(%rdi), %rcx
734	mov	%r10, 0x20(%rsi)
735	mov	%rcx, 0x28(%rsi)
736	mov	0x30(%rdi), %r8
737	mov	0x38(%rdi), %r10
738	lea	0x40(%rdi), %rdi
739	mov	%r8, 0x30(%rsi)
740	mov	%r10, 0x38(%rsi)
741	lea	0x40(%rsi), %rsi
742	jae	L(aligned_loop)
743
744	/*
745	 * Copy remaining bytes (0-63)
746	 */
747L(do_remainder):
748	leaq	L(fwdPxQx)(%rip), %r10
749	addq	%rdx, %rdi
750	addq	%rdx, %rsi
751	movslq	(%r10,%rdx,4), %rcx
752	leaq	(%rcx,%r10,1), %r10
753	INDIRECT_JMP_REG(r10)
754
755	/*
756	 * Use rep smovq. Clear remainder via unrolled code
757	 */
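	/*
	 * Editorial note: bcopy's arguments arrive as %rdi = from and
	 * %rsi = to, but rep smovq copies from (%rsi) to (%rdi), hence the
	 * xchgq below.  rep smovq also advances both pointers by the bytes
	 * copied, so after swapping back, %rdi/%rsi point just past the
	 * copied data and L(do_remainder) can finish the last 0-7 bytes.
	 */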
758	.p2align 4
759L(use_rep):
760	xchgq	%rdi, %rsi		/* %rsi = source, %rdi = destination */
761	movq	%rdx, %rcx		/* %rcx = count */
762	shrq	$3, %rcx		/* 8-byte word count */
763	rep
764	  smovq
765
766	xchgq	%rsi, %rdi		/* %rdi = src, %rsi = destination */
767	andq	$7, %rdx		/* remainder */
768	jnz	L(do_remainder)
769	ret
770#undef	L
771	SET_SIZE(bcopy_ck_size)
772
773#ifdef DEBUG
774	/*
775	 * Setup frame on the run-time stack. The end of the input argument
776	 * area must be aligned on a 16-byte boundary. The stack pointer %rsp
777	 * always points to the end of the latest allocated stack frame.
778	 * panic(const char *format, ...) is a varargs function. When a
779	 * function taking variable arguments is called, %rax must be set
780	 * to eight times the number of floating point parameters passed
781	 * to the function in SSE registers.
782	 */
783call_panic:
784	pushq	%rbp			/* align stack properly */
785	movq	%rsp, %rbp
786	xorl	%eax, %eax		/* no variable arguments */
787	call	panic			/* %rdi = format string */
788#endif
789	SET_SIZE(bcopy_altentry)
790	SET_SIZE(bcopy)
791
792
793/*
794 * Zero a block of storage, returning an error code if we
795 * take a kernel pagefault which cannot be resolved.
796 * Returns errno value on pagefault error, 0 if all ok
797 */
798
799	ENTRY(kzero)
800#ifdef DEBUG
801        cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
802        jnb	0f
803        leaq	.kzero_panic_msg(%rip), %rdi
804	jmp	call_panic		/* setup stack and call panic */
8050:
806#endif
807	/*
808	 * pass lofault value as 3rd argument for fault return
809	 */
810	leaq	_kzeroerr(%rip), %rdx
811
812	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
813	movq	T_LOFAULT(%r9), %r11	/* save the current lofault */
814	movq	%rdx, T_LOFAULT(%r9)	/* new lofault */
815	call	bzero_altentry
816	xorl	%eax, %eax
817	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
818	ret
819	/*
820	 * A fault during bzero is indicated through an errno value
821	 * in %rax when we iretq to here.
822	 */
823_kzeroerr:
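	/*
	 * Editorial note: the fault happened inside the call to
	 * bzero_altentry, so the return address pushed by that call is still
	 * on the stack when the trap handler sends us here; discard it before
	 * returning from kzero itself.
	 */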
824	addq	$8, %rsp		/* pop bzero_altentry call ret addr */
825	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
826	ret
827	SET_SIZE(kzero)
828
829/*
830 * Zero a block of storage.
831 */
832
833	ENTRY(bzero)
834#ifdef DEBUG
835	cmpq	postbootkernelbase(%rip), %rdi	/* %rdi = addr */
836	jnb	0f
837	leaq	.bzero_panic_msg(%rip), %rdi
838	jmp	call_panic		/* setup stack and call panic */
8390:
840#endif
841	ALTENTRY(bzero_altentry)
842do_zero:
843#define	L(s) .bzero##s
844	xorl	%eax, %eax
845
846	cmpq	$0x50, %rsi		/* 80 */
847	jae	L(ck_align)
848
849	/*
850	 * Performance data shows that many callers zero small buffers. So
851	 * for best perf for these sizes unrolled code is used. Store zeros
852	 * without worrying about alignment.
853	 */
854	leaq	L(setPxQx)(%rip), %r10
855	addq	%rsi, %rdi
856	movslq	(%r10,%rsi,4), %rcx
857	leaq	(%rcx,%r10,1), %r10
858	INDIRECT_JMP_REG(r10)
859
860	.p2align 4
861L(setPxQx):
862	.int       L(P0Q0)-L(setPxQx)	/* 0 */
863	.int       L(P1Q0)-L(setPxQx)
864	.int       L(P2Q0)-L(setPxQx)
865	.int       L(P3Q0)-L(setPxQx)
866	.int       L(P4Q0)-L(setPxQx)
867	.int       L(P5Q0)-L(setPxQx)
868	.int       L(P6Q0)-L(setPxQx)
869	.int       L(P7Q0)-L(setPxQx)
870
871	.int       L(P0Q1)-L(setPxQx)	/* 8 */
872	.int       L(P1Q1)-L(setPxQx)
873	.int       L(P2Q1)-L(setPxQx)
874	.int       L(P3Q1)-L(setPxQx)
875	.int       L(P4Q1)-L(setPxQx)
876	.int       L(P5Q1)-L(setPxQx)
877	.int       L(P6Q1)-L(setPxQx)
878	.int       L(P7Q1)-L(setPxQx)
879
880	.int       L(P0Q2)-L(setPxQx)	/* 16 */
881	.int       L(P1Q2)-L(setPxQx)
882	.int       L(P2Q2)-L(setPxQx)
883	.int       L(P3Q2)-L(setPxQx)
884	.int       L(P4Q2)-L(setPxQx)
885	.int       L(P5Q2)-L(setPxQx)
886	.int       L(P6Q2)-L(setPxQx)
887	.int       L(P7Q2)-L(setPxQx)
888
889	.int       L(P0Q3)-L(setPxQx)	/* 24 */
890	.int       L(P1Q3)-L(setPxQx)
891	.int       L(P2Q3)-L(setPxQx)
892	.int       L(P3Q3)-L(setPxQx)
893	.int       L(P4Q3)-L(setPxQx)
894	.int       L(P5Q3)-L(setPxQx)
895	.int       L(P6Q3)-L(setPxQx)
896	.int       L(P7Q3)-L(setPxQx)
897
898	.int       L(P0Q4)-L(setPxQx)	/* 32 */
899	.int       L(P1Q4)-L(setPxQx)
900	.int       L(P2Q4)-L(setPxQx)
901	.int       L(P3Q4)-L(setPxQx)
902	.int       L(P4Q4)-L(setPxQx)
903	.int       L(P5Q4)-L(setPxQx)
904	.int       L(P6Q4)-L(setPxQx)
905	.int       L(P7Q4)-L(setPxQx)
906
907	.int       L(P0Q5)-L(setPxQx)	/* 40 */
908	.int       L(P1Q5)-L(setPxQx)
909	.int       L(P2Q5)-L(setPxQx)
910	.int       L(P3Q5)-L(setPxQx)
911	.int       L(P4Q5)-L(setPxQx)
912	.int       L(P5Q5)-L(setPxQx)
913	.int       L(P6Q5)-L(setPxQx)
914	.int       L(P7Q5)-L(setPxQx)
915
916	.int       L(P0Q6)-L(setPxQx)	/* 48 */
917	.int       L(P1Q6)-L(setPxQx)
918	.int       L(P2Q6)-L(setPxQx)
919	.int       L(P3Q6)-L(setPxQx)
920	.int       L(P4Q6)-L(setPxQx)
921	.int       L(P5Q6)-L(setPxQx)
922	.int       L(P6Q6)-L(setPxQx)
923	.int       L(P7Q6)-L(setPxQx)
924
925	.int       L(P0Q7)-L(setPxQx)	/* 56 */
926	.int       L(P1Q7)-L(setPxQx)
927	.int       L(P2Q7)-L(setPxQx)
928	.int       L(P3Q7)-L(setPxQx)
929	.int       L(P4Q7)-L(setPxQx)
930	.int       L(P5Q7)-L(setPxQx)
931	.int       L(P6Q7)-L(setPxQx)
932	.int       L(P7Q7)-L(setPxQx)
933
934	.int       L(P0Q8)-L(setPxQx)	/* 64 */
935	.int       L(P1Q8)-L(setPxQx)
936	.int       L(P2Q8)-L(setPxQx)
937	.int       L(P3Q8)-L(setPxQx)
938	.int       L(P4Q8)-L(setPxQx)
939	.int       L(P5Q8)-L(setPxQx)
940	.int       L(P6Q8)-L(setPxQx)
941	.int       L(P7Q8)-L(setPxQx)
942
943	.int       L(P0Q9)-L(setPxQx)	/* 72 */
944	.int       L(P1Q9)-L(setPxQx)
945	.int       L(P2Q9)-L(setPxQx)
946	.int       L(P3Q9)-L(setPxQx)
947	.int       L(P4Q9)-L(setPxQx)
948	.int       L(P5Q9)-L(setPxQx)
949	.int       L(P6Q9)-L(setPxQx)
950	.int       L(P7Q9)-L(setPxQx)	/* 79 */
951
952	.p2align 4
953L(P0Q9): mov    %rax, -0x48(%rdi)
954L(P0Q8): mov    %rax, -0x40(%rdi)
955L(P0Q7): mov    %rax, -0x38(%rdi)
956L(P0Q6): mov    %rax, -0x30(%rdi)
957L(P0Q5): mov    %rax, -0x28(%rdi)
958L(P0Q4): mov    %rax, -0x20(%rdi)
959L(P0Q3): mov    %rax, -0x18(%rdi)
960L(P0Q2): mov    %rax, -0x10(%rdi)
961L(P0Q1): mov    %rax, -0x8(%rdi)
962L(P0Q0):
963	 ret
964
965	.p2align 4
966L(P1Q9): mov    %rax, -0x49(%rdi)
967L(P1Q8): mov    %rax, -0x41(%rdi)
968L(P1Q7): mov    %rax, -0x39(%rdi)
969L(P1Q6): mov    %rax, -0x31(%rdi)
970L(P1Q5): mov    %rax, -0x29(%rdi)
971L(P1Q4): mov    %rax, -0x21(%rdi)
972L(P1Q3): mov    %rax, -0x19(%rdi)
973L(P1Q2): mov    %rax, -0x11(%rdi)
974L(P1Q1): mov    %rax, -0x9(%rdi)
975L(P1Q0): mov    %al, -0x1(%rdi)
976	 ret
977
978	.p2align 4
979L(P2Q9): mov    %rax, -0x4a(%rdi)
980L(P2Q8): mov    %rax, -0x42(%rdi)
981L(P2Q7): mov    %rax, -0x3a(%rdi)
982L(P2Q6): mov    %rax, -0x32(%rdi)
983L(P2Q5): mov    %rax, -0x2a(%rdi)
984L(P2Q4): mov    %rax, -0x22(%rdi)
985L(P2Q3): mov    %rax, -0x1a(%rdi)
986L(P2Q2): mov    %rax, -0x12(%rdi)
987L(P2Q1): mov    %rax, -0xa(%rdi)
988L(P2Q0): mov    %ax, -0x2(%rdi)
989	 ret
990
991	.p2align 4
992L(P3Q9): mov    %rax, -0x4b(%rdi)
993L(P3Q8): mov    %rax, -0x43(%rdi)
994L(P3Q7): mov    %rax, -0x3b(%rdi)
995L(P3Q6): mov    %rax, -0x33(%rdi)
996L(P3Q5): mov    %rax, -0x2b(%rdi)
997L(P3Q4): mov    %rax, -0x23(%rdi)
998L(P3Q3): mov    %rax, -0x1b(%rdi)
999L(P3Q2): mov    %rax, -0x13(%rdi)
1000L(P3Q1): mov    %rax, -0xb(%rdi)
1001L(P3Q0): mov    %ax, -0x3(%rdi)
1002	 mov    %al, -0x1(%rdi)
1003	 ret
1004
1005	.p2align 4
1006L(P4Q9): mov    %rax, -0x4c(%rdi)
1007L(P4Q8): mov    %rax, -0x44(%rdi)
1008L(P4Q7): mov    %rax, -0x3c(%rdi)
1009L(P4Q6): mov    %rax, -0x34(%rdi)
1010L(P4Q5): mov    %rax, -0x2c(%rdi)
1011L(P4Q4): mov    %rax, -0x24(%rdi)
1012L(P4Q3): mov    %rax, -0x1c(%rdi)
1013L(P4Q2): mov    %rax, -0x14(%rdi)
1014L(P4Q1): mov    %rax, -0xc(%rdi)
1015L(P4Q0): mov    %eax, -0x4(%rdi)
1016	 ret
1017
1018	.p2align 4
1019L(P5Q9): mov    %rax, -0x4d(%rdi)
1020L(P5Q8): mov    %rax, -0x45(%rdi)
1021L(P5Q7): mov    %rax, -0x3d(%rdi)
1022L(P5Q6): mov    %rax, -0x35(%rdi)
1023L(P5Q5): mov    %rax, -0x2d(%rdi)
1024L(P5Q4): mov    %rax, -0x25(%rdi)
1025L(P5Q3): mov    %rax, -0x1d(%rdi)
1026L(P5Q2): mov    %rax, -0x15(%rdi)
1027L(P5Q1): mov    %rax, -0xd(%rdi)
1028L(P5Q0): mov    %eax, -0x5(%rdi)
1029	 mov    %al, -0x1(%rdi)
1030	 ret
1031
1032	.p2align 4
1033L(P6Q9): mov    %rax, -0x4e(%rdi)
1034L(P6Q8): mov    %rax, -0x46(%rdi)
1035L(P6Q7): mov    %rax, -0x3e(%rdi)
1036L(P6Q6): mov    %rax, -0x36(%rdi)
1037L(P6Q5): mov    %rax, -0x2e(%rdi)
1038L(P6Q4): mov    %rax, -0x26(%rdi)
1039L(P6Q3): mov    %rax, -0x1e(%rdi)
1040L(P6Q2): mov    %rax, -0x16(%rdi)
1041L(P6Q1): mov    %rax, -0xe(%rdi)
1042L(P6Q0): mov    %eax, -0x6(%rdi)
1043	 mov    %ax, -0x2(%rdi)
1044	 ret
1045
1046	.p2align 4
1047L(P7Q9): mov    %rax, -0x4f(%rdi)
1048L(P7Q8): mov    %rax, -0x47(%rdi)
1049L(P7Q7): mov    %rax, -0x3f(%rdi)
1050L(P7Q6): mov    %rax, -0x37(%rdi)
1051L(P7Q5): mov    %rax, -0x2f(%rdi)
1052L(P7Q4): mov    %rax, -0x27(%rdi)
1053L(P7Q3): mov    %rax, -0x1f(%rdi)
1054L(P7Q2): mov    %rax, -0x17(%rdi)
1055L(P7Q1): mov    %rax, -0xf(%rdi)
1056L(P7Q0): mov    %eax, -0x7(%rdi)
1057	 mov    %ax, -0x3(%rdi)
1058	 mov    %al, -0x1(%rdi)
1059	 ret
1060
1061	/*
1062	 * Align to a 16-byte boundary. Avoids penalties from unaligned stores
1063	 * as well as from stores spanning cachelines. Note that 16-byte alignment
1064	 * is better in the case where rep sstoq is used.
1065	 */
1066	.p2align 4
1067L(ck_align):
1068	test	$0xf, %rdi
1069	jz	L(aligned_now)
1070	test	$1, %rdi
1071	jz	2f
1072	mov	%al, (%rdi)
1073	dec	%rsi
1074	lea	1(%rdi),%rdi
10752:
1076	test	$2, %rdi
1077	jz	4f
1078	mov	%ax, (%rdi)
1079	sub	$2, %rsi
1080	lea	2(%rdi),%rdi
10814:
1082	test	$4, %rdi
1083	jz	8f
1084	mov	%eax, (%rdi)
1085	sub	$4, %rsi
1086	lea	4(%rdi),%rdi
10878:
1088	test	$8, %rdi
1089	jz	L(aligned_now)
1090	mov	%rax, (%rdi)
1091	sub	$8, %rsi
1092	lea	8(%rdi),%rdi
1093
1094	/*
1095	 * For large sizes rep sstoq is fastest.
1096	 * Transition point determined experimentally as measured on
1097	 * Intel Xeon processors (incl. Nehalem) and AMD Opteron.
1098	 */
1099L(aligned_now):
1100	cmp	$BZERO_USE_REP, %rsi
1101	ja	L(use_rep)
1102
1103	/*
1104	 * zero 64-bytes per loop
1105	 */
1106	.p2align 4
1107L(bzero_loop):
1108	leaq	-0x40(%rsi), %rsi
1109	cmpq	$0x40, %rsi
1110	movq	%rax, (%rdi)
1111	movq	%rax, 0x8(%rdi)
1112	movq	%rax, 0x10(%rdi)
1113	movq	%rax, 0x18(%rdi)
1114	movq	%rax, 0x20(%rdi)
1115	movq	%rax, 0x28(%rdi)
1116	movq	%rax, 0x30(%rdi)
1117	movq	%rax, 0x38(%rdi)
1118	leaq	0x40(%rdi), %rdi
1119	jae	L(bzero_loop)
1120
1121	/*
1122	 * Clear any remaining bytes.
1123	 */
11249:
1125	leaq	L(setPxQx)(%rip), %r10
1126	addq	%rsi, %rdi
1127	movslq	(%r10,%rsi,4), %rcx
1128	leaq	(%rcx,%r10,1), %r10
1129	INDIRECT_JMP_REG(r10)
1130
1131	/*
1132	 * Use rep sstoq. Clear any remainder via unrolled code
1133	 */
1134	.p2align 4
1135L(use_rep):
1136	movq	%rsi, %rcx		/* get size in bytes */
1137	shrq	$3, %rcx		/* count of 8-byte words to zero */
1138	rep
1139	  sstoq				/* %rcx = words to clear (%rax=0) */
1140	andq	$7, %rsi		/* remaining bytes */
1141	jnz	9b
1142	ret
1143#undef	L
1144	SET_SIZE(bzero_altentry)
1145	SET_SIZE(bzero)
1146
1147/*
1148 * Transfer data to and from user space -
1149 * Note that these routines can cause faults.
1150 * It is assumed that the kernel has nothing below
1151 * KERNELBASE in the virtual address space.
1152 *
1153 * Note that copyin(9F) and copyout(9F) are part of the
1154 * DDI/DKI which specifies that they return '-1' on "errors."
1155 *
1156 * Sigh.
1157 *
1158 * So there are two extremely similar routines - xcopyin_nta() and
1159 * xcopyout_nta() which return the errno that we've faithfully computed.
1160 * This allows other callers (e.g. uiomove(9F)) to work correctly.
1161 * Given that these are used pretty heavily, we expand the calling
1162 * sequences inline for all flavours (rather than making wrappers).
1163 */
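
/*
 * Illustrative sketch (an editorial addition, not from the original source)
 * of the two return conventions as seen from C:
 *
 *	if (copyin(uaddr, kaddr, len) != 0)
 *		return (EFAULT);	(copyin only reports failure as -1)
 *
 *	int err = xcopyin_nta(uaddr, kaddr, len, 1);
 *	if (err != 0)
 *		return (err);		(xcopyin_nta reports the errno)
 *
 * A nonzero fourth argument forces the regular cached copy path, per the
 * %rcx test in xcopyin_nta below.
 */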
1164
1165/*
1166 * Copy user data to kernel space.
1167 */
1168
1169	ENTRY(copyin)
1170	pushq	%rbp
1171	movq	%rsp, %rbp
1172	subq	$24, %rsp
1173
1174	/*
1175	 * save args in case we trap and need to rerun as a copyop
1176	 */
1177	movq	%rdi, (%rsp)
1178	movq	%rsi, 0x8(%rsp)
1179	movq	%rdx, 0x10(%rsp)
1180
1181	movq	kernelbase(%rip), %rax
1182#ifdef DEBUG
1183	cmpq	%rax, %rsi		/* %rsi = kaddr */
1184	jnb	1f
1185	leaq	.copyin_panic_msg(%rip), %rdi
1186	xorl	%eax, %eax
1187	call	panic
11881:
1189#endif
1190	/*
1191	 * pass lofault value as 4th argument to do_copy_fault
1192	 */
1193	leaq	_copyin_err(%rip), %rcx
1194
1195	movq	%gs:CPU_THREAD, %r9
1196	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1197	jae	3f			/* take copyop if uaddr > kernelbase */
1198	SMAP_DISABLE_INSTR(0)
1199	jmp	do_copy_fault		/* Takes care of leave for us */
1200
1201_copyin_err:
1202	SMAP_ENABLE_INSTR(2)
1203	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
1204	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
12053:
1206	movq	T_COPYOPS(%r9), %rax
1207	cmpq	$0, %rax
1208	jz	2f
1209	/*
1210	 * reload args for the copyop
1211	 */
1212	movq	(%rsp), %rdi
1213	movq	0x8(%rsp), %rsi
1214	movq	0x10(%rsp), %rdx
1215	leave
1216	movq	CP_COPYIN(%rax), %rax
1217	INDIRECT_JMP_REG(rax)
1218
12192:	movl	$-1, %eax
1220	leave
1221	ret
1222	SET_SIZE(copyin)
1223
1224	ENTRY(xcopyin_nta)
1225	pushq	%rbp
1226	movq	%rsp, %rbp
1227	subq	$24, %rsp
1228
1229	/*
1230	 * save args in case we trap and need to rerun as a copyop
1231	 * %rcx is consumed in this routine so we don't need to save
1232	 * it.
1233	 */
1234	movq	%rdi, (%rsp)
1235	movq	%rsi, 0x8(%rsp)
1236	movq	%rdx, 0x10(%rsp)
1237
1238	movq	kernelbase(%rip), %rax
1239#ifdef DEBUG
1240	cmpq	%rax, %rsi		/* %rsi = kaddr */
1241	jnb	1f
1242	leaq	.xcopyin_panic_msg(%rip), %rdi
1243	xorl	%eax, %eax
1244	call	panic
12451:
1246#endif
1247	movq	%gs:CPU_THREAD, %r9
1248	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1249	jae	4f
1250	cmpq	$0, %rcx		/* No non-temporal access? */
1251	/*
1252	 * pass lofault value as 4th argument to do_copy_fault
1253	 */
1254	leaq	_xcopyin_err(%rip), %rcx	/* doesn't set rflags */
1255	jnz	6f			/* use regular access */
1256	/*
1257	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1258	 */
1259	cmpq	$XCOPY_MIN_SIZE, %rdx
1260	jae	5f
12616:
1262	SMAP_DISABLE_INSTR(1)
1263	jmp	do_copy_fault
1264
1265	/*
1266	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1267	 * count is COUNT_ALIGN_SIZE aligned.
1268	 */
12695:
1270	movq	%rdi, %r10
1271	orq	%rsi, %r10
1272	andq	$NTA_ALIGN_MASK, %r10
1273	orq	%rdx, %r10
1274	andq	$COUNT_ALIGN_MASK, %r10
1275	jnz	6b
1276	leaq	_xcopyin_nta_err(%rip), %rcx	/* doesn't set rflags */
1277	SMAP_DISABLE_INSTR(2)
1278	jmp	do_copy_fault_nta	/* use non-temporal access */
1279
12804:
1281	movl	$EFAULT, %eax
1282	jmp	3f
1283
1284	/*
1285	 * A fault during do_copy_fault or do_copy_fault_nta is
1286	 * indicated through an errno value in %rax and we iret from the
1287	 * trap handler to here.
1288	 */
1289_xcopyin_err:
1290	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
1291_xcopyin_nta_err:
1292	SMAP_ENABLE_INSTR(3)
1293	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
12943:
1295	movq	T_COPYOPS(%r9), %r8
1296	cmpq	$0, %r8
1297	jz	2f
1298
1299	/*
1300	 * reload args for the copyop
1301	 */
1302	movq	(%rsp), %rdi
1303	movq	0x8(%rsp), %rsi
1304	movq	0x10(%rsp), %rdx
1305	leave
1306	movq	CP_XCOPYIN(%r8), %r8
1307	INDIRECT_JMP_REG(r8)
1308
13092:	leave
1310	ret
1311	SET_SIZE(xcopyin_nta)
1312
1313/*
1314 * Copy kernel data to user space.
1315 */
1316
1317	ENTRY(copyout)
1318	pushq	%rbp
1319	movq	%rsp, %rbp
1320	subq	$24, %rsp
1321
1322	/*
1323	 * save args in case we trap and need to rerun as a copyop
1324	 */
1325	movq	%rdi, (%rsp)
1326	movq	%rsi, 0x8(%rsp)
1327	movq	%rdx, 0x10(%rsp)
1328
1329	movq	kernelbase(%rip), %rax
1330#ifdef DEBUG
1331	cmpq	%rax, %rdi		/* %rdi = kaddr */
1332	jnb	1f
1333	leaq	.copyout_panic_msg(%rip), %rdi
1334	xorl	%eax, %eax
1335	call	panic
13361:
1337#endif
1338	/*
1339	 * pass lofault value as 4th argument to do_copy_fault
1340	 */
1341	leaq	_copyout_err(%rip), %rcx
1342
1343	movq	%gs:CPU_THREAD, %r9
1344	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1345	jae	3f			/* take copyop if uaddr > kernelbase */
1346	SMAP_DISABLE_INSTR(3)
1347	jmp	do_copy_fault		/* Calls leave for us */
1348
1349_copyout_err:
1350	SMAP_ENABLE_INSTR(4)
1351	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
1352	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
13533:
1354	movq	T_COPYOPS(%r9), %rax
1355	cmpq	$0, %rax
1356	jz	2f
1357
1358	/*
1359	 * reload args for the copyop
1360	 */
1361	movq	(%rsp), %rdi
1362	movq	0x8(%rsp), %rsi
1363	movq	0x10(%rsp), %rdx
1364	leave
1365	movq	CP_COPYOUT(%rax), %rax
1366	INDIRECT_JMP_REG(rax)
1367
13682:	movl	$-1, %eax
1369	leave
1370	ret
1371	SET_SIZE(copyout)
1372
1373	ENTRY(xcopyout_nta)
1374	pushq	%rbp
1375	movq	%rsp, %rbp
1376	subq	$24, %rsp
1377
1378	/*
1379	 * save args in case we trap and need to rerun as a copyop
1380	 */
1381	movq	%rdi, (%rsp)
1382	movq	%rsi, 0x8(%rsp)
1383	movq	%rdx, 0x10(%rsp)
1384
1385	movq	kernelbase(%rip), %rax
1386#ifdef DEBUG
1387	cmpq	%rax, %rdi		/* %rdi = kaddr */
1388	jnb	1f
1389	leaq	.xcopyout_panic_msg(%rip), %rdi
1390	xorl	%eax, %eax
1391	call	panic
13921:
1393#endif
1394	movq	%gs:CPU_THREAD, %r9
1395	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1396	jae	4f
1397
1398	cmpq	$0, %rcx		/* No non-temporal access? */
1399	/*
1400	 * pass lofault value as 4th argument to do_copy_fault
1401	 */
1402	leaq	_xcopyout_err(%rip), %rcx
1403	jnz	6f
1404	/*
1405	 * Make sure cnt is >= XCOPY_MIN_SIZE bytes
1406	 */
1407	cmpq	$XCOPY_MIN_SIZE, %rdx
1408	jae	5f
14096:
1410	SMAP_DISABLE_INSTR(4)
1411	jmp	do_copy_fault
1412
1413	/*
1414	 * Make sure src and dst are NTA_ALIGN_SIZE aligned,
1415	 * count is COUNT_ALIGN_SIZE aligned.
1416	 */
14175:
1418	movq	%rdi, %r10
1419	orq	%rsi, %r10
1420	andq	$NTA_ALIGN_MASK, %r10
1421	orq	%rdx, %r10
1422	andq	$COUNT_ALIGN_MASK, %r10
1423	jnz	6b
1424	leaq	_xcopyout_nta_err(%rip), %rcx
1425	SMAP_DISABLE_INSTR(5)
1426	call	do_copy_fault_nta
1427	SMAP_ENABLE_INSTR(5)
1428	ret
1429
14304:
1431	movl	$EFAULT, %eax
1432	jmp	3f
1433
1434	/*
1435	 * A fault during do_copy_fault or do_copy_fault_nta is
1436	 * indicated through an errno value in %rax and we iret from the
1437	 * trap handler to here.
1438	 */
1439_xcopyout_err:
1440	addq	$8, %rsp		/* pop bcopy_altentry call ret addr */
1441_xcopyout_nta_err:
1442	SMAP_ENABLE_INSTR(6)
1443	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
14443:
1445	movq	T_COPYOPS(%r9), %r8
1446	cmpq	$0, %r8
1447	jz	2f
1448
1449	/*
1450	 * reload args for the copyop
1451	 */
1452	movq	(%rsp), %rdi
1453	movq	0x8(%rsp), %rsi
1454	movq	0x10(%rsp), %rdx
1455	leave
1456	movq	CP_XCOPYOUT(%r8), %r8
1457	INDIRECT_JMP_REG(r8)
1458
14592:	leave
1460	ret
1461	SET_SIZE(xcopyout_nta)
1462
1463/*
1464 * Copy a null terminated string from one point to another in
1465 * the kernel address space.
1466 */
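
/*
 * Editorial summary of the code below: copystr(from, to, maxlength,
 * lencopied) copies at most maxlength bytes, including the terminating NUL.
 * It returns 0 on success, or ENAMETOOLONG if no NUL was found within
 * maxlength bytes (or maxlength was 0).  If lencopied is non-NULL, it is
 * set to the number of bytes copied, including the NUL on success.
 */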
1467
1468	ENTRY(copystr)
1469	pushq	%rbp
1470	movq	%rsp, %rbp
1471#ifdef DEBUG
1472	movq	kernelbase(%rip), %rax
1473	cmpq	%rax, %rdi		/* %rdi = from */
1474	jb	0f
1475	cmpq	%rax, %rsi		/* %rsi = to */
1476	jnb	1f
14770:	leaq	.copystr_panic_msg(%rip), %rdi
1478	xorl	%eax, %eax
1479	call	panic
14801:
1481#endif
1482	movq	%gs:CPU_THREAD, %r9
1483	movq	T_LOFAULT(%r9), %r8	/* pass current lofault value as */
1484					/* 5th argument to do_copystr */
1485	xorl	%r10d,%r10d		/* pass smap restore need in %r10d */
1486					/* as a non-ABI 6th arg */
1487do_copystr:
1488	movq	%gs:CPU_THREAD, %r9	/* %r9 = thread addr */
1489	movq    T_LOFAULT(%r9), %r11	/* save the current lofault */
1490	movq	%r8, T_LOFAULT(%r9)	/* new lofault */
1491
1492	movq	%rdx, %r8		/* save maxlength */
1493
1494	cmpq	$0, %rdx		/* %rdx = maxlength */
1495	je	copystr_enametoolong	/* maxlength == 0 */
1496
1497copystr_loop:
1498	decq	%r8
1499	movb	(%rdi), %al
1500	incq	%rdi
1501	movb	%al, (%rsi)
1502	incq	%rsi
1503	cmpb	$0, %al
1504	je	copystr_null		/* null char */
1505	cmpq	$0, %r8
1506	jne	copystr_loop
1507
1508copystr_enametoolong:
1509	movl	$ENAMETOOLONG, %eax
1510	jmp	copystr_out
1511
1512copystr_null:
1513	xorl	%eax, %eax		/* no error */
1514
1515copystr_out:
1516	cmpq	$0, %rcx		/* want length? */
1517	je	copystr_smap		/* no */
1518	subq	%r8, %rdx		/* compute length and store it */
1519	movq	%rdx, (%rcx)
1520
1521copystr_smap:
1522	cmpl	$0, %r10d
1523	jz	copystr_done
1524	SMAP_ENABLE_INSTR(7)
1525
1526copystr_done:
1527	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
1528	leave
1529	ret
1530	SET_SIZE(copystr)
1531
1532/*
1533 * Copy a null terminated string from the user address space into
1534 * the kernel address space.
1535 */
1536
1537	ENTRY(copyinstr)
1538	pushq	%rbp
1539	movq	%rsp, %rbp
1540	subq	$32, %rsp
1541
1542	/*
1543	 * save args in case we trap and need to rerun as a copyop
1544	 */
1545	movq	%rdi, (%rsp)
1546	movq	%rsi, 0x8(%rsp)
1547	movq	%rdx, 0x10(%rsp)
1548	movq	%rcx, 0x18(%rsp)
1549
1550	movq	kernelbase(%rip), %rax
1551#ifdef DEBUG
1552	cmpq	%rax, %rsi		/* %rsi = kaddr */
1553	jnb	1f
1554	leaq	.copyinstr_panic_msg(%rip), %rdi
1555	xorl	%eax, %eax
1556	call	panic
15571:
1558#endif
1559	/*
1560	 * pass lofault value as 5th argument to do_copystr
1561	 * do_copystr expects whether or not we need smap in %r10d
1562	 * do_copystr expects, in %r10d, whether SMAP needs to be re-enabled
1563	leaq	_copyinstr_error(%rip), %r8
1564	movl	$1, %r10d
1565
1566	cmpq	%rax, %rdi		/* test uaddr < kernelbase */
1567	jae	4f
1568	SMAP_DISABLE_INSTR(6)
1569	jmp	do_copystr
15704:
1571	movq	%gs:CPU_THREAD, %r9
1572	jmp	3f
1573
1574_copyinstr_error:
1575	SMAP_ENABLE_INSTR(8)
1576	movq	%r11, T_LOFAULT(%r9)	/* restore original lofault */
15773:
1578	movq	T_COPYOPS(%r9), %rax
1579	cmpq	$0, %rax
1580	jz	2f
1581
1582	/*
1583	 * reload args for the copyop
1584	 */
1585	movq	(%rsp), %rdi
1586	movq	0x8(%rsp), %rsi
1587	movq	0x10(%rsp), %rdx
1588	movq	0x18(%rsp), %rcx
1589	leave
1590	movq	CP_COPYINSTR(%rax), %rax
1591	INDIRECT_JMP_REG(rax)
1592
15932:	movl	$EFAULT, %eax		/* return EFAULT */
1594	leave
1595	ret
1596	SET_SIZE(copyinstr)
1597
1598/*
1599 * Copy a null terminated string from the kernel
1600 * address space to the user address space.
1601 */
1602
1603	ENTRY(copyoutstr)
1604	pushq	%rbp
1605	movq	%rsp, %rbp
1606	subq	$32, %rsp
1607
1608	/*
1609	 * save args in case we trap and need to rerun as a copyop
1610	 */
1611	movq	%rdi, (%rsp)
1612	movq	%rsi, 0x8(%rsp)
1613	movq	%rdx, 0x10(%rsp)
1614	movq	%rcx, 0x18(%rsp)
1615
1616	movq	kernelbase(%rip), %rax
1617#ifdef DEBUG
1618	cmpq	%rax, %rdi		/* %rdi = kaddr */
1619	jnb	1f
1620	leaq	.copyoutstr_panic_msg(%rip), %rdi
1621	jmp	call_panic		/* setup stack and call panic */
16221:
1623#endif
1624	/*
1625	 * pass lofault value as 5th argument to do_copystr
1626	 * pass one as 6th argument to do_copystr in %r10d
1627	 */
1628	leaq	_copyoutstr_error(%rip), %r8
1629	movl	$1, %r10d
1630
1631	cmpq	%rax, %rsi		/* test uaddr < kernelbase */
1632	jae	4f
1633	SMAP_DISABLE_INSTR(7)
1634	jmp	do_copystr
16354:
1636	movq	%gs:CPU_THREAD, %r9
1637	jmp	3f
1638
1639_copyoutstr_error:
1640	SMAP_ENABLE_INSTR(9)
1641	movq	%r11, T_LOFAULT(%r9)	/* restore the original lofault */
16423:
1643	movq	T_COPYOPS(%r9), %rax
1644	cmpq	$0, %rax
1645	jz	2f
1646
1647	/*
1648	 * reload args for the copyop
1649	 */
1650	movq	(%rsp), %rdi
1651	movq	0x8(%rsp), %rsi
1652	movq	0x10(%rsp), %rdx
1653	movq	0x18(%rsp), %rcx
1654	leave
1655	movq	CP_COPYOUTSTR(%rax), %rax
1656	INDIRECT_JMP_REG(rax)
1657
16582:	movl	$EFAULT, %eax		/* return EFAULT */
1659	leave
1660	ret
1661	SET_SIZE(copyoutstr)
1662
1663/*
1664 * Since all of the fuword() variants are so similar, we have a macro to spit
1665 * them out.  This allows us to create DTrace-unobservable functions easily.
1666 */
1667
1668/*
1669 * Note that we don't save and reload the arguments here
1670 * because their values are not altered in the copy path.
1671 * Additionally, when successful, the smap_enable jmp will
1672 * actually return us to our original caller.
1673 */
1674
1675#define	FUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2)	\
1676	ENTRY(NAME)				\
1677	movq	%gs:CPU_THREAD, %r9;		\
1678	cmpq	kernelbase(%rip), %rdi;		\
1679	jae	1f;				\
1680	leaq	_flt_##NAME, %rdx;		\
1681	movq	%rdx, T_LOFAULT(%r9);		\
1682	SMAP_DISABLE_INSTR(DISNUM)		\
1683	INSTR	(%rdi), REG;			\
1684	movq	$0, T_LOFAULT(%r9);		\
1685	INSTR	REG, (%rsi);			\
1686	xorl	%eax, %eax;			\
1687	SMAP_ENABLE_INSTR(EN1)			\
1688	ret;					\
1689_flt_##NAME:					\
1690	SMAP_ENABLE_INSTR(EN2)			\
1691	movq	$0, T_LOFAULT(%r9);		\
16921:						\
1693	movq	T_COPYOPS(%r9), %rax;		\
1694	cmpq	$0, %rax;			\
1695	jz	2f;				\
1696	movq	COPYOP(%rax), %rax;		\
1697	INDIRECT_JMP_REG(rax);			\
16982:						\
1699	movl	$-1, %eax;			\
1700	ret;					\
1701	SET_SIZE(NAME)
1702
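/*
 * Editorial note: the DISNUM, EN1 and EN2 parameters below are the SMAP
 * patch-site indices handed to SMAP_DISABLE_INSTR/SMAP_ENABLE_INSTR.  Per
 * rule 6 in the block comment at the top of this file, adding or removing
 * such sites requires updating SMAP_DISABLE_COUNT/SMAP_ENABLE_COUNT.
 */
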
1703	FUWORD(fuword64, movq, %rax, CP_FUWORD64,8,10,11)
1704	FUWORD(fuword32, movl, %eax, CP_FUWORD32,9,12,13)
1705	FUWORD(fuword16, movw, %ax, CP_FUWORD16,10,14,15)
1706	FUWORD(fuword8, movb, %al, CP_FUWORD8,11,16,17)
1707
1708#undef	FUWORD
1709
1710/*
1711 * Set user word.
1712 */
1713
1714/*
1715 * Note that we don't save and reload the arguments here
1716 * because their values are not altered in the copy path.
1717 */
1718
1719#define	SUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2)	\
1720	ENTRY(NAME)				\
1721	movq	%gs:CPU_THREAD, %r9;		\
1722	cmpq	kernelbase(%rip), %rdi;		\
1723	jae	1f;				\
1724	leaq	_flt_##NAME, %rdx;		\
1725	SMAP_DISABLE_INSTR(DISNUM)		\
1726	movq	%rdx, T_LOFAULT(%r9);		\
1727	INSTR	REG, (%rdi);			\
1728	movq	$0, T_LOFAULT(%r9);		\
1729	xorl	%eax, %eax;			\
1730	SMAP_ENABLE_INSTR(EN1)			\
1731	ret;					\
1732_flt_##NAME:					\
1733	SMAP_ENABLE_INSTR(EN2)			\
1734	movq	$0, T_LOFAULT(%r9);		\
17351:						\
1736	movq	T_COPYOPS(%r9), %rax;		\
1737	cmpq	$0, %rax;			\
1738	jz	3f;				\
1739	movq	COPYOP(%rax), %rax;		\
1740	INDIRECT_JMP_REG(rax);			\
17413:						\
1742	movl	$-1, %eax;			\
1743	ret;					\
1744	SET_SIZE(NAME)
1745
1746	SUWORD(suword64, movq, %rsi, CP_SUWORD64,12,18,19)
1747	SUWORD(suword32, movl, %esi, CP_SUWORD32,13,20,21)
1748	SUWORD(suword16, movw, %si, CP_SUWORD16,14,22,23)
1749	SUWORD(suword8, movb, %sil, CP_SUWORD8,15,24,25)
1750
1751#undef	SUWORD
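
/*
 * Illustrative sketch (an editorial addition, not from the original source):
 * a typical C caller of the suword routines checks for the -1 failure
 * return, e.g.
 *
 *	if (suword32(uaddr, val) == -1)
 *		return (set_errno(EFAULT));	(hypothetical error handling)
 */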
1752
1753#define	FUWORD_NOERR(NAME, INSTR, REG)		\
1754	ENTRY(NAME)				\
1755	cmpq	kernelbase(%rip), %rdi;		\
1756	cmovnbq	kernelbase(%rip), %rdi;		\
1757	INSTR	(%rdi), REG;			\
1758	INSTR	REG, (%rsi);			\
1759	ret;					\
1760	SET_SIZE(NAME)
1761
1762	FUWORD_NOERR(fuword64_noerr, movq, %rax)
1763	FUWORD_NOERR(fuword32_noerr, movl, %eax)
1764	FUWORD_NOERR(fuword16_noerr, movw, %ax)
1765	FUWORD_NOERR(fuword8_noerr, movb, %al)
1766
1767#undef	FUWORD_NOERR
1768
1769#define	SUWORD_NOERR(NAME, INSTR, REG)		\
1770	ENTRY(NAME)				\
1771	cmpq	kernelbase(%rip), %rdi;		\
1772	cmovnbq	kernelbase(%rip), %rdi;		\
1773	INSTR	REG, (%rdi);			\
1774	ret;					\
1775	SET_SIZE(NAME)
1776
1777	SUWORD_NOERR(suword64_noerr, movq, %rsi)
1778	SUWORD_NOERR(suword32_noerr, movl, %esi)
1779	SUWORD_NOERR(suword16_noerr, movw, %si)
1780	SUWORD_NOERR(suword8_noerr, movb, %sil)
1781
1782#undef	SUWORD_NOERR
1783
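/*
 * Editorial note: in the *_noerr variants above, cmovnbq replaces any
 * address at or above kernelbase with kernelbase itself, so the access
 * forces a fault at kernelbase (the same trick is commented in copyin_noerr
 * and copyout_noerr below) and the caller's required on_fault() handler
 * takes over.
 */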
1784
1785	.weak	subyte
1786	subyte=suword8
1787	.weak	subyte_noerr
1788	subyte_noerr=suword8_noerr
1789
1790	.weak	fulword
1791	fulword=fuword64
1792	.weak	fulword_noerr
1793	fulword_noerr=fuword64_noerr
1794	.weak	sulword
1795	sulword=suword64
1796	.weak	sulword_noerr
1797	sulword_noerr=suword64_noerr
1798
1799	ENTRY(copyin_noerr)
1800	movq	kernelbase(%rip), %rax
1801#ifdef DEBUG
1802	cmpq	%rax, %rsi		/* %rsi = kto */
1803	jae	1f
1804	leaq	.cpyin_ne_pmsg(%rip), %rdi
1805	jmp	call_panic		/* setup stack and call panic */
18061:
1807#endif
1808	cmpq	%rax, %rdi		/* ufrom < kernelbase */
1809	jb	do_copy
1810	movq	%rax, %rdi		/* force fault at kernelbase */
1811	jmp	do_copy
1812	SET_SIZE(copyin_noerr)
1813
1814	ENTRY(copyout_noerr)
1815	movq	kernelbase(%rip), %rax
1816#ifdef DEBUG
1817	cmpq	%rax, %rdi		/* %rdi = kfrom */
1818	jae	1f
1819	leaq	.cpyout_ne_pmsg(%rip), %rdi
1820	jmp	call_panic		/* setup stack and call panic */
18211:
1822#endif
1823	cmpq	%rax, %rsi		/* uto < kernelbase */
1824	jb	do_copy
1825	movq	%rax, %rsi		/* force fault at kernelbase */
1826	jmp	do_copy
1827	SET_SIZE(copyout_noerr)
1828
1829	ENTRY(uzero)
1830	movq	kernelbase(%rip), %rax
1831	cmpq	%rax, %rdi
1832	jb	do_zero
1833	movq	%rax, %rdi	/* force fault at kernelbase */
1834	jmp	do_zero
1835	SET_SIZE(uzero)
1836
1837	ENTRY(ucopy)
1838	movq	kernelbase(%rip), %rax
1839	cmpq	%rax, %rdi
1840	cmovaeq	%rax, %rdi	/* force fault at kernelbase */
1841	cmpq	%rax, %rsi
1842	cmovaeq	%rax, %rsi	/* force fault at kernelbase */
1843	jmp	do_copy
1844	SET_SIZE(ucopy)
1845
1846	/*
1847	 * Note, the frame pointer is required here because do_copystr expects
1848	 * to be able to pop it off!
1849	 */
1850	ENTRY(ucopystr)
1851	pushq	%rbp
1852	movq	%rsp, %rbp
1853	movq	kernelbase(%rip), %rax
1854	cmpq	%rax, %rdi
1855	cmovaeq	%rax, %rdi	/* force fault at kernelbase */
1856	cmpq	%rax, %rsi
1857	cmovaeq	%rax, %rsi	/* force fault at kernelbase */
1858	/* do_copystr expects lofault address in %r8 */
1859	/* do_copystr expects, in %r10d, whether SMAP needs re-enabling */
1860	xorl	%r10d, %r10d
1861	movq	%gs:CPU_THREAD, %r8
1862	movq	T_LOFAULT(%r8), %r8
1863	jmp	do_copystr
1864	SET_SIZE(ucopystr)
1865
1866#ifdef DEBUG
1867	.data
1868.kcopy_panic_msg:
1869	.string "kcopy: arguments below kernelbase"
1870.bcopy_panic_msg:
1871	.string "bcopy: arguments below kernelbase"
1872.kzero_panic_msg:
1873        .string "kzero: arguments below kernelbase"
1874.bzero_panic_msg:
1875	.string	"bzero: arguments below kernelbase"
1876.copyin_panic_msg:
1877	.string "copyin: kaddr argument below kernelbase"
1878.xcopyin_panic_msg:
1879	.string	"xcopyin: kaddr argument below kernelbase"
1880.copyout_panic_msg:
1881	.string "copyout: kaddr argument below kernelbase"
1882.xcopyout_panic_msg:
1883	.string	"xcopyout: kaddr argument below kernelbase"
1884.copystr_panic_msg:
1885	.string	"copystr: arguments in user space"
1886.copyinstr_panic_msg:
1887	.string	"copyinstr: kaddr argument not in kernel address space"
1888.copyoutstr_panic_msg:
1889	.string	"copyoutstr: kaddr argument not in kernel address space"
1890.cpyin_ne_pmsg:
1891	.string "copyin_noerr: argument not in kernel address space"
1892.cpyout_ne_pmsg:
1893	.string "copyout_noerr: argument not in kernel address space"
1894#endif
1895
1896.data
1897.align	4
1898.globl	_smap_enable_patch_count
1899.type	_smap_enable_patch_count,@object
1900.size	_smap_enable_patch_count, 4
1901_smap_enable_patch_count:
1902	.long	SMAP_ENABLE_COUNT
1903
1904.globl	_smap_disable_patch_count
1905.type	_smap_disable_patch_count,@object
1906.size	_smap_disable_patch_count, 4
1907_smap_disable_patch_count:
1908	.long SMAP_DISABLE_COUNT
1909