xref: /illumos-gate/usr/src/uts/intel/os/fpu.c (revision 251becc882939aaf03088561add2c257a7a92424)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2021 Joyent, Inc.
24  * Copyright 2021 RackTop Systems, Inc.
25  * Copyright 2021 Oxide Computer Company
26  */
27 
28 /*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
29 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
30 /*		All Rights Reserved				*/
31 
32 /*	Copyright (c) 1987, 1988 Microsoft Corporation		*/
33 /*		All Rights Reserved				*/
34 
35 /*
36  * Copyright (c) 2009, Intel Corporation.
37  * All rights reserved.
38  */
39 
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/signal.h>
43 #include <sys/regset.h>
44 #include <sys/privregs.h>
45 #include <sys/psw.h>
46 #include <sys/trap.h>
47 #include <sys/fault.h>
48 #include <sys/systm.h>
49 #include <sys/user.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/pcb.h>
53 #include <sys/lwp.h>
54 #include <sys/cpuvar.h>
55 #include <sys/thread.h>
56 #include <sys/disp.h>
57 #include <sys/fp.h>
58 #include <sys/siginfo.h>
59 #include <sys/archsystm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <sys/x86_archext.h>
63 #include <sys/sysmacros.h>
64 #include <sys/cmn_err.h>
65 #include <sys/kfpu.h>
66 
67 /*
68  * FPU Management Overview
69  * -----------------------
70  *
71  * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
72  * however, many aspects of its life as a coprocessor are still around in x86.
73  *
74  * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
75  * While that state still exists, there is much more that is covered by the FPU.
76  * Today, this includes not just traditional FPU state, but also supervisor only
77  * state. The following state is currently managed and covered logically by the
78  * idea of the FPU registers:
79  *
80  *    o Traditional x87 FPU
81  *    o Vector Registers (%xmm, %ymm, %zmm)
82  *    o Memory Protection Extensions (MPX) Bounds Registers
83  *    o Protection Key Rights Register for User pages (PKRU)
84  *    o Processor Trace data
85  *
86  * The rest of this covers how the FPU is managed and controlled, how state is
87  * saved and restored between threads, interactions with hypervisors, and other
88  * information exported to user land through aux vectors. A lot of background
89  * information is here to synthesize major parts of the Intel SDM, but
90  * unfortunately, it is not a replacement for reading it.
91  *
92  * FPU Control Registers
93  * ---------------------
94  *
95  * Because the x87 FPU began its life as a co-processor and the FPU was
96  * optional there are several bits that show up in %cr0 that we have to
97  * manipulate when dealing with the FPU. These are:
98  *
99  *   o CR0.ET	The 'extension type' bit. This was used originally to indicate
100  *		that the FPU co-processor was present. Now it is forced on for
101  *		compatibility. This is often used to verify whether or not the
102  *		FPU is present.
103  *
104  *   o CR0.NE	The 'numeric error' bit. Used to indicate that native error
105  *		mode should be enabled. This indicates that we should take traps
106  *		on FPU errors. The OS enables this early in boot.
107  *
108  *   o CR0.MP	The 'Monitor Coprocessor' bit. Used to control whether or not
109  *		wait/fwait instructions generate a #NM if CR0.TS is set.
110  *
111  *   o CR0.EM	The 'Emulation' bit. This is used to cause floating point
112  *		operations (x87 through SSE4) to trap with a #UD so they can be
113  *		emulated. The system never sets this bit, but makes sure it is
114  *		clear on processor start up.
115  *
116  *   o CR0.TS	The 'Task Switched' bit. When this is turned on, a floating
117  *		point operation will generate a #NM. An fwait will as well,
118  *		depending on the value in CR0.MP.
119  *
120  * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
121  * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
122  * complicated role. Historically it has been used to allow running systems to
123  * restore the FPU registers lazily. This will be discussed in greater depth
124  * later on.
125  *
126  * %cr4 is also used as part of the FPU control. Specifically we need to worry
127  * about the following bits in the system:
128  *
129  *   o CR4.OSFXSR	This bit is used to indicate that the OS understands and
130  *			supports the execution of the fxsave and fxrstor
131  *			instructions. This bit is required to be set to enable
132  *			the use of the SSE->SSE4 instructions.
133  *
134  *   o CR4.OSXMMEXCPT	This bit is used to indicate that the OS can understand
135  *			and take a SIMD floating point exception (#XM). This bit
136  *			is always enabled by the system.
137  *
138  *   o CR4.OSXSAVE	This bit is used to indicate that the OS understands and
139  *			supports the execution of the xsave and xrstor family of
140  *			instructions. This bit is required to use any of the AVX
141  *			and newer feature sets.
142  *
143  * Because all supported processors are 64-bit, they'll always support the XMM
144  * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
145  * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
146  *
147  * %xcr0 is used to manage the behavior of the xsave feature set and is only
148  * present on the system if xsave is supported. %xcr0 is read and written
149  * through the xgetbv and xsetbv instructions. This register is present
150  * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
151  * different component of the xsave state and controls whether or not that
152  * information is saved and restored. For newer feature sets like AVX and MPX,
153  * it also controls whether or not the corresponding instructions can be
154  * executed (much like CR4.OSFXSR does for the SSE feature sets).
155  *
156  * Everything in %xcr0 is around features available to users. There is also the
157  * IA32_XSS MSR which is used to control supervisor-only features that are still
158  * part of the xsave state. Bits that can be set in %xcr0 are reserved in
159  * IA32_XSS and vice versa. This is an important property that is particularly
160  * relevant to how the xsave instructions operate.
161  *
162  * Save Mechanisms
163  * ---------------
164  *
165  * When switching between running threads the FPU state needs to be saved and
166  * restored by the OS. If this state was not saved, users would rightfully
167  * complain about corrupt state. There are three mechanisms that exist on the
168  * processor for saving and restoring these state images:
169  *
170  *   o fsave
171  *   o fxsave
172  *   o xsave
173  *
174  * fsave saves and restores only the x87 FPU and is the oldest of these
175  * mechanisms. This mechanism is never used in the kernel today because we are
176  * always running on systems that support fxsave.
177  *
178  * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
179  * state to be saved and restored to and from a struct fxsave_state. This is the
180  * default mechanism that is used to save and restore the FPU on amd64. An
181  * important aspect of fxsave that was different from the original i386 fsave
182  * mechanism is that the restoring of FPU state with pending exceptions will not
183  * generate an exception, it will be deferred to the next use of the FPU.
184  *
185  * The final and by far the most complex mechanism is that of the xsave set.
186  * xsave allows for saving and restoring all of the traditional x86 pieces (x87
187  * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
188  * registers.
189  *
190  * Data is saved and restored into and out of a struct xsave_state. The first
191  * part of the struct xsave_state is equivalent to the struct fxsave_state.
192  * After that, there is a header which is used to describe the remaining
193  * portions of the state. The header is a 64-byte value of which the first two
194  * uint64_t values are defined and the rest are reserved and must be zero. The
195  * first uint64_t is the xstate_bv member. This describes which values in the
196  * xsave_state are actually valid and present. This is updated on a save and
197  * used on restore. The second member is the xcomp_bv member. Its last bit
198  * determines whether or not a compressed version of the structure is used.
199  *
200  * When the uncompressed structure is used (currently the only format we
201  * support), then each state component is at a fixed offset in the structure,
202  * even if it is not being used. For example, if you only saved the AVX related
203  * state, but did not save the MPX related state, the offset would not change
204  * for any component. With the compressed format, components that aren't used
205  * are all elided (though the x87 and SSE state are always there).
206  *
207  * Unlike fxsave which saves all state, the xsave family does not always save
208  * and restore all the state that could be covered by the xsave_state. The
209  * instructions all take an argument which is a mask of what to consider. This
210  * is the same mask that will be used in the xstate_bv vector and it is also the
211  * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
212  * considered with the xsaves and xrstors instructions.
213  *
214  * When a save or restore is requested, a bitwise and is performed between the
215  * requested bits and those that have been enabled in %xcr0. Only the bits that
216  * match that are then saved or restored. Others will be silently ignored by
217  * the processor. This idea is used often in the OS. We will always request that
218  * we save and restore all of the state, but only those portions that are
219  * actually enabled in %xcr0 will be touched.
220  *
221  * If a feature has been asked to be restored that is not set in the xstate_bv
222  * feature vector of the save state, then it will be set to its initial state by
223  * the processor (usually zeros). Also, when asked to save state, the processor
224  * may not write out data that is in its initial state as an optimization. This
225  * optimization only applies to saving data and not to restoring data.
226  *
227  * There are a few different variants of the xsave and xrstor instruction. They
228  * are:
229  *
230  *   o xsave	This is the original save instruction. It will save all of the
231  *		requested data in the xsave state structure. It only saves data
232  *		in the uncompressed (xcomp_bv[63] is zero) format. It may be
233  *		executed at all privilege levels.
234  *
235  *   o xrstor	This is the original restore instruction. It will restore all of
236  *		the requested data. The xrstor function can handle both the
237  *		compressed and uncompressed formats. It may be executed at all
238  *		privilege levels.
239  *
240  *   o xsaveopt	This is a variant of the xsave instruction that employs
241  *		optimizations to try and only write out state that has been
242  *		modified since the last time an xrstor instruction was called.
243  *		The processor tracks a tuple of information about the last
244  *		xrstor and tries to ensure that the same buffer is being used
245  *		when this optimization is being used. However, because of the
246  *		way that it tracks the xrstor buffer based on the address of it,
247  *		it is not suitable for use if that buffer can be easily reused.
248  *		The most common case is trying to save data to the stack in
249  *		rtld. It may be executed at all privilege levels.
250  *
251  *   o xsavec	This is a variant of the xsave instruction that writes out the
252  *		compressed form of the xsave_state. Otherwise it behaves as
253  *		xsave. It may be executed at all privilege levels.
254  *
255  *   o xsaves	This is a variant of the xsave instruction. It is similar to
256  *		xsavec in that it always writes the compressed form of the
257  *		buffer. Unlike all the other forms, this instruction looks at
258  *		both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
259  *		what to save and restore. xsaves also implements the same
260  *		optimization that xsaveopt does around modified pieces. User
261  *		land may not execute the instruction.
262  *
263  *   o xrstors	This is a variant of the xrstor instruction. Similar to xsaves
264  *		it operates on both the user and privileged states.
265  *		Unlike xrstor it can only operate on the compressed form.
266  *		User land may not execute the instruction.
267  *
268  * Based on all of these, the kernel has a precedence for what it will use.
269  * Basically, xsaves (not supported) is preferred to xsaveopt, which is
270  * preferred to xsave. A similar scheme is used when informing rtld (more later)
271  * about what it should use. xsavec is preferred to xsave. xsaveopt is not
272  * recommended due to the modified optimization not being appropriate for this
273  * use.
274  *
275  * Finally, there is one last gotcha with the xsave state. Importantly some AMD
276  * processors did not always save and restore some of the FPU exception state in
277  * some cases like Intel did. In those cases the OS will make up for this fact
278  * itself.
279  *
280  * FPU Initialization
281  * ------------------
282  *
283  * One difference with the FPU registers is that not all threads have FPU state,
284  * only those that have an lwp. Generally this means kernel threads, which all
285  * share p0 and its lwp, do not have FPU state. Though there are definitely
286  * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
287  * and lwp interchangeably, just think of thread meaning a thread that has a
288  * lwp.
289  *
290  * Each lwp has its FPU state allocated in its pcb (process control block). The
291  * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
292  * dynamically at start up based on the save mechanism that we're using and the
293  * amount of memory required for it. This is dynamic because the xsave_state
294  * size varies based on the supported feature set.
295  *
296  * The hardware side of the FPU is initialized early in boot before we mount the
297  * root file system. This is effectively done in fpu_probe(). This is where we
298  * make the final decision about what the save and restore mechanisms we should
299  * use are, create the fpsave_cachep kmem cache, and initialize a number of
300  * function pointers that use save and restoring logic.
301  *
302  * The thread/lwp side is a little more involved. There are two different
303  * things that we need to concern ourselves with. The first is how the FPU
304  * resources are allocated and the second is how the FPU state is initialized
305  * for a given lwp.
306  *
307  * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
308  * This is always called unconditionally by the system as part of creating an
309  * LWP.
310  *
311  * There are three different initialization paths that we deal with. The first
312  * is when we are executing a new process. As part of exec all of the register
313  * state is reset. The exec case is particularly important because init is born
314  * like Athena, sprouting from the head of the kernel, without any true parent
315  * to fork from. The second is used whenever we fork or create a new lwp.  The
316  * third is to deal with special lwps like the agent lwp.
317  *
318  * During exec, we will call fp_exec() which will initialize and set up the FPU
319  * state for the process. That will fill in the initial state for the FPU and
320  * also set that state in the FPU itself. As part of fp_exec() we also install a
321  * thread context operations vector that takes care of dealing with the saving
322  * and restoring of the FPU. These context handlers will also be called whenever
323  * an lwp is created or forked. In those cases, to initialize the FPU we will
324  * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
325  * operations vector for the new thread.
326  *
327  * Next we'll end up in the context operation fp_new_lwp(). This saves the
328  * current thread's state, initializes the new thread's state, and copies over
329  * the relevant parts of the originating thread's state. It's at this point that
330  * we also install the FPU context operations into the new thread, which ensures
331  * that all future threads that are descendants of the current one get the
332  * thread context operations (unless they call exec).
333  *
334  * To deal with some things like the agent lwp, we double check the state of the
335  * FPU in sys_rtt_common() to make sure that it has been enabled before
336  * returning to user land. In general, this path should be rare, but it's useful
337  * for the odd lwp here and there.
338  *
339  * The FPU state will remain valid most of the time. There are times that
340  * the state will be rewritten. For example in restorecontext, due to /proc, or
341  * the lwp calls exec(). Whether the context is being freed or we are resetting
342  * the state, we will call fp_free() to disable the FPU and our context.
343  *
344  * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
345  * state by calling fp_lwp_cleanup().
346  *
347  * Kernel FPU Multiplexing
348  * -----------------------
349  *
350  * Just as the kernel has to maintain all of the general purpose registers when
351  * switching between scheduled threads, the same is true of the FPU registers.
352  *
353  * When a thread has FPU state, it also has a set of context operations
354  * installed. These context operations take care of making sure that the FPU is
355  * properly saved and restored during a context switch (fpsave_ctxt and
356  * fprestore_ctxt respectively). This means that the current implementation of
357  * the FPU is 'eager', when a thread is running the CPU will have its FPU state
358  * loaded. While this is always true when executing in userland, there are a few
359  * cases where this is not true in the kernel.
360  *
361  * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
362  * employed. This meant that the FPU would be saved on a context switch and the
363  * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
364  * then take a #NM trap, at which point we would restore the FPU from the save
365  * area and return to user land. Given the frequency of use of the FPU alone by
366  * libc, there's no point returning to user land just to trap again.
367  *
368  * There are a few cases though where the FPU state may need to be changed for a
369  * thread on its behalf. The most notable cases are in the case of processes
370  * using /proc, restorecontext, forking, etc. In all of these cases the kernel
371  * will force a thread's FPU state to be saved into the PCB through the fp_save()
372  * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
373  * pcb. This indicates that the save state holds currently valid data. As a side
374  * effect of this, CR0.TS will be set. To make sure that all of the state is
375  * updated before returning to user land, in these cases, we set a flag on the
376  * PCB that says the FPU needs to be updated. This will make sure that we take
377  * the slow path out of a system call to fix things up for the thread. Due to
378  * the fact that this is a rather rare case, effectively setting the equivalent
379  * of t_postsys is acceptable.
380  *
381  * CR0.TS will be set after a save occurs and cleared when a restore occurs.
382  * Generally this means it will be cleared immediately by the new thread that is
383  * running in a context switch. However, this isn't the case for kernel threads.
384  * They currently operate with CR0.TS set as no kernel state is restored for
385  * them. This means that using the FPU will cause a #NM and panic.
386  *
387  * The FPU_VALID flag on the currently executing thread's pcb is meant to track
388  * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
389  * However, because we eagerly restore, the only time that CR0.TS should be set
390  * for a non-kernel thread is during operations where it will be cleared before
391  * returning to user land and importantly, the only data that is in it is its
392  * own.
393  *
394  * Kernel FPU Usage
395  * ----------------
396  *
397  * Traditionally the kernel never used the FPU since it had no need for
398  * floating point operations. However, modern FPU hardware supports a variety
399  * of SIMD extensions which can speed up code such as parity calculations or
400  * encryption.
401  *
402  * To allow the kernel to take advantage of these features, the
403  * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
404  * around any usage of the FPU by the kernel to ensure that user-level context
405  * is properly saved/restored, as well as to properly setup the FPU for use by
406  * the kernel. There are a variety of ways this wrapping can be used, as
407  * discussed in this section below.
408  *
409  * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
410  * operations, the kernel_fpu_alloc() function should be used to allocate a
411  * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
412  * state. This structure is not tied to any thread. That is, different threads
413  * can reuse the same kfpu_state_t structure, although not concurrently. A
414  * kfpu_state_t structure is freed by the kernel_fpu_free() function.
415  *
416  * In some cases, the kernel may need to use the FPU for a short operation
417  * without the overhead to manage a kfpu_state_t structure and without
418  * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
419  * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
420  * parameter. This indicates that there is no kfpu_state_t. When used this way,
421  * kernel preemption should be disabled by the caller (kpreempt_disable) before
422  * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
423  * For this usage, it is important to limit the kernel's FPU use to short
424  * operations. The tradeoff between using the FPU without a kfpu_state_t
425  * structure vs. the overhead of allowing a context switch while using the FPU
426  * should be carefully considered on a case by case basis.
427  *
428  * In other cases, kernel threads have an LWP, but never execute in user space.
429  * In this situation, the LWP's pcb_fpu area can be used to save/restore the
430  * kernel's FPU state if the thread is context switched, instead of having to
431  * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
432  * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
433  * enable this behavior. It is the caller's responsibility to ensure that this
434  * is only used for a kernel thread which never executes in user space.
435  *
436  * FPU Exceptions
437  * --------------
438  *
439  * Certain operations can cause the kernel to take traps due to FPU activity.
440  * Generally these events will cause a user process to receive a SIGFPE and if
441  * the kernel receives it in kernel context, we will die. Traditionally the #NM
442  * (Device Not Available / No Math) exception generated by CR0.TS would have
443  * caused us to restore the FPU. Now it is a fatal event regardless of whether
444  * or not user land causes it.
445  *
446  * While there are some cases where the kernel uses the FPU, it is up to the
447  * kernel to use the FPU in a way such that it cannot receive a trap or to use
448  * the appropriate trap protection mechanisms.
449  *
450  * Hypervisors
451  * -----------
452  *
453  * When providing support for hypervisors things are a little bit more
454  * complicated because the FPU is not virtualized at all. This means that they
455  * need to save and restore the FPU and %xcr0 across entry and exit to the
456  * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
457  * allow us to use the full native state to make sure that we are always saving
458  * and restoring the full FPU that the host sees, even when the guest is using a
459  * subset.
460  *
461  * One tricky aspect of this is that the guest may be using a subset of %xcr0
462  * and therefore changing our %xcr0 on the fly. It is vital that when we're
463  * saving and restoring the FPU that we always use the largest %xcr0 contents
464  * otherwise we will end up leaving behind data in it.
465  *
466  * ELF PLT Support
467  * ---------------
468  *
469  * rtld has to preserve a subset of the FPU when it is saving and restoring
470  * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
471  * more information. As a result, we set up an aux vector that contains
472  * information about what save and restore mechanisms it should be using and
473  * the sizing thereof based on what the kernel supports. This is passed down in
474  * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
475  * initialized in fpu_subr.c.
476  */
477 
478 kmem_cache_t *fpsave_cachep;
479 
480 /* Legacy fxsave layout + xsave header + ymm */
481 #define	AVX_XSAVE_SIZE		(512 + 64 + 256)
482 
483 /*
484  * Various sanity checks.
485  */
486 CTASSERT(sizeof (struct fxsave_state) == 512);
487 CTASSERT(sizeof (struct fnsave_state) == 108);
488 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
489 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
490 
491 /*
492  * This structure is the x86 implementation of the kernel FPU that is defined in
493  * uts/common/sys/kfpu.h.
494  */
495 
496 typedef enum kfpu_flags {
497 	/*
498 	 * This indicates that the save state has initial FPU data.
499 	 */
500 	KFPU_F_INITIALIZED = 0x01
501 } kfpu_flags_t;
502 
503 struct kfpu_state {
504 	fpu_ctx_t	kfpu_ctx;
505 	kfpu_flags_t	kfpu_flags;
506 	kthread_t	*kfpu_curthread;
507 };
508 
509 /*
510  * Initial kfpu state for SSE/SSE2 used by fpinit()
511  */
512 const struct fxsave_state sse_initial = {
513 	FPU_CW_INIT,	/* fx_fcw */
514 	0,		/* fx_fsw */
515 	0,		/* fx_fctw */
516 	0,		/* fx_fop */
517 	0,		/* fx_rip */
518 	0,		/* fx_rdp */
519 	SSE_MXCSR_INIT	/* fx_mxcsr */
520 	/* rest of structure is zero */
521 };
522 
523 /*
524  * Initial kfpu state for AVX used by fpinit()
525  */
526 const struct xsave_state avx_initial = {
527 	/*
528 	 * The definition below needs to be identical with sse_initial
529 	 * defined above.
530 	 */
531 	{
532 		FPU_CW_INIT,	/* fx_fcw */
533 		0,		/* fx_fsw */
534 		0,		/* fx_fctw */
535 		0,		/* fx_fop */
536 		0,		/* fx_rip */
537 		0,		/* fx_rdp */
538 		SSE_MXCSR_INIT	/* fx_mxcsr */
539 		/* rest of structure is zero */
540 	},
541 	/*
542 	 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid,
543 	 * and CPU should initialize XMM/YMM.
544 	 */
545 	1,
546 	0	/* xs_xcomp_bv */
547 	/* rest of structure is zero */
548 };
549 
550 /*
551  * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
552  * the #gp exception caused by setting unsupported bits in the
553  * MXCSR register
554  */
555 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
556 
557 /*
558  * Initial kfpu state for x87 used by fpinit()
559  */
560 const struct fnsave_state x87_initial = {
561 	FPU_CW_INIT,	/* f_fcw */
562 	0,		/* __f_ign0 */
563 	0,		/* f_fsw */
564 	0,		/* __f_ign1 */
565 	0xffff,		/* f_ftw */
566 	/* rest of structure is zero */
567 };
568 
569 /*
570  * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
571  * have an XSAVE-capable chip in fpu_probe.
572  */
573 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
574 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
575 
576 /*
577  * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
578  */
579 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
580 
581 static int fpe_sicode(uint_t);
582 static int fpe_simd_sicode(uint_t);
583 static void fp_new_lwp(void *, void *);
584 static void fp_free_ctx(void *, int);
585 
586 static struct ctxop *
587 fp_ctxop_allocate(struct fpu_ctx *fp)
588 {
589 	const struct ctxop_template tpl = {
590 		.ct_rev		= CTXOP_TPL_REV,
591 		.ct_save	= fpsave_ctxt,
592 		.ct_restore	= fprestore_ctxt,
593 		.ct_fork	= fp_new_lwp,
594 		.ct_lwp_create	= fp_new_lwp,
595 		.ct_free	= fp_free_ctx,
596 	};
597 	return (ctxop_allocate(&tpl, fp));
598 }
599 
600 /*
601  * Copy the state of parent lwp's floating point context into the new lwp.
602  * Invoked for both fork() and lwp_create().
603  *
604  * Note that we inherit -only- the control state (e.g. exception masks,
605  * rounding, precision control, etc.); the FPU registers are otherwise
606  * reset to their initial state.
607  */
608 static void
609 fp_new_lwp(void *parent, void *child)
610 {
611 	kthread_id_t t = parent, ct = child;
612 	struct fpu_ctx *fp;		/* parent fpu context */
613 	struct fpu_ctx *cfp;		/* new fpu context */
614 	struct fxsave_state *fx, *cfx;
615 	struct xsave_state *cxs;
616 
617 	ASSERT(fp_kind != FP_NO);
618 
619 	fp = &t->t_lwp->lwp_pcb.pcb_fpu;
620 	cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
621 
622 	/*
623 	 * If the parent FPU state is still in the FPU hw then save it;
624 	 * conveniently, fp_save() already does this for us nicely.
625 	 */
626 	fp_save(fp);
627 
628 	cfp->fpu_flags = FPU_EN | FPU_VALID;
629 	cfp->fpu_regs.kfpu_status = 0;
630 	cfp->fpu_regs.kfpu_xstatus = 0;
631 
632 	/*
633 	 * Make sure that the child's FPU is cleaned up and made ready for user
634 	 * land.
635 	 */
636 	PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
637 
638 	switch (fp_save_mech) {
639 	case FP_FXSAVE:
640 		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
641 		cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
642 		bcopy(&sse_initial, cfx, sizeof (*cfx));
643 		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
644 		cfx->fx_fcw = fx->fx_fcw;
645 		break;
646 
647 	case FP_XSAVE:
648 		cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
649 
650 		VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
651 
652 		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
653 		cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
654 		cfx = &cxs->xs_fxsave;
655 
656 		bcopy(&avx_initial, cxs, sizeof (*cxs));
657 		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
658 		cfx->fx_fcw = fx->fx_fcw;
659 		cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) &
660 		    XFEATURE_FP_INITIAL);
661 		break;
662 	default:
663 		panic("Invalid fp_save_mech");
664 		/*NOTREACHED*/
665 	}
666 
667 	/*
668 	 * Mark that both the parent and child need to have the FPU cleaned up
669 	 * before returning to user land.
670 	 */
671 
672 	ctxop_attach(ct, fp_ctxop_allocate(cfp));
673 }
674 
675 /*
676  * Free any state associated with floating point context.
677  * Fp_free can be called in three cases:
678  * 1) from reaper -> thread_free -> freectx-> fp_free
679  *	fp context belongs to a thread on deathrow
680  *	nothing to do,  thread will never be resumed
681  *	thread calling ctxfree is reaper
682  *
683  * 2) from exec -> freectx -> fp_free
684  *	fp context belongs to the current thread
685  *	must disable fpu, thread calling ctxfree is curthread
686  *
687  * 3) from restorecontext -> setfpregs -> fp_free
688  *	we have a modified context in the memory (lwp->pcb_fpu)
689  *	disable fpu and release the fp context for the CPU
690  *
691  */
692 void
693 fp_free(struct fpu_ctx *fp)
694 {
695 	ASSERT(fp_kind != FP_NO);
696 
697 	if (fp->fpu_flags & FPU_VALID)
698 		return;
699 
700 	kpreempt_disable();
701 	/*
702 	 * We want to do fpsave rather than fpdisable so that we can
703 	 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
704 	 */
705 	fp->fpu_flags |= FPU_VALID;
706 	/* If for current thread disable FP to track FPU_VALID */
707 	if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
708 		/* Clear errors if any to prevent frstor from complaining */
709 		(void) fperr_reset();
710 		if (fp_kind & __FP_SSE)
711 			(void) fpxerr_reset();
712 		fpdisable();
713 	}
714 	kpreempt_enable();
715 }
716 
717 /*
718  * Wrapper for freectx to make the types line up for fp_free()
719  */
720 static void
721 fp_free_ctx(void *arg, int isexec __unused)
722 {
723 	fp_free((struct fpu_ctx *)arg);
724 }
725 
726 /*
727  * Store the floating point state and disable the floating point unit.
728  */
729 void
730 fp_save(struct fpu_ctx *fp)
731 {
732 	ASSERT(fp_kind != FP_NO);
733 
734 	kpreempt_disable();
735 	if (!fp || fp->fpu_flags & FPU_VALID ||
736 	    (fp->fpu_flags & FPU_EN) == 0) {
737 		kpreempt_enable();
738 		return;
739 	}
740 	ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
741 
742 	switch (fp_save_mech) {
743 	case FP_FXSAVE:
744 		fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
745 		break;
746 
747 	case FP_XSAVE:
748 		xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
749 		break;
750 	default:
751 		panic("Invalid fp_save_mech");
752 		/*NOTREACHED*/
753 	}
754 
755 	fp->fpu_flags |= FPU_VALID;
756 
757 	/*
758 	 * We save the FPU as part of forking, execing, modifications via /proc,
759 	 * restorecontext, etc. As such, we need to make sure that we return to
760 	 * userland with valid state in the FPU. If we're context switched out
761 	 * before we hit sys_rtt_common() we'll end up having restored the FPU
762 	 * as part of the context ops operations. The restore logic always makes
763 	 * sure that FPU_VALID is set before doing a restore so we don't restore
764 	 * it a second time.
765 	 */
766 	PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
767 
768 	kpreempt_enable();
769 }
770 
771 /*
772  * Restore the FPU context for the thread:
773  * The possibilities are:
774  *	1. No active FPU context: Load the new context into the FPU hw
775  *	   and enable the FPU.
776  */
777 void
778 fp_restore(struct fpu_ctx *fp)
779 {
780 	switch (fp_save_mech) {
781 	case FP_FXSAVE:
782 		fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
783 		break;
784 
785 	case FP_XSAVE:
786 		xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
787 		break;
788 	default:
789 		panic("Invalid fp_save_mech");
790 		/*NOTREACHED*/
791 	}
792 
793 	fp->fpu_flags &= ~FPU_VALID;
794 }
795 
796 /*
797  * Reset the FPU such that it is in a valid state for a new thread that is
798  * coming out of exec. The FPU will be in a usable state at this point. At this
799  * point we know that the FPU state has already been allocated and if this
800  * wasn't an init process, then it will have had fp_free() previously called.
801  */
802 void
803 fp_exec(void)
804 {
805 	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
806 
807 	if (fp_save_mech == FP_XSAVE) {
808 		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
809 	}
810 
811 	struct ctxop *ctx = fp_ctxop_allocate(fp);
812 	/*
813 	 * Make sure that we're not preempted in the middle of initializing the
814 	 * FPU on CPU.
815 	 */
816 	kpreempt_disable();
817 	ctxop_attach(curthread, ctx);
818 	fpinit();
819 	fp->fpu_flags = FPU_EN;
820 	kpreempt_enable();
821 }
822 
823 
824 /*
825  * Seeds the initial state for the current thread.  The possibilities are:
826  *      1. Another process has modified the FPU state before we have done any
827  *         initialization: Load the FPU state from the LWP state.
828  *      2. The FPU state has not been externally modified:  Load a clean state.
829  */
830 void
831 fp_seed(void)
832 {
833 	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
834 
835 	ASSERT(curthread->t_preempt >= 1);
836 	ASSERT((fp->fpu_flags & FPU_EN) == 0);
837 
838 	/*
839 	 * Always initialize a new context and initialize the hardware.
840 	 */
841 	if (fp_save_mech == FP_XSAVE) {
842 		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
843 	}
844 
845 	ctxop_attach(curthread, fp_ctxop_allocate(fp));
846 	fpinit();
847 
848 	/*
849 	 * If FPU_VALID is set, it means someone has modified registers via
850 	 * /proc.  In this case, restore the current lwp's state.
851 	 */
852 	if (fp->fpu_flags & FPU_VALID)
853 		fp_restore(fp);
854 
855 	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
856 	fp->fpu_flags = FPU_EN;
857 }
858 
859 /*
860  * When using xsave/xrstor, these three functions are used by the lwp code to
861  * manage the memory for the xsave area.
862  */
863 void
864 fp_lwp_init(struct _klwp *lwp)
865 {
866 	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
867 
868 	/*
869 	 * We keep a copy of the pointer in lwp_fpu so that we can restore the
870 	 * value in forklwp() after we duplicate the parent's LWP state.
871 	 */
872 	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
873 	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
874 
875 	if (fp_save_mech == FP_XSAVE) {
876 		/*
877 		 *
878 		 * We bzero since the fpinit() code path will only
879 		 * partially initialize the xsave area using avx_inital.
880 		 */
881 		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
882 		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
883 	}
884 }
885 
886 void
887 fp_lwp_cleanup(struct _klwp *lwp)
888 {
889 	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
890 
891 	if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
892 		kmem_cache_free(fpsave_cachep,
893 		    fp->fpu_regs.kfpu_u.kfpu_generic);
894 		lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
895 	}
896 }
897 
898 /*
899  * Called during the process of forklwp(). The kfpu_u pointer will have been
900  * overwritten while copying the parent's LWP structure. We have a valid copy
901  * stashed in the child's lwp_fpu which we use to restore the correct value.
902  */
903 void
904 fp_lwp_dup(struct _klwp *lwp)
905 {
906 	void *xp = lwp->lwp_fpu;
907 	size_t sz;
908 
909 	switch (fp_save_mech) {
910 	case FP_FXSAVE:
911 		sz = sizeof (struct fxsave_state);
912 		break;
913 	case FP_XSAVE:
914 		sz = cpuid_get_xsave_size();
915 		break;
916 	default:
917 		panic("Invalid fp_save_mech");
918 		/*NOTREACHED*/
919 	}
920 
921 	/* copy the parent's values into the new lwp's struct */
922 	bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
923 	/* now restore the pointer */
924 	lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
925 }
926 
927 /*
928  * Handle a processor extension error fault
929  * Returns non zero for error.
930  */
931 
932 /*ARGSUSED*/
933 int
934 fpexterrflt(struct regs *rp)
935 {
936 	uint32_t fpcw, fpsw;
937 	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
938 
939 	ASSERT(fp_kind != FP_NO);
940 
941 	/*
942 	 * Now we can enable the interrupts.
943 	 * (NOTE: x87 fp exceptions come thru interrupt gate)
944 	 */
945 	sti();
946 
947 	if (!fpu_exists)
948 		return (FPE_FLTINV);
949 
950 	/*
951 	 * Do an unconditional save of the FP state.  If it's dirty (TS=0),
952 	 * it'll be saved into the fpu context area passed in (that of the
953 	 * current thread).  If it's not dirty (it may not be, due to
954 	 * an intervening save due to a context switch between the sti(),
955 	 * above and here, then it's safe to just use the stored values in
956 	 * the context save area to determine the cause of the fault.
957 	 */
958 	fp_save(fp);
959 
960 	/* clear exception flags in saved state, as if by fnclex */
961 	switch (fp_save_mech) {
962 	case FP_FXSAVE:
963 		fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
964 		fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
965 		fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
966 		break;
967 
968 	case FP_XSAVE:
969 		fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
970 		fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
971 		fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
972 		/*
973 		 * Always set LEGACY_FP as it may have been cleared by XSAVE
974 		 * instruction
975 		 */
976 		fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
977 		break;
978 	default:
979 		panic("Invalid fp_save_mech");
980 		/*NOTREACHED*/
981 	}
982 
983 	fp->fpu_regs.kfpu_status = fpsw;
984 
985 	if ((fpsw & FPS_ES) == 0)
986 		return (0);		/* No exception */
987 
988 	/*
989 	 * "and" the exception flags with the complement of the mask
990 	 * bits to determine which exception occurred
991 	 */
992 	return (fpe_sicode(fpsw & ~fpcw & 0x3f));
993 }
994 
995 /*
996  * Handle an SSE/SSE2 precise exception.
997  * Returns a non-zero sicode for error.
998  */
999 /*ARGSUSED*/
1000 int
1001 fpsimderrflt(struct regs *rp)
1002 {
1003 	uint32_t mxcsr, xmask;
1004 	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1005 
1006 	ASSERT(fp_kind & __FP_SSE);
1007 
1008 	/*
1009 	 * NOTE: Interrupts are disabled during execution of this
1010 	 * function.  They are enabled by the caller in trap.c.
1011 	 */
1012 
1013 	/*
1014 	 * The only way we could have gotten here if there is no FP unit
1015 	 * is via a user executing an INT $19 instruction, so there is
1016 	 * no fault in that case.
1017 	 */
1018 	if (!fpu_exists)
1019 		return (0);
1020 
1021 	/*
1022 	 * Do an unconditional save of the FP state.  If it's dirty (TS=0),
1023 	 * it'll be saved into the fpu context area passed in (that of the
1024 	 * current thread).  If it's not dirty, then it's safe to just use
1025 	 * the stored values in the context save area to determine the
1026 	 * cause of the fault.
1027 	 */
1028 	fp_save(fp);		/* save the FPU state */
1029 
1030 	if (fp_save_mech == FP_XSAVE) {
1031 		mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1032 		fp->fpu_regs.kfpu_status =
1033 		    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1034 	} else {
1035 		mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1036 		fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1037 	}
1038 	fp->fpu_regs.kfpu_xstatus = mxcsr;
1039 
1040 	/*
1041 	 * compute the mask that determines which conditions can cause
1042 	 * a #xm exception, and use this to clean the status bits so that
1043 	 * we can identify the true cause of this one.
1044 	 */
1045 	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1046 	return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1047 }
1048 
1049 /*
1050  * In the unlikely event that someone is relying on this subcode being
1051  * FPE_FLTILL for denormalize exceptions, it can always be patched back
1052  * again to restore old behaviour.
1053  */
1054 int fpe_fltden = FPE_FLTDEN;
1055 
1056 /*
1057  * Map from the FPU status word to the FP exception si_code.
1058  */
1059 static int
1060 fpe_sicode(uint_t sw)
1061 {
1062 	if (sw & FPS_IE)
1063 		return (FPE_FLTINV);
1064 	if (sw & FPS_ZE)
1065 		return (FPE_FLTDIV);
1066 	if (sw & FPS_DE)
1067 		return (fpe_fltden);
1068 	if (sw & FPS_OE)
1069 		return (FPE_FLTOVF);
1070 	if (sw & FPS_UE)
1071 		return (FPE_FLTUND);
1072 	if (sw & FPS_PE)
1073 		return (FPE_FLTRES);
1074 	return (FPE_FLTINV);	/* default si_code for other exceptions */
1075 }
1076 
1077 /*
1078  * Map from the SSE status word to the FP exception si_code.
1079  */
1080 static int
1081 fpe_simd_sicode(uint_t sw)
1082 {
1083 	if (sw & SSE_IE)
1084 		return (FPE_FLTINV);
1085 	if (sw & SSE_ZE)
1086 		return (FPE_FLTDIV);
1087 	if (sw & SSE_DE)
1088 		return (FPE_FLTDEN);
1089 	if (sw & SSE_OE)
1090 		return (FPE_FLTOVF);
1091 	if (sw & SSE_UE)
1092 		return (FPE_FLTUND);
1093 	if (sw & SSE_PE)
1094 		return (FPE_FLTRES);
1095 	return (FPE_FLTINV);	/* default si_code for other exceptions */
1096 }
1097 
1098 /*
1099  * This routine is invoked as part of libc's __fpstart implementation
1100  * via sysi86(2).
1101  *
1102  * It may be called -before- any context has been assigned in which case
1103  * we try and avoid touching the hardware.  Or it may be invoked well
1104  * after the context has been assigned and fiddled with, in which case
1105  * just tweak it directly.
1106  */
1107 void
1108 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1109 {
1110 	struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1111 	struct fxsave_state *fx;
1112 
1113 	if (!fpu_exists || fp_kind == FP_NO)
1114 		return;
1115 
1116 	if ((fp->fpu_flags & FPU_EN) == 0) {
1117 		if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1118 			/*
1119 			 * Common case.  Floating point unit not yet
1120 			 * enabled, and kernel already intends to initialize
1121 			 * the hardware the way the caller wants.
1122 			 */
1123 			return;
1124 		}
1125 		/*
1126 		 * Hmm.  Userland wants a different default.
1127 		 * Do a fake "first trap" to establish the context, then
1128 		 * handle as if we already had a context before we came in.
1129 		 */
1130 		kpreempt_disable();
1131 		fp_seed();
1132 		kpreempt_enable();
1133 	}
1134 
1135 	/*
1136 	 * Ensure that the current hardware state is flushed back to the
1137 	 * pcb, then modify that copy.  Next use of the fp will
1138 	 * restore the context.
1139 	 */
1140 	fp_save(fp);
1141 
1142 	switch (fp_save_mech) {
1143 	case FP_FXSAVE:
1144 		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1145 		fx->fx_fcw = fcw;
1146 		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1147 		break;
1148 
1149 	case FP_XSAVE:
1150 		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1151 		fx->fx_fcw = fcw;
1152 		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1153 		/*
1154 		 * Always set LEGACY_FP as it may have been cleared by XSAVE
1155 		 * instruction
1156 		 */
1157 		fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
1158 		break;
1159 	default:
1160 		panic("Invalid fp_save_mech");
1161 		/*NOTREACHED*/
1162 	}
1163 }
1164 
1165 static void
1166 kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
1167 {
1168 	struct xsave_state *xs;
1169 
1170 	switch (fp_save_mech) {
1171 	case FP_FXSAVE:
1172 		bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
1173 		    sizeof (struct fxsave_state));
1174 		kfpu->kfpu_ctx.fpu_xsave_mask = 0;
1175 		break;
1176 	case FP_XSAVE:
1177 		xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
1178 		bzero(xs, cpuid_get_xsave_size());
1179 		bcopy(&avx_initial, xs, sizeof (*xs));
1180 		xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
1181 		kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
1182 		break;
1183 	default:
1184 		panic("invalid fp_save_mech");
1185 	}
1186 
1187 	/*
1188 	 * Set the corresponding flags that the system expects on the FPU state
1189 	 * to indicate that this is our state. The FPU_EN flag is required to
1190 	 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
1191 	 * not set below as it represents that this state is being suppressed
1192 	 * by the kernel.
1193 	 */
1194 	kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
1195 	kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
1196 }
1197 
1198 kfpu_state_t *
1199 kernel_fpu_alloc(int kmflags)
1200 {
1201 	kfpu_state_t *kfpu;
1202 
1203 	if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1204 		return (NULL);
1205 	}
1206 
1207 	kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1208 	    kmem_cache_alloc(fpsave_cachep, kmflags);
1209 	if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1210 		kmem_free(kfpu, sizeof (kfpu_state_t));
1211 		return (NULL);
1212 	}
1213 
1214 	kernel_fpu_fpstate_init(kfpu);
1215 
1216 	return (kfpu);
1217 }
1218 
1219 void
1220 kernel_fpu_free(kfpu_state_t *kfpu)
1221 {
1222 	kmem_cache_free(fpsave_cachep,
1223 	    kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1224 	kmem_free(kfpu, sizeof (kfpu_state_t));
1225 }
1226 
1227 static void
1228 kernel_fpu_ctx_save(void *arg)
1229 {
1230 	kfpu_state_t *kfpu = arg;
1231 	fpu_ctx_t *pf;
1232 
1233 	if (kfpu == NULL) {
1234 		/*
1235 		 * A NULL kfpu implies this is a kernel thread with an LWP and
1236 		 * no user-level FPU usage. Use the lwp fpu save area.
1237 		 */
1238 		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1239 
1240 		ASSERT(curthread->t_procp->p_flag & SSYS);
1241 		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1242 
1243 		fp_save(pf);
1244 	} else {
1245 		pf = &kfpu->kfpu_ctx;
1246 
1247 		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1248 		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
1249 
1250 		/*
1251 		 * Note, we can't use fp_save because it assumes that we're
1252 		 * saving to the thread's PCB and not somewhere else. Because
1253 		 * this is a different FPU context, we instead have to do this
1254 		 * ourselves.
1255 		 */
1256 		switch (fp_save_mech) {
1257 		case FP_FXSAVE:
1258 			fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
1259 			break;
1260 		case FP_XSAVE:
1261 			xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
1262 			break;
1263 		default:
1264 			panic("Invalid fp_save_mech");
1265 		}
1266 
1267 		/*
1268 		 * Because we have saved context here, our save state is no
1269 		 * longer valid and therefore needs to be reinitialized.
1270 		 */
1271 		kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
1272 	}
1273 
1274 	pf->fpu_flags |= FPU_VALID;
1275 
1276 	/*
1277 	 * Clear KFPU flag. This allows swtch to check for improper kernel
1278 	 * usage of the FPU (i.e. switching to a new thread while the old
1279 	 * thread was in the kernel and using the FPU, but did not perform a
1280 	 * context save).
1281 	 */
1282 	curthread->t_flag &= ~T_KFPU;
1283 }
1284 
1285 static void
1286 kernel_fpu_ctx_restore(void *arg)
1287 {
1288 	kfpu_state_t *kfpu = arg;
1289 	fpu_ctx_t *pf;
1290 
1291 	if (kfpu == NULL) {
1292 		/*
1293 		 * A NULL kfpu implies this is a kernel thread with an LWP and
1294 		 * no user-level FPU usage. Use the lwp fpu save area.
1295 		 */
1296 		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1297 
1298 		ASSERT(curthread->t_procp->p_flag & SSYS);
1299 		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1300 	} else {
1301 		pf = &kfpu->kfpu_ctx;
1302 
1303 		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
1304 		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
1305 	}
1306 
1307 	fp_restore(pf);
1308 	curthread->t_flag |= T_KFPU;
1309 }
1310 
1311 /*
1312  * Validate that the thread is not switching off-cpu while actively using the
1313  * FPU within the kernel.
1314  */
1315 void
1316 kernel_fpu_no_swtch(void)
1317 {
1318 	if ((curthread->t_flag & T_KFPU) != 0) {
1319 		panic("curthread swtch-ing while the kernel is using the FPU");
1320 	}
1321 }
1322 
1323 static const struct ctxop_template kfpu_ctxop_tpl = {
1324 	.ct_rev		= CTXOP_TPL_REV,
1325 	.ct_save	= kernel_fpu_ctx_save,
1326 	.ct_restore	= kernel_fpu_ctx_restore,
1327 };
1328 
1329 void
1330 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
1331 {
1332 	klwp_t *pl = curthread->t_lwp;
1333 	struct ctxop *ctx;
1334 
1335 	if ((curthread->t_flag & T_KFPU) != 0) {
1336 		panic("curthread attempting to nest kernel FPU states");
1337 	}
1338 
1339 	/* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
1340 	ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
1341 	    (KFPU_USE_LWP | KFPU_NO_STATE));
1342 
1343 	if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
1344 		/*
1345 		 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
1346 		 * hold our kernel FPU context, we depend on the caller doing
1347 		 * kpreempt_disable for the duration of our FPU usage. This
1348 		 * should only be done for very short periods of time.
1349 		 */
1350 		ASSERT(curthread->t_preempt > 0);
1351 		ASSERT(kfpu == NULL);
1352 
1353 		if (pl != NULL) {
1354 			/*
1355 			 * We might have already saved once so FPU_VALID could
1356 			 * be set. This is handled in fp_save.
1357 			 */
1358 			fp_save(&pl->lwp_pcb.pcb_fpu);
1359 			pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1360 		}
1361 
1362 		curthread->t_flag |= T_KFPU;
1363 
1364 		/* Always restore the fpu to the initial state. */
1365 		fpinit();
1366 
1367 		return;
1368 	}
1369 
1370 	/*
1371 	 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
1372 	 */
1373 
1374 	if ((flags & KFPU_USE_LWP) == 0) {
1375 		if (kfpu->kfpu_curthread != NULL)
1376 			panic("attempting to reuse kernel FPU state at %p when "
1377 			    "another thread already is using", kfpu);
1378 
1379 		if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
1380 			kernel_fpu_fpstate_init(kfpu);
1381 
1382 		kfpu->kfpu_curthread = curthread;
1383 	}
1384 
1385 	/*
1386 	 * Not all threads may have an active LWP. If they do and we're not
1387 	 * going to re-use the LWP, then we should go ahead and save the state.
1388 	 * We must also note that the fpu is now being used by the kernel and
1389 	 * therefore we do not want to manage the fpu state via the user-level
1390 	 * thread's context handlers.
1391 	 *
1392 	 * We might have already saved once (due to a prior use of the kernel
1393 	 * FPU or another code path) so FPU_VALID could be set. This is handled
1394 	 * by fp_save, as is the FPU_EN check.
1395 	 */
1396 	ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
1397 	kpreempt_disable();
1398 	if (pl != NULL) {
1399 		if ((flags & KFPU_USE_LWP) == 0)
1400 			fp_save(&pl->lwp_pcb.pcb_fpu);
1401 		pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
1402 	}
1403 
1404 	/*
1405 	 * Set the context operations for kernel FPU usage.  Because kernel FPU
1406 	 * setup and ctxop attachment needs to happen under the protection of
1407 	 * kpreempt_disable(), we allocate the ctxop outside the guard so its
1408 	 * sleeping allocation will not cause a voluntary swtch().  This allows
1409 	 * the rest of the initialization to proceed, ensuring valid state for
1410 	 * the ctxop handlers.
1411 	 */
1412 	ctxop_attach(curthread, ctx);
1413 	curthread->t_flag |= T_KFPU;
1414 
1415 	if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
1416 		/*
1417 		 * For pure kernel threads with an LWP, we can use the LWP's
1418 		 * pcb_fpu to save/restore context.
1419 		 */
1420 		fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
1421 
1422 		VERIFY(curthread->t_procp->p_flag & SSYS);
1423 		VERIFY(kfpu == NULL);
1424 		ASSERT((pf->fpu_flags & FPU_EN) == 0);
1425 
1426 		/* Always restore the fpu to the initial state. */
1427 		if (fp_save_mech == FP_XSAVE)
1428 			pf->fpu_xsave_mask = XFEATURE_FP_ALL;
1429 		fpinit();
1430 		pf->fpu_flags = FPU_EN | FPU_KERNEL;
1431 	} else {
1432 		/* initialize the kfpu state */
1433 		kernel_fpu_ctx_restore(kfpu);
1434 	}
1435 	kpreempt_enable();
1436 }
1437 
1438 void
1439 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
1440 {
1441 	ulong_t iflags;
1442 
1443 	if ((curthread->t_flag & T_KFPU) == 0) {
1444 		panic("curthread attempting to clear kernel FPU state "
1445 		    "without using it");
1446 	}
1447 
1448 	/*
1449 	 * General comments on why the rest of this function is structured the
1450 	 * way it is. Be aware that there is a lot of subtlety here.
1451 	 *
1452 	 * If a user-level thread ever uses the fpu while in the kernel, then
1453 	 * we cannot call fpdisable since that does STTS. That will set the
1454 	 * ts bit in %cr0 which will cause an exception if anything touches the
1455 	 * fpu. However, the user-level context switch handler (fpsave_ctxt)
1456 	 * needs to access the fpu to save the registers into the pcb.
1457 	 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
1458 	 * fprestore_ctxt when the thread context switched onto the CPU.
1459 	 *
1460 	 * Calling fpdisable only effects the current CPU's %cr0 register.
1461 	 *
1462 	 * During ctxop_remove and kpreempt_enable, we can voluntarily context
1463 	 * switch, so the CPU we were on when we entered this function might
1464 	 * not be the same one we're on when we return from ctxop_remove or end
1465 	 * the function. Note there can be user-level context switch handlers
1466 	 * still installed if this is a user-level thread.
1467 	 *
1468 	 * We also must be careful in the unlikely chance we're running in an
1469 	 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
1470 	 * incorrectly for the "real" thread to resume on this CPU.
1471 	 */
1472 
1473 	if ((flags & KFPU_NO_STATE) == 0) {
1474 		kpreempt_disable();
1475 	} else {
1476 		ASSERT(curthread->t_preempt > 0);
1477 	}
1478 
1479 	curthread->t_flag &= ~T_KFPU;
1480 
1481 	/*
1482 	 * When we are ending things, we explicitly don't save the current
1483 	 * kernel FPU state back to the temporary state. The kfpu API is not
1484 	 * intended to be a permanent save location.
1485 	 *
1486 	 * If this is a user-level thread and we were to context switch
1487 	 * before returning to user-land, fpsave_ctxt will be a no-op since we
1488 	 * already saved the user-level FPU state the first time we run
1489 	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
1490 	 * the user-level fpu state). The fpsave_ctxt functions only save if
1491 	 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
1492 	 * fprestore_ctxt will be done in sys_rtt_common when the thread
1493 	 * finally returns to user-land.
1494 	 */
1495 
1496 	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
1497 	    curthread->t_intr == NULL) {
1498 		/*
1499 		 * A kernel thread which is not an interrupt thread, so we
1500 		 * STTS now.
1501 		 */
1502 		fpdisable();
1503 	}
1504 
1505 	if ((flags & KFPU_NO_STATE) == 0) {
1506 		ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);
1507 
1508 		if (kfpu != NULL) {
1509 			if (kfpu->kfpu_curthread != curthread) {
1510 				panic("attempting to end kernel FPU state "
1511 				    "for %p, but active thread is not "
1512 				    "curthread", kfpu);
1513 			} else {
1514 				kfpu->kfpu_curthread = NULL;
1515 			}
1516 		}
1517 
1518 		kpreempt_enable();
1519 	}
1520 
1521 	if (curthread->t_lwp != NULL) {
1522 		uint_t f;
1523 
1524 		if (flags & KFPU_USE_LWP) {
1525 			f = FPU_EN | FPU_KERNEL;
1526 		} else {
1527 			f = FPU_KERNEL;
1528 		}
1529 		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
1530 	}
1531 }
1532