xref: /illumos-gate/usr/src/uts/intel/os/fpu.c (revision 75840da35ecec00345f7f5f5d85a1f19fae4bd26)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2021 Joyent, Inc.
24  * Copyright 2021 RackTop Systems, Inc.
25  * Copyright 2023 Oxide Computer Company
26  * Copyright 2025 Edgecast Cloud LLC.
27  */
28 
29 /*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
30 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
31 /*		All Rights Reserved				*/
32 
33 /*	Copyright (c) 1987, 1988 Microsoft Corporation		*/
34 /*		All Rights Reserved				*/
35 
36 /*
37  * Copyright (c) 2009, Intel Corporation.
38  * All rights reserved.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/param.h>
43 #include <sys/signal.h>
44 #include <sys/regset.h>
45 #include <sys/privregs.h>
46 #include <sys/psw.h>
47 #include <sys/trap.h>
48 #include <sys/fault.h>
49 #include <sys/systm.h>
50 #include <sys/user.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/pcb.h>
54 #include <sys/lwp.h>
55 #include <sys/cpuvar.h>
56 #include <sys/thread.h>
57 #include <sys/disp.h>
58 #include <sys/fp.h>
59 #include <sys/siginfo.h>
60 #include <sys/archsystm.h>
61 #include <sys/kmem.h>
62 #include <sys/debug.h>
63 #include <sys/x86_archext.h>
64 #include <sys/sysmacros.h>
65 #include <sys/cmn_err.h>
66 #include <sys/kfpu.h>
67 #include <sys/stdbool.h>
68 #include <sys/stdalign.h>
69 #include <sys/procfs_isa.h>
70 #include <sys/sunddi.h>
71 
72 /*
73  * FPU Management Overview
74  * -----------------------
75  *
76  * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
77  * however, many aspects of its life as a coprocessor are still around in x86.
78  *
79  * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
80  * While that state still exists, there is much more that is covered by the FPU.
81  * Today, this includes not just traditional FPU state, but also supervisor only
82  * state. The following state is currently managed and covered logically by the
83  * idea of the FPU registers and more generally is called the Extended Processor
84  * States:
85  *
86  *    o Traditional x87 FPU
87  *    o Vector Registers (%xmm, %ymm, %zmm)
88  *    o Memory Protection Extensions (MPX) Bounds Registers
89  *    o Protected Key Rights Registers (PKRU)
90  *    o Processor Trace data
91  *    o Control-Flow Enforcement state
92  *    o Hardware Duty Cycle
93  *    o Hardware P-states
94  *
95  * The rest of this covers how the FPU is managed and controlled, how state is
96  * saved and restored between threads, interactions with hypervisors, and other
97  * information exported to userland through aux vectors. A lot of background
98  * information is here to synthesize major parts of the Intel SDM, but
99  * unfortunately, it is not a replacement for reading it.
100  *
101  * FPU Control Registers
102  * ---------------------
103  *
104  * Because the x87 FPU began its life as a co-processor and the FPU was
105  * optional there are several bits that show up in %cr0 that we have to
106  * manipulate when dealing with the FPU. These are:
107  *
108  *   o CR0.ET	The 'extension type' bit. This was used originally to indicate
109  *		that the FPU co-processor was present. Now it is forced on for
110  *		compatibility. This is often used to verify whether or not the
111  *		FPU is present.
112  *
113  *   o CR0.NE	The 'native error' bit. Used to indicate that native error
114  *		mode should be enabled. This indicates that we should take traps
115  *		on FPU errors. The OS enables this early in boot.
116  *
117  *   o CR0.MP	The 'Monitor Coprocessor' bit. Used to control whether or not
118  *		wait/fwait instructions generate a #NM if CR0.TS is set.
119  *
120  *   o CR0.EM	The 'Emulation' bit. This is used to cause floating point
121  *		operations (x87 through SSE4) to trap with a #UD so they can be
122  *		emulated. The system never sets this bit, but makes sure it is
123  *		clear on processor start up.
124  *
125  *   o CR0.TS	The 'Task Switched' bit. When this is turned on, a floating
126  *		point operation will generate a #NM. An fwait will as well,
127  *		depending on the value in CR0.MP.
128  *
129  * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
130  * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
131  * complicated role. Historically it has been used to allow running systems to
132  * restore the FPU registers lazily. This will be discussed in greater depth
133  * later on.
134  *
135  * %cr4 is also used as part of the FPU control. Specifically we need to worry
136  * about the following bits in the system:
137  *
138  *   o CR4.OSFXSR	This bit is used to indicate that the OS understands and
139  *			supports the execution of the fxsave and fxrstor
140  *			instructions. This bit is required to be set to enable
141  *			the use of the SSE->SSE4 instructions.
142  *
143  *   o CR4.OSXMMEXCPT	This bit is used to indicate that the OS can understand
144  *			and take a SIMD floating point exception (#XM). This bit
145  *			is always enabled by the system.
146  *
147  *   o CR4.OSXSAVE	This bit is used to indicate that the OS understands and
148  *			supports the execution of the xsave and xrstor family of
149  *			instructions. This bit is required to use any of the AVX
150  *			and newer feature sets.
151  *
152  * Because all supported processors are 64-bit, they'll always support the XMM
153  * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
154  * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
155  *
156  * %xcr0 is used to manage the behavior of the xsave feature set and is only
157  * present on the system if xsave is supported. %xcr0 is read and written
158  * through the xgetbv and xsetbv instructions. This register is present
159  * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
160  * different component of the xsave state and controls whether or not that
161  * information is saved and restored. For newer feature sets like AVX and MPX,
162  * it also controls whether or not the corresponding instructions can be
163  * executed (much like CR4.OSFXSR does for the SSE feature sets).
164  *
165  * Everything in %xcr0 is around features available to users. There is also the
166  * IA32_XSS MSR which is used to control supervisor-only features that are still
167  * part of the xsave state. Bits that can be set in %xcr0 are reserved in
168  * IA32_XSS and vice versa. This is an important property that is particularly
169  * relevant to how the xsave instructions operate.
170  *
171  * Save Mechanisms
172  * ---------------
173  *
174  * When switching between running threads the FPU state needs to be saved and
175  * restored by the OS. If this state was not saved, users would rightfully
176  * complain about corrupt state. There are three mechanisms that exist on the
177  * processor for saving and restoring these state images:
178  *
179  *   o fsave
180  *   o fxsave
181  *   o xsave
182  *
183  * fsave saves and restores only the x87 FPU and is the oldest of these
184  * mechanisms. This mechanism is never used in the kernel today because we are
185  * always running on systems that support fxsave.
186  *
187  * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
188  * state to be saved and restored to and from a struct fxsave_state. This is the
189  * default mechanism that is used to save and restore the FPU on amd64. An
190  * important aspect of fxsave that was different from the original i386 fsave
191  * mechanism is that the restoring of FPU state with pending exceptions will not
192  * generate an exception, it will be deferred to the next use of the FPU.
193  *
194  * The final and by far the most complex mechanism is that of the xsave set.
195  * xsave allows for saving and restoring all of the traditional x86 pieces (x87
196  * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
197  * registers.
198  *
199  * Data is saved and restored into and out of a struct xsave_state. The first
200  * part of the struct xsave_state is equivalent to the struct fxsave_state.
201  * After that, there is a header which is used to describe the remaining
202  * portions of the state. The header is a 64-byte value of which the first two
203  * uint64_t values are defined and the rest are reserved and must be zero. The
204  * first uint64_t is the xstate_bv member. This describes which values in the
205  * xsave_state are actually valid and present. This is updated on a save and
206  * used on restore. The second member is the xcomp_bv member. Its last bit
207  * determines whether or not a compressed version of the structure is used.
208  *
209  * When the uncompressed structure is used (currently the only format we
210  * support), then each state component is at a fixed offset in the structure,
211  * even if it is not being used. For example, if you only saved the AVX related
212  * state, but did not save the MPX related state, the offset would not change
213  * for any component. With the compressed format, components that aren't used
214  * are all elided (though the x87 and SSE state are always there).
215  *
216  * Unlike fxsave which saves all state, the xsave family does not always save
217  * and restore all the state that could be covered by the xsave_state. The
218  * instructions all take an argument which is a mask of what to consider. This
219  * is the same mask that will be used in the xstate_bv vector and it is also the
220  * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
221  * considered with the xsaves and xrstors instructions.
222  *
223  * When a save or restore is requested, a bitwise and is performed between the
224  * requested bits and those that have been enabled in %xcr0. Only the bits that
225  * match that are then saved or restored. Others will be silently ignored by
226  * the processor. This idea is used often in the OS. We will always request that
227  * we save and restore all of the state, but only those portions that are
228  * actually enabled in %xcr0 will be touched.
229  *
230  * If a feature has been asked to be restored that is not set in the xstate_bv
231  * feature vector of the save state, then it will be set to its initial state by
232  * the processor (usually zeros). Also, when asked to save state, the processor
233  * may not write out data that is in its initial state as an optimization. This
234  * optimization only applies to saving data and not to restoring data.
235  *
236  * There are a few different variants of the xsave and xrstor instruction. They
237  * are:
238  *
239  *   o xsave	This is the original save instruction. It will save all of the
240  *		requested data in the xsave state structure. It only saves data
241  *		in the uncompressed (xcomp_bv[63] is zero) format. It may be
242  *		executed at all privilege levels.
243  *
244  *   o xrstor	This is the original restore instruction. It will restore all of
245  *		the requested data. The xrstor function can handle both the
246  *		compressed and uncompressed formats. It may be executed at all
247  *		privilege levels.
248  *
249  *   o xsaveopt	This is a variant of the xsave instruction that employs
250  *		optimizations to try and only write out state that has been
251  *		modified since the last time an xrstor instruction was called.
252  *		The processor tracks a tuple of information about the last
253  *		xrstor and tries to ensure that the same buffer is being used
254  *		when this optimization is being used. However, because of the
255  *		way that it tracks the xrstor buffer based on the address of it,
256  *		it is not suitable for use if that buffer can be easily reused.
257  *		The most common case is trying to save data to the stack in
258  *		rtld. It may be executed at all privilege levels.
259  *
260  *   o xsavec	This is a variant of the xsave instruction that writes out the
261  *		compressed form of the xsave_state. Otherwise it behaves as
262  *		xsave. It may be executed at all privilege levels.
263  *
264  *   o xsaves	This is a variant of the xsave instruction. It is similar to
265  *		xsavec in that it always writes the compressed form of the
266  *		buffer. Unlike all the other forms, this instruction looks at
267  *		both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
268  *		what to save and restore. xsaves also implements the same
269  *		optimization that xsaveopt does around modified pieces. User
270  *		land may not execute the instruction.
271  *
272  *   o xrstors	This is a variant of the xrstor instruction. Similar to xsaves
273  *		it can save and restore both the user and privileged states.
274  *		Unlike xrstor it can only operate on the compressed form.
275  *		User land may not execute the instruction.
276  *
277  * Based on all of these, the kernel has a precedence for what it will use.
278  * Basically, xsaves (not supported) is preferred to xsaveopt, which is
279  * preferred to xsave. A similar scheme is used when informing rtld (more later)
280  * about what it should use. xsavec is preferred to xsave. xsaveopt is not
281  * recommended due to the modified optimization not being appropriate for this
282  * use.
283  *
284  * Finally, there is one last gotcha with the xsave state. Importantly some AMD
285  * processors did not always save and restore some of the FPU exception state in
286  * some cases like Intel did. In those cases the OS will make up for this fact
287  * itself.
288  *
289  * FPU Initialization
290  * ------------------
291  *
292  * One difference with the FPU registers is that not all threads have FPU state,
293  * only those that have an lwp. Generally this means kernel threads, which all
294  * share p0 and its lwp, do not have FPU state. Though there are definitely
295  * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
296  * and lwp interchangeably, just think of thread meaning a thread that has a
297  * lwp.
298  *
299  * Each lwp has its FPU state allocated in its pcb (process control block). The
300  * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
301  * dynamically at start up based on the save mechanism that we're using and the
302  * amount of memory required for it. This is dynamic because the xsave_state
303  * size varies based on the supported feature set.
304  *
305  * The hardware side of the FPU is initialized early in boot before we mount the
306  * root file system. This is effectively done in fpu_probe(). This is where we
307  * make the final decision about what the save and restore mechanisms we should
308  * use are, create the fpsave_cachep kmem cache, and initialize a number of
309  * function pointers that use save and restoring logic.
310  *
311  * The thread/lwp side is a little more involved. There are two different
312  * things that we need to concern ourselves with. The first is how the FPU
313  * resources are allocated and the second is how the FPU state is initialized
314  * for a given lwp.
315  *
316  * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
317  * This is always called unconditionally by the system as part of creating an
318  * LWP.
319  *
320  * There are three different initialization paths that we deal with. The first
321  * is when we are executing a new process. As part of exec all of the register
322  * state is reset. The exec case is particularly important because init is born
323  * like Athena, sprouting from the head of the kernel, without any true parent
324  * to fork from. The second is used whenever we fork or create a new lwp.  The
325  * third is to deal with special lwps like the agent lwp.
326  *
327  * During exec, we will call fp_exec() which will initialize and set up the FPU
328  * state for the process. That will fill in the initial state for the FPU and
329  * also set that state in the FPU itself. As part of fp_exec() we also install a
330  * thread context operations vector that takes care of dealing with the saving
331  * and restoring of the FPU. These context handlers will also be called whenever
332  * an lwp is created or forked. In those cases, to initialize the FPU we will
333  * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
334  * operations vector for the new thread.
335  *
336  * Next we'll end up in the context operation fp_new_lwp(). This saves the
337  * current thread's state, initializes the new thread's state, and copies over
338  * the relevant parts of the originating thread's state. It's at this point that
339  * we also install the FPU context operations into the new thread, which ensures
340  * that all future threads that are descendants of the current one get the
341  * thread context operations (unless they call exec).
342  *
343  * To deal with some things like the agent lwp, we double check the state of the
344  * FPU in sys_rtt_common() to make sure that it has been enabled before
345  * returning to userland. In general, this path should be rare, but it's useful
346  * for the odd lwp here and there.
347  *
348  * The FPU state will remain valid most of the time. There are times that
349  * the state will be rewritten. For example in restorecontext, due to /proc, or
350  * the lwp calls exec(). Whether the context is being freed or we are resetting
351  * the state, we will call fp_free() to disable the FPU and our context.
352  *
353  * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
354  * state by calling fp_lwp_cleanup().
355  *
356  * Kernel FPU Multiplexing
357  * -----------------------
358  *
359  * Just as the kernel has to maintain all of the general purpose registers when
360  * switching between scheduled threads, the same is true of the FPU registers.
361  *
362  * When a thread has FPU state, it also has a set of context operations
363  * installed. These context operations take care of making sure that the FPU is
364  * properly saved and restored during a context switch (fpsave_ctxt and
365  * fprestore_ctxt respectively). This means that the current implementation of
366  * the FPU is 'eager', when a thread is running the CPU will have its FPU state
367  * loaded. While this is always true when executing in userland, there are a few
368  * cases where this is not true in the kernel.
369  *
370  * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
371  * employed. This meant that the FPU would be saved on a context switch and the
372  * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
373  * then take a #NM trap, at which point we would restore the FPU from the save
374  * area and return to userland. Given the frequency of use of the FPU alone by
375  * libc, there's no point returning to userland just to trap again.
376  *
377  * There are a few cases though where the FPU state may need to be changed for a
378  * thread on its behalf. The most notable cases are in the case of processes
379  * using /proc, restorecontext, forking, etc. In all of these cases the kernel
380  * will force a thread's FPU state to be saved into the PCB through the fp_save()
381  * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
382  * pcb. This indicates that the save state holds currently valid data. As a side
383  * effect of this, CR0.TS will be set. To make sure that all of the state is
384  * updated before returning to userland, in these cases, we set a flag on the
385  * PCB that says the FPU needs to be updated. This will make sure that we take
386  * the slow path out of a system call to fix things up for the thread. Due to
387  * the fact that this is a rather rare case, effectively setting the equivalent
388  * of t_postsys is acceptable.
389  *
390  * CR0.TS will be set after a save occurs and cleared when a restore occurs.
391  * Generally this means it will be cleared immediately by the new thread that is
392  * running in a context switch. However, this isn't the case for kernel threads.
393  * They currently operate with CR0.TS set as no kernel state is restored for
394  * them. This means that using the FPU will cause a #NM and panic.
395  *
396  * The FPU_VALID flag on the currently executing thread's pcb is meant to track
397  * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
398  * However, because we eagerly restore, the only time that CR0.TS should be set
399  * for a non-kernel thread is during operations where it will be cleared before
400  * returning to userland and importantly, the only data that is in it is its
401  * own.
402  *
403  * Kernel FPU Usage
404  * ----------------
405  *
406  * Traditionally the kernel never used the FPU since it had no need for
407  * floating point operations. However, modern FPU hardware supports a variety
408  * of SIMD extensions which can speed up code such as parity calculations or
409  * encryption.
410  *
411  * To allow the kernel to take advantage of these features, the
412  * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
413  * around any usage of the FPU by the kernel to ensure that user-level context
414  * is properly saved/restored, as well as to properly setup the FPU for use by
415  * the kernel. There are a variety of ways this wrapping can be used, as
416  * discussed in this section below.
417  *
418  * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
419  * operations, the kernel_fpu_alloc() function should be used to allocate a
420  * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
421  * state. This structure is not tied to any thread. That is, different threads
422  * can reuse the same kfpu_state_t structure, although not concurrently. A
423  * kfpu_state_t structure is freed by the kernel_fpu_free() function.
424  *
425  * In some cases, the kernel may need to use the FPU for a short operation
426  * without the overhead to manage a kfpu_state_t structure and without
427  * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
428  * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
429  * parameter. This indicates that there is no kfpu_state_t. When used this way,
430  * kernel preemption should be disabled by the caller (kpreempt_disable) before
431  * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
432  * For this usage, it is important to limit the kernel's FPU use to short
433  * operations. The tradeoff between using the FPU without a kfpu_state_t
434  * structure vs. the overhead of allowing a context switch while using the FPU
435  * should be carefully considered on a case by case basis.
436  *
437  * In other cases, kernel threads have an LWP, but never execute in user space.
438  * In this situation, the LWP's pcb_fpu area can be used to save/restore the
439  * kernel's FPU state if the thread is context switched, instead of having to
440  * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
441  * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
442  * enable this behavior. It is the caller's responsibility to ensure that this
443  * is only used for a kernel thread which never executes in user space.
444  *
445  * FPU Exceptions
446  * --------------
447  *
448  * Certain operations can cause the kernel to take traps due to FPU activity.
449  * Generally these events will cause a user process to receive a SIGFPE and if
450  * the kernel receives it in kernel context, we will die. Traditionally the #NM
451  * (Device Not Available / No Math) exception generated by CR0.TS would have
452  * caused us to restore the FPU. Now it is a fatal event regardless of whether
453  * or not userland causes it.
454  *
455  * While there are some cases where the kernel uses the FPU, it is up to the
456  * kernel to use the FPU in a way such that it cannot receive a trap or to use
457  * the appropriate trap protection mechanisms.
458  *
459  * Hypervisors
460  * -----------
461  *
462  * When providing support for hypervisors things are a little bit more
463  * complicated because the FPU is not virtualized at all. This means that they
464  * need to save and restore the FPU and %xcr0 across entry and exit to the
465  * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
466  * allow us to use the full native state to make sure that we are always saving
467  * and restoring the full FPU that the host sees, even when the guest is using a
468  * subset.
469  *
470  * One tricky aspect of this is that the guest may be using a subset of %xcr0
471  * and therefore changing our %xcr0 on the fly. It is vital that when we're
472  * saving and restoring the FPU that we always use the largest %xcr0 contents
473  * otherwise we will end up leaving behind data in it.
474  *
475  * ELF PLT Support
476  * ---------------
477  *
478  * rtld has to preserve a subset of the FPU when it is saving and restoring
479  * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
480  * more information. As a result, we set up an aux vector that contains
481  * information about what save and restore mechanisms it should be using and
482  * the sizing thereof based on what the kernel supports. This is passed down in
483  * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
484  * initialized in fpu_subr.c.
485  *
486  * Signal Handling and the ucontext_t
487  * ----------------------------------
488  *
489  * One of the many gifts that signals give us is the twofold fact that when a
490  * signal occurs, the signal handler is allowed to change the CPU's state
491  * arbitrarily and when the signal handler is done executing, we must restore it
492  * back to the original state. However, the second part of this is that the
493  * signal handler is actually allowed to modify the state that the thread will
494  * return to! To create this facade, the kernel will create a full ucontext_t
495  * state, effectively calling getcontext(2) on the thread's behalf, and a
496  * pointer to that is given to the signal handler (the void * argument for the
497  * sa_sigaction function pointer in sigaction(2)). When libc is done with a
498  * signal, it will call setcontext(2) with that same ucontext_t.
499  *
500  * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
501  * it's often declared on the stack itself, with the signal handler spilling all
502  * this state to the stack. The ucontext_t machine portion was broken into the
503  * general purpose and floating point registers. In 64-bit code, the floating
504  * point registers were mostly the same as the results of the fxsave instruction
505  * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
506  * starting point for information, it is transformed into a different shape to
507  * deal with the history of the 32-bit SYS V ABI.
508  *
509  * While this worked, if you're reading this, you're aware that the x86 FPU and
510  * extended register states didn't stop at the initial 16 128-bit %xmm
511  * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
512  * opmask registers. None of these fit inside the standard ucontext_t; however,
513  * they must all be preserved and restored across a signal. While the various
514  * x86 platform-specific ABIs all suggest that these registers are not preserved
515  * across a function call, receiving a signal is not a function call and must be
516  * thought of like a process receiving an interrupt. In other words, this
517  * extended state must be preserved.
518  *
519  * To facilitate this, we have extended the ucontext_t structure with an
520  * additional flag, UC_XSAVE, which indicates that the traditional padding
521  * member, uc_xsave, actually is a pointer to the extended state. While this is
522  * accessible outside of a signal handling context through the combination of
523  * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
524  * state is focused on signal handling. Signal handling spills all this state to
525  * the stack and if we cannot spill the entire state to the stack then our
526  * inability to deliver the signal results in the process being killed! While
527  * there are separate efforts to ensure that the signal stack sizing that is
528  * used for the minimum and maximum signal sizes are sufficient, we still need
529  * to do our part to minimize the likelihood here.
530  *
531  * In designing this, we make the following observations which have helped us
532  * focus our design:
533  *
534  *   o While the start of an xsave area is the traditional 512-byte fxsave XMM
535  *     region, we already have that in the fpregs. Thus there is no reason to
536  *     duplicate it. This not only saves 512 bytes of additional stack space,
537  *     but it also means we don't have to ask which of the version of it to take
538  *     if they were to differ.
539  *
540  *   o Many applications out there aren't necessarily using the extended vectors
541  *     and even when we do make libc and others take advantage of it, it will
542  *     behoove us to ensure that they are put back into their initial state
543  *     after use. This leads us to expect that in a number of cases, the actual
544  *     extended register state will be in its initial state.
545  *
546  *   o While the signal handler does allow contents to be modified, we are
547  *     starting with making the interface private and thus allowing us to excise
548  *     components that are in their initial state.
549  *
550  *   o There are similarities to what we want to create with the compressed
551  *     xsave format; however, because we don't always have support for the
552  *     compressed format, we can't just arbitrarily say let's do a compressed
553  *     save to the user stack.
554  *
555  *   o Because we are not handing this state directly to and from hardware, we
556  *     don't need to meet some of the constraints of the compressed xsave format
557  *     around wanting alignment for the initial save or additional components.
558  *
559  * All of the above lead us to our own unique format for this data. When the
560  * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
561  * uc_xsave_t structure which has a magic version number, a 32-bit length of the
562  * overall structure, and the 64-bit state bit-vector to represent which
563  * components are valid. Following this 8-byte header, each component that is
564  * present in the bit vector is immediately written out in roughly ascending bit
565  * order (the order is determined based on the order of the fpu_xsave_info
566  * array).
567  *
568  * This makes the rough logic that we have here when taking a signal and writing
569  * out this state as:
570  *
571  *   1. Ensure that the FPU is saved and that the contents of the pcb save area
572  *      are valid. That is, call fp_save() if the state is not already flagged
573  *      with FPU_VALID.
574  *
575  *   2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
576  *      and XFEATURE_SSE bits as these will be placed in the xsave area.
577  *
578  *   3. Initialize the uc_xsave_t by setting our version field, initializing the
579  *      length to the length of the current structure, and then setting the
580  *      modified bit vector above.
581  *
582  *   4. Walk each remaining bit of the bit-vector. For each set bit, copy out
583  *      its extended state starting at the current length in the header and then
584  *      increase the header size by that length.
585  *
586  *   5. Finally write out the final uc_xsave_t structure.
587  *
588  * The above process is also used when someone manually calls getcontext_extd(2)
589  * to get this state. The main difference between the two is which copyout
590  * function we use. This deserves some explanation. Our main starting point for
591  * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
592  * the signal handling context to operate with a different copyout than we
593  * normally use in say getcontext_extd(2).
594  *
595  * When we've received a signal, we're at the intersection of several different
596  * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
597  * the watchpoints effectively set a copyout override function (t_copyops) that
598  * we end up vectoring to rather than a normal copyout. This allows the data to
599  * be modified and for the watchpoint to fire. While this is all well and good
600  * normally, it is problematic if we are trying to handle a signal. The signal
 * delivery logic, sendsig(), goes through and disables the watchpoint for the
602  * region of the stack that we are copying out to. However, disabling
603  * watchpoints is not sufficient, we also need to use the copyout_noerr
604  * variants.
605  *
606  * These variants also require the use of on_fault() and no_fault() for error
607  * handling. While it is tempting to try and on_fault() the entire
608  * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
609  * The first is that we don't want to disable faults during the entire operation
610  * as if the kernel messes up we will treat that as a user error. That isn't
611  * theoretical and happened during development. The second and perhaps more
612  * important issue is that correctly bounding the on_fault() / no_fault() means
613  * being careful about state. For example, kernel pre-emption is often disabled
614  * during parts of these operations, but it needs to be re-enabled when we're
615  * done. This would require tracking in some volatile variable that this had
616  * been enabled and disabled and tracking that.
617  *
618  * Instead, this is why fpu_signal_copyout() takes a copy out function as an
619  * argument. When we're in signal handling context, the function will use
 *     copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
621  *
622  * RESTORING STATE
623  *
624  * Copying out our current state is the easier half of this problem. When the
625  * kernel is done with a signal it calls setcontext(2) with the ucontext_t we
626  * assembled for it as described above. setcontext(2) isn't just used for
627  * returning from signals.
628  *
629  * The process for this goes in two steps. The first step is to copy in,
630  * validate, and transform the ucontext_t UC_XSAVE that we created above into an
631  * equivalent xsave format that we can use the appropriate xrstor function on.
632  * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
633  * come back through a second phase that is driven out of restorecontext() and
634  * is implemented in fpu_set_xsave().
635  *
636  * Let's start by discussing the second part of this, which is more
637  * straightforward. In particular, the second phase assumes that all of the
638  * validation and error handling has been done by the first phase. This means
639  * here, we have a buffer that is already the appropriate size
640  * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
641  * replace the actual save state with the current one.
642  *
643  * The only piece of shenanigans we have to do is around the kernel provided
644  * notion of 'status' and 'xstatus', which are cached versions of the x87 and
645  * SSE exception vectors. These are part of the fpregset ABI and therefore we
646  * need to propagate them from the temporary storage that part 1 sets up in the
647  * ignored region of the fxsave data. We use that because it is not persisted by
648  * the CPU, so clobbering it is generally alright.
649  *
650  * Once that is done, we simply note that we need a PCB update to occur to
651  * refresh the FPU state before we return to userland. Given that someone has
652  * called setcontext(2), this was always going to happen because we have to
653  * update segment registers and related, so this isn't so bad. With that, let's
654  * move onto the more nuanced part (1).
655  *
656  * When we're handling a setcontext(2) we have, in userland, a data structure
657  * that should match one we serialized out, though we cannot assume that a user
658  * has not modified it either accidentally or maliciously. Our goal is to set up
659  * the appropriate xsave state that can be passed to the CPU's xrstor. The first
660  * problem we have to deal with is where do we actually put this state?
661  *
662  * While not many programs actually call setcontext(2) of their own volition,
663  * this is going to get hit every time we take a signal. The first thought was
664  * to re-use the existing thread's save area; however, that's a bit challenging
665  * for a few reasons. In particular, we would need to ensure that we don't go
666  * off-CPU for any reason, which we cannot assume with a copyin from a user
667  * address space. In particular, it is trivial for us to hit a case where the
668  * stack has been paged out for some reason, which eschews that path.
669  *
670  * Instead, whenever a thread first calls setcontext(2), generally from signal
671  * context, we will at that time allocate another entry from the 'fpsave_cachep'
672  * kmem cache, giving us a buffer of the appropriate space to handle this. Once
673  * this buffer has been allocated, we leave it assigned to the thread's pcb and
674  * only tear it down when the thread itself finally exits. We reason that a
675  * thread that takes a signal once is either going to have the process exit
676  * shortly thereafter or is much more likely to take a signal again in the
677  * future. Many daemons and other processes set things up so signals are
 *     dispatched via one location, masking signals in other threads, using
679  * sigsuspend(2), signalfd(3C), or something similar.
680  *
681  * With this buffer in hand, we begin our task of reassembling state. Note, all
682  * of this is conditional on UC_XSAVE being set in the uc_flags member of the
683  * ucontext_t. If it is not set, then we assume that there is no extended state
684  * and will use the traditional path of setting the fpregset_t into the system
685  * via setfpregs().
686  *
687  * We first will copyin and validate the uc_xsave_t. In particular, we need to
688  * make sure the version makes sense, that the xsave component bit-vector
689  * doesn't have anything unexpected and more importantly unsupported in it, and
690  * that the addresses we've been given are within the user address space. At
691  * this point we can walk through our table of implemented bits and process
692  * them.
693  *
694  * For most components in here, the processing is straightforward. We continue
695  * walking our cursor and copy data into the kernel and place it in the
696  * appropriate place in our xsave state. If a xsave state component bit-vector
697  * isn't set, then we must ensure that we have the item in the initial state,
698  * which for everything other than the x87/SSE state is the memory being zeroed.
699  *
700  * The most unique case in the copyin state is that of the x87/SSE state. You
701  * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
702  * but instead have opted to use the single definition in the fpregset_t. Thus
703  * here, we copy it out of the fpregset_t, which the kernel has helpfully
704  * already unified into the 64-bit fxsave version prior to calling us, and
705  * install that into the save area we're building up.
706  *
707  * As part of this, there are two important pieces to be aware of. The first is
708  * that because the fpregset_t has both the status and xstatus members
709  * mentioned earlier, we temporarily copy them to the software-usable ignored
710  * areas of the fxsave state so we can corral this extra state into part (2)
711  * without needing to allocate additional space. The second piece is that when
712  * we're done processing this we explicitly remove the UC_FPU flag that would
713  * tell the kernel to proceed with updating that region. The problem is that
714  * that goes directly into the pcb's save area and not to the intermediate
715  * buffer as it uses the same entry point as /proc, mainly setfpregs().
716  *
717  * We don't do much validation of the actual contents of the registers that are
718  * being set with the exception of ensuring that no reserved bits of the mxcsr
719  * are used. This is not as strict as /proc, but failure here means the process
720  * is likely going to die (returning from setcontext() in a signal handler is
721  * fatal).
722  *
723  * /proc xregs
724  * -----------
725  *
726  * Observability of the state of the extended registers is important for
727  * understanding the system. While on the surface this is similar to signal
728  * handling, it is crucially different in a number of ways:
729  *
730  *   o In signal handling, we're trying to conserve every byte of stack that we
731  *     can.
732  *   o The /proc xregs file will end up in core files, which means that we need
733  *     a way of knowing what components are present and not present in it,
734  *     because this will vary from CPU to CPU due to the addition of
735  *     architectural features. For example, some CPUs support AVX-512, but
736  *     others do not.
737  *
738  *   o The signal handling structure (uc_xsave_t) is private and we're not
739  *     trying to have software modify it, on the other hand, the /proc
740  *     interfaces that we support we do want software to be able to interrogate
741  *     and manipulate. These need to be something that we can introduce
742  *     additional components into and make other changes that still allow it to
743  *     work.
744  *
745  * The x86 xregs format is documented in proc(5). The short form is that the
746  * prxregset_hdr_t has a number of information entries, which are of the type
747  * prxregset_info_t. Each of the information headers has a type, size, and
748  * offset which indicate where to find the additional data.
749  *
750  * Each entry is described as one of the entries in the fpu_xsave_info[]. These
751  * items either are a 1:1 correspondence with a xsave related feature (e.g.
752  * there is one entry for each of the three AVX-512 components) or it is
753  * something synthetic that we provide as additional information such as the
754  * PRX_INFO_XCR, which is a way of getting information about the system such as
755  * what is enabled in %xcr0 out there.
756  *
757  * Unlike signal handling, we are given the buffer to place everything that
758  * needs to be written out. This is partially the design of the /proc APIs. That
759  * is, we will always assemble everything into the entire buffer that /proc asks
760  * us to, and then it will use as much or as little of it as is required.
761  * Similarly, when setting things, we don't have to worry about copying in
762  * information in the same way as signal handling does, because /proc takes care
763  * of it and always hands us a full buffer. Sizing that is a little nuanced, but
764  * is all handled in prmachdep.c.
765  *
766  * When someone performs a read of the xregs and thus is asking us for the
767  * current state, there is a little bit of nuance that we need to deal with.
768  * The first, is whether or not the FPU is enabled and the second is if the FPU
769  * is enabled, whether a given component is noted as being in its initial state.
770  * This basically gives us three possible states for a given component:
771  *
772  *   1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
773  *      the illumos FPU default for an item. More on that in a moment.
774  *   2. The saved xsave state indicates that the bit for a given component is
775  *      zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
776  *      In this case, we must take the CPU's default for an item. This is
777  *      usually the same as illumos, but not always.
778  *   3. The saved xsave state indicates that a given component's state bit is
779  *      valid. The simplest of our cases. We can just take what we have from the
780  *      xsave state.
781  *
782  * The CPU's default state for most components other than the x87/SSE state is
783  * to have it be zeroed. This is what we treat as our default state as well. The
784  * primary difference is in the initialization of the x87/SSE state. The SYS V
 * ABI requires that we enable a different floating point control word than the
786  * hardware default. This means that when we're dealing with case (1) for
787  * x87/SSE we have to be more careful than the other components. Thankfully for
788  * everything else this is just keeping it zeroed.
789  *
790  * A reasonable question would be why not just skip components that aren't
791  * marked as present. There are a few reasons we take a different approach and
792  * always include them. Both of these are to make lives simpler for consumers.
793  * In the first case, when someone is performing a read and wants to reassemble
794  * and answer the question of 'what is the value of %ymm0 or %zmm15', they have
795  * to combine multiple disparate parts. If one knows that the data we put into
796  * there is always valid and represents what is in hardware and doesn't have to
797  * keep track of what are the defaults in different circumstances, then that
798  * greatly simplifies consumers lives. It also helps us for core files and other
799  * observability cases because the answer to what is the operating system's
800  * default may change over time.
801  *
802  * Similarly, including all the possible structures means that we have
803  * simplified writes. Writes are always setting the full state of a thread,
804  * meaning that if someone wants to modify only a single register they must do a
805  * read, modify, and write. By including everything that they might need, it
806  * makes it easier for consumers to do this and not have to cons up the whole
807  * structure on their own.
808  *
809  * When we're setting state, things change around a little bit. We have a few
810  * constraints that are laid out in proc(5). In particular, we require that the
811  * PRX_INFO_XSAVE component always be present to tell us which other components
812  * we expect to be here and which ones we don't. We also are much stricter about
813  * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
814  * and may not be modified by a calling process. In addition, when we have
815  * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
816  * components, if they are being written to and have modifications, then we will
817  * indicate an error there.
818  *
819  * Because we are given the entire buffer from userland and don't need to have
820  * an intermediate place to copy it in, we will validate the entire thing in
821  * advance. Once it has been validated and we consider it legal, then we will
822  * translate each entry into its corresponding entry in pcb's normal floating
823  * point state. This is different from signal handling mostly because of the
824  * fact that we are not using copyin, and once we get to this point, there is
825  * no more validation, so we don't have the same concerns around blocking while
826  * pre-emption is disabled.
827  *
828  * The Wrinkle with fpregs
829  * -----------------------
830  *
831  * When we instead turn our attention to the fpregs, whether we're gathering
832  * them as part of the ucontext_t or as part of /proc, there are a few
833  * complications that we need to be aware of when we're operating on a kernel
834  * that is using xsave as the save mechanism. When we're using fxsave as the
835  * save mechanism, the CPU will always save the entire 512-byte fxsave region.
836  * The fpregs ABI that the kernel expects is basically this structure itself,
837  * which is transformed into a 32-bit compatible form in archdep.c.
838  *
839  * But xsave makes this much more complex and has historically been a source of
840  * bugs in the system. In particular, unlike fxsave, xsave has its component bit
841  * vector that is written out to indicate validity. This means that blindly
842  * copying the fxsave area without checking those bits will lead us to do the
843  * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
844  * while the x87 legacy fp flag covers the rest of the state. This is all good,
 * aside from the MXCSR.
846  *
847  * One of the more complicated pieces of xsave state management is correctly
848  * answering the question of when the MXCSR is written out to xsave_state. In
849  * practice, this is rather convoluted and varies. If either the XMM or AVX
850  * feature bits are set then the CPU will write out the MXCSR and its mask
851  * register into the traditional fxsave state region. This behavior is dependent
852  * on the type of save function that we use. xsave and xsaveopt will look at the
853  * AVX feature bit; however, xsavec does not and only considers the SSE feature
854  * bit. This means that when we're retrieving things, we need to check both of
855  * those bits to determine if we should use the initial state or the value
856  * written out.
857  *
858  * When we come to someone trying to set the fpregs through /proc, the main
859  * question we have is what happens to the extended registers. We have opted to
860  * implement and document it such that a write to the fpregs only impacts the
861  * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
862  * copying the data into the save area, set the state bits for x87 and XMM
863  * state, and then set the FPU to be restored. All in all, this basically means
864  * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
865  * that we might have present.
866  *
867  * Forward Looking: Adding Intel AMX Support
868  * -----------------------------------------
869  *
870  * Nothing can stop the march of features being added into the FPU. One of the
871  * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
872  * Extensions (AMX), which add a large chunk of xsave state to each process.
873  * While things like AVX and AVX-512 have been enabled by default, the broader
 * OS community has not been wanting to do this for AMX, because of the size of
875  * the state which exceeds 8 KiB. While the signal handling state went out of
876  * its way to minimize the size it wrote to the stack, if this is used, it would
877  * need to be preserved.
878  *
879  * To deal with this reality and the fact that folks don't really want to
880  * enable it by default for all purposes when its use will be quite special
881  * purpose, Intel has also added a MSR around extended feature disable or xfd.
882  * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
883  * assumption, and the reason that so much of the /proc and signal logic ensures
884  * that we have the thread and process around, taking as an example the unused
885  * process argument in fpu_proc_xregs_info(), is that we will follow suit and
886  * default to having support disabled, but that a process will be able to opt
887  * into it, which will result in several different assumptions around signal
888  * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
889  *
890  * The following is a list of items to pay attention to for future folks who
891  * work on this:
892  *
893  *   o We will want to confirm whether other systems have opted to make this
894  *     process-wide or thread-wide. Assuming process-wide, we will need to do a
895  *     hold of all lwps while making a change. The interface for that probably
896  *     doesn't want to be /proc, as a process probably doesn't want to write to
897  *     its own control file. Changing it for another process could be done
898  *     through the agent-lwp.
899  *   o Opting into this should probably be a one-way street.
900  *   o Opting into this will need to evaluate all threads and in particular
901  *     stack sizes to confirm they adhere to the new minimum.
902  *   o We will need to make sure that setting and clearing the xfd MSR is part
903  *     of the FPU context ops and something we set by default on every CPU.
904  *   o We will need to add a new interface to allow opting into this feature.
905  *   o We will need to ensure that all subsequently created signal stacks adhere
906  *     to a required minimum size that we communicate through libc.
907  *   o We will need to make sure that both rtld and libc no longer rely on a
908  *     static value of the AT_SUN_FPSIZE, but rather realize that this can be
909  *     dynamic. At that time, we should evaluate if we can get away with not
910  *     needing to save this for rtld, even though signal handlers should assume
911  *     they will.
912  *   o The various components (because there is more than one) will want to be
 *     added to the fpu_xsave_info[]. Consulting the process's xfd will be
914  *     required and probably require logic changes.
915  *
916  * The above is not exhaustive. We'll probably have some other issues and fun
917  * while doing this.
918  */
919 
/*
 * The kind of FPU we advertise to rtld so it knows what to do when working
 * through the PLT.
 */
int fp_elf = AT_386_FPINFO_FXSAVE;

/*
 * Mechanism to save FPU state. FP_FXSAVE is the default; presumably this is
 * switched to FP_XSAVE during FPU probe on XSAVE-capable hardware -- TODO
 * confirm against fpu_probe.
 */
int fp_save_mech = FP_FXSAVE;
930 
/*
 * Required alignment of an fxsave area. See section 10.5.1 in the Intel 64
 * and IA-32 Architectures Software Developer's Manual, Volume 1.
 */
#define	FXSAVE_ALIGN	16

/*
 * Required alignment of an xsave area. See section 13.4 in the Intel 64 and
 * IA-32 Architectures Software Developer's Manual, Volume 1.
 */
#define	XSAVE_ALIGN	64

/*
 * kmem cache used to allocate FPU save areas, e.g. the per-thread buffer that
 * setcontext(2) handling attaches to the pcb (see the block comment above).
 */
kmem_cache_t *fpsave_cachep;

/* Legacy fxsave layout + xsave header + ymm */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)

/*
 * Various sanity checks: the architectural sizes of the fxsave/fnsave areas,
 * 16-byte alignment of the %xmm save slots, and that our xsave_state can hold
 * at least the AVX layout.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
955 
/*
 * Basic architectural alignment information for the vector register files.
 */
#define	FPU_ALIGN_XMM	16
#define	FPU_ALIGN_YMM	32
#define	FPU_ALIGN_ZMM	64

/*
 * This structure is the x86 implementation of the kernel FPU that is defined in
 * uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
	/*
	 * This indicates that the save state has initial FPU data.
	 */
	KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
	fpu_ctx_t	kfpu_ctx;	/* FPU save state and flags */
	kfpu_flags_t	kfpu_flags;	/* KFPU_F_* flags above */
	kthread_t	*kfpu_curthread; /* presumably the owning thread; */
					/* TODO confirm against kfpu users */
};
980 
/*
 * Initial kfpu state for SSE/SSE2 used by fpinit(). Note that the control
 * word (FPU_CW_INIT) and MXCSR (SSE_MXCSR_INIT) are the SYS V ABI defaults
 * rather than the hardware reset values (see the block comment above).
 */
const struct fxsave_state sse_initial = {
	FPU_CW_INIT,	/* fx_fcw */
	0,		/* fx_fsw */
	0,		/* fx_fctw */
	0,		/* fx_fop */
	0,		/* fx_rip */
	0,		/* fx_rdp */
	SSE_MXCSR_INIT	/* fx_mxcsr */
	/* rest of structure is zero */
};

/*
 * Initial kfpu state for AVX used by fpinit()
 */
const struct xsave_state avx_initial = {
	/*
	 * The definition below needs to be identical with sse_initial
	 * defined above.
	 */
	.xs_fxsave = {
		.fx_fcw = FPU_CW_INIT,
		.fx_mxcsr = SSE_MXCSR_INIT,
	},
	.xs_header = {
		/*
		 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
		 * valid, and CPU should initialize XMM/YMM.
		 */
		.xsh_xstate_bv = 1,
		.xsh_xcomp_bv = 0,
	},
};
1016 
/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 * have an XSAVE-capable chip in fpu_probe. These are the ctxop save/restore
 * entry points used by fp_ctxop_allocate() below.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

/* Forward declarations for functions defined later in this file. */
static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);
static void fp_new_lwp(void *, void *);
static void fp_free_ctx(void *, int);
1040 
1041 static struct ctxop *
fp_ctxop_allocate(struct fpu_ctx * fp)1042 fp_ctxop_allocate(struct fpu_ctx *fp)
1043 {
1044 	const struct ctxop_template tpl = {
1045 		.ct_rev		= CTXOP_TPL_REV,
1046 		.ct_save	= fpsave_ctxt,
1047 		.ct_restore	= fprestore_ctxt,
1048 		.ct_fork	= fp_new_lwp,
1049 		.ct_lwp_create	= fp_new_lwp,
1050 		.ct_free	= fp_free_ctx,
1051 	};
1052 	return (ctxop_allocate(&tpl, fp));
1053 }
1054 
1055 /*
1056  * Copy the state of parent lwp's floating point context into the new lwp.
1057  * Invoked for both fork() and lwp_create().
1058  *
1059  * Note that we inherit -only- the control state (e.g. exception masks,
1060  * rounding, precision control, etc.); the FPU registers are otherwise
1061  * reset to their initial state.
1062  */
static void
fp_new_lwp(void *parent, void *child)
{
	kthread_id_t t = parent, ct = child;
	struct fpu_ctx *fp;		/* parent fpu context */
	struct fpu_ctx *cfp;		/* new fpu context */
	struct fxsave_state *fx, *cfx;
	struct xsave_state *cxs;

	ASSERT(fp_kind != FP_NO);

	fp = &t->t_lwp->lwp_pcb.pcb_fpu;
	cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;

	/*
	 * If the parent FPU state is still in the FPU hw then save it;
	 * conveniently, fp_save() already does this for us nicely.
	 */
	fp_save(fp);

	cfp->fpu_flags = FPU_EN | FPU_VALID;
	cfp->fpu_regs.kfpu_status = 0;
	cfp->fpu_regs.kfpu_xstatus = 0;

	/*
	 * Make sure that the child's FPU is cleaned up and made ready for user
	 * land.
	 */
	PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
		/*
		 * Start from the initial state, then inherit only the parent's
		 * control state: MXCSR with the sticky exception flags
		 * cleared, and the x87 control word.
		 */
		bcopy(&sse_initial, cfx, sizeof (*cfx));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		break;

	case FP_XSAVE:
		cfp->fpu_xsave_mask = fp->fpu_xsave_mask;

		VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);

		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
		cfx = &cxs->xs_fxsave;

		/*
		 * As above: initial state plus the parent's control state
		 * (MXCSR with exception flags cleared, x87 control word).
		 */
		bcopy(&avx_initial, cxs, sizeof (*cxs));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		cxs->xs_header.xsh_xstate_bv |=
		    (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/*
	 * Attach the FPU context operations to the child so its state is
	 * saved/restored across context switches and freed on thread exit.
	 * (The child's PCB_SET_UPDATE_FPU above already ensures its FPU is
	 * refreshed before it returns to userland.)
	 */

	ctxop_attach(ct, fp_ctxop_allocate(cfp));
}
1129 
1130 /*
1131  * Free any state associated with floating point context.
1132  * Fp_free can be called in three cases:
1133  * 1) from reaper -> thread_free -> freectx-> fp_free
1134  *	fp context belongs to a thread on deathrow
1135  *	nothing to do,  thread will never be resumed
1136  *	thread calling ctxfree is reaper
1137  *
1138  * 2) from exec -> freectx -> fp_free
1139  *	fp context belongs to the current thread
1140  *	must disable fpu, thread calling ctxfree is curthread
1141  *
1142  * 3) from restorecontext -> setfpregs -> fp_free
1143  *	we have a modified context in the memory (lwp->pcb_fpu)
1144  *	disable fpu and release the fp context for the CPU
1145  *
1146  */
1147 void
fp_free(struct fpu_ctx * fp)1148 fp_free(struct fpu_ctx *fp)
1149 {
1150 	ASSERT(fp_kind != FP_NO);
1151 
1152 	if (fp->fpu_flags & FPU_VALID)
1153 		return;
1154 
1155 	kpreempt_disable();
1156 	/*
1157 	 * We want to do fpsave rather than fpdisable so that we can
1158 	 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
1159 	 */
1160 	fp->fpu_flags |= FPU_VALID;
1161 	/* If for current thread disable FP to track FPU_VALID */
1162 	if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
1163 		/* Clear errors if any to prevent frstor from complaining */
1164 		(void) fperr_reset();
1165 		if (fp_kind & __FP_SSE)
1166 			(void) fpxerr_reset();
1167 		fpdisable();
1168 	}
1169 	kpreempt_enable();
1170 }
1171 
1172 /*
1173  * Wrapper for freectx to make the types line up for fp_free()
1174  */
1175 static void
fp_free_ctx(void * arg,int isexec __unused)1176 fp_free_ctx(void *arg, int isexec __unused)
1177 {
1178 	fp_free((struct fpu_ctx *)arg);
1179 }
1180 
1181 /*
1182  * Store the floating point state and disable the floating point unit.
1183  */
void
fp_save(struct fpu_ctx *fp)
{
	ASSERT(fp_kind != FP_NO);

	kpreempt_disable();
	/*
	 * Nothing to do when there is no context, the in-memory copy is
	 * already current (FPU_VALID), or the FPU was never enabled for this
	 * context.
	 */
	if (!fp || fp->fpu_flags & FPU_VALID ||
	    (fp->fpu_flags & FPU_EN) == 0) {
		kpreempt_enable();
		return;
	}
	ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);

	/* Save using whichever mechanism this kernel was configured with. */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
		break;

	case FP_XSAVE:
		xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_flags |= FPU_VALID;

	/*
	 * We save the FPU as part of forking, execing, modifications via /proc,
	 * restorecontext, etc. As such, we need to make sure that we return to
	 * userland with valid state in the FPU. If we're context switched out
	 * before we hit sys_rtt_common() we'll end up having restored the FPU
	 * as part of the context ops operations. The restore logic always makes
	 * sure that FPU_VALID is set before doing a restore so we don't restore
	 * it a second time.
	 */
	PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);

	kpreempt_enable();
}
1225 
1226 /*
1227  * Restore the FPU context for the thread:
1228  * The possibilities are:
1229  *	1. No active FPU context: Load the new context into the FPU hw
1230  *	   and enable the FPU.
1231  */
1232 void
fp_restore(struct fpu_ctx * fp)1233 fp_restore(struct fpu_ctx *fp)
1234 {
1235 	switch (fp_save_mech) {
1236 	case FP_FXSAVE:
1237 		fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
1238 		break;
1239 
1240 	case FP_XSAVE:
1241 		xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
1242 		break;
1243 	default:
1244 		panic("Invalid fp_save_mech");
1245 		/*NOTREACHED*/
1246 	}
1247 
1248 	fp->fpu_flags &= ~FPU_VALID;
1249 }
1250 
1251 /*
1252  * Reset the FPU such that it is in a valid state for a new thread that is
1253  * coming out of exec. The FPU will be in a usable state at this point. At this
1254  * point we know that the FPU state has already been allocated and if this
1255  * wasn't an init process, then it will have had fp_free() previously called.
1256  */
1257 void
fp_exec(void)1258 fp_exec(void)
1259 {
1260 	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1261 
1262 	if (fp_save_mech == FP_XSAVE) {
1263 		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
1264 	}
1265 
1266 	struct ctxop *ctx = fp_ctxop_allocate(fp);
1267 	/*
1268 	 * Make sure that we're not preempted in the middle of initializing the
1269 	 * FPU on CPU.
1270 	 */
1271 	kpreempt_disable();
1272 	ctxop_attach(curthread, ctx);
1273 	fpinit();
1274 	fp->fpu_flags = FPU_EN;
1275 	kpreempt_enable();
1276 }
1277 
1278 
1279 /*
1280  * Seeds the initial state for the current thread.  The possibilities are:
1281  *      1. Another process has modified the FPU state before we have done any
1282  *         initialization: Load the FPU state from the LWP state.
1283  *      2. The FPU state has not been externally modified:  Load a clean state.
1284  */
1285 void
fp_seed(void)1286 fp_seed(void)
1287 {
1288 	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1289 
1290 	ASSERT(curthread->t_preempt >= 1);
1291 	ASSERT((fp->fpu_flags & FPU_EN) == 0);
1292 
1293 	/*
1294 	 * Always initialize a new context and initialize the hardware.
1295 	 */
1296 	if (fp_save_mech == FP_XSAVE) {
1297 		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
1298 	}
1299 
1300 	ctxop_attach(curthread, fp_ctxop_allocate(fp));
1301 	fpinit();
1302 
1303 	/*
1304 	 * If FPU_VALID is set, it means someone has modified registers via
1305 	 * /proc.  In this case, restore the current lwp's state.
1306 	 */
1307 	if (fp->fpu_flags & FPU_VALID)
1308 		fp_restore(fp);
1309 
1310 	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
1311 	fp->fpu_flags = FPU_EN;
1312 }
1313 
1314 /*
1315  * When using xsave/xrstor, these three functions are used by the lwp code to
1316  * manage the memory for the xsave area.
1317  */
1318 void
fp_lwp_init(klwp_t * lwp)1319 fp_lwp_init(klwp_t *lwp)
1320 {
1321 	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1322 
1323 	/*
1324 	 * We keep a copy of the pointer in lwp_fpu so that we can restore the
1325 	 * value in forklwp() after we duplicate the parent's LWP state.
1326 	 */
1327 	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
1328 	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
1329 	fp->fpu_signal = NULL;
1330 
1331 	if (fp_save_mech == FP_XSAVE) {
1332 		/*
1333 		 *
1334 		 * We bzero since the fpinit() code path will only
1335 		 * partially initialize the xsave area using avx_inital.
1336 		 */
1337 		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
1338 		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
1339 	}
1340 }
1341 
/*
 * Free the FPU save area (and any signal-copy area) for an lwp that is being
 * torn down. Clears both lwp_fpu and the kfpu_u pointer so a stale pointer
 * cannot be reused.
 */
void
fp_lwp_cleanup(klwp_t *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
		kmem_cache_free(fpsave_cachep,
		    fp->fpu_regs.kfpu_u.kfpu_generic);
		lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
	}

	/* The signal-spill copy, if any, comes from the same cache. */
	if (fp->fpu_signal != NULL) {
		kmem_cache_free(fpsave_cachep, fp->fpu_signal);
		fp->fpu_signal = NULL;
	}
}
1358 
1359 /*
1360  * Called during the process of forklwp(). The kfpu_u pointer will have been
1361  * overwritten while copying the parent's LWP structure. We have a valid copy
1362  * stashed in the child's lwp_fpu which we use to restore the correct value.
1363  */
1364 void
fp_lwp_dup(klwp_t * lwp)1365 fp_lwp_dup(klwp_t *lwp)
1366 {
1367 	void *xp = lwp->lwp_fpu;
1368 	size_t sz;
1369 
1370 	switch (fp_save_mech) {
1371 	case FP_FXSAVE:
1372 		sz = sizeof (struct fxsave_state);
1373 		break;
1374 	case FP_XSAVE:
1375 		sz = cpuid_get_xsave_size();
1376 		break;
1377 	default:
1378 		panic("Invalid fp_save_mech");
1379 		/*NOTREACHED*/
1380 	}
1381 
1382 	/* copy the parent's values into the new lwp's struct */
1383 	bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
1384 	/* now restore the pointer */
1385 	lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
1386 	/* Ensure that we don't inherit our parent's signal state */
1387 	lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
1388 }
1389 
1390 /*
1391  * Handle a processor extension error fault
1392  * Returns non zero for error.
1393  */
1394 
1395 /*ARGSUSED*/
1396 int
fpexterrflt(struct regs * rp)1397 fpexterrflt(struct regs *rp)
1398 {
1399 	uint32_t fpcw, fpsw;
1400 	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1401 
1402 	ASSERT(fp_kind != FP_NO);
1403 
1404 	/*
1405 	 * Now we can enable the interrupts.
1406 	 * (NOTE: x87 fp exceptions come thru interrupt gate)
1407 	 */
1408 	sti();
1409 
1410 	if (!fpu_exists)
1411 		return (FPE_FLTINV);
1412 
1413 	/*
1414 	 * Do an unconditional save of the FP state.  If it's dirty (TS=0),
1415 	 * it'll be saved into the fpu context area passed in (that of the
1416 	 * current thread).  If it's not dirty (it may not be, due to
1417 	 * an intervening save due to a context switch between the sti(),
1418 	 * above and here, then it's safe to just use the stored values in
1419 	 * the context save area to determine the cause of the fault.
1420 	 */
1421 	fp_save(fp);
1422 
1423 	/* clear exception flags in saved state, as if by fnclex */
1424 	switch (fp_save_mech) {
1425 	case FP_FXSAVE:
1426 		fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1427 		fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
1428 		fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
1429 		break;
1430 
1431 	case FP_XSAVE:
1432 		fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1433 		fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
1434 		fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
1435 		/*
1436 		 * Always set LEGACY_FP as it may have been cleared by XSAVE
1437 		 * instruction
1438 		 */
1439 		fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1440 		    XFEATURE_LEGACY_FP;
1441 		break;
1442 	default:
1443 		panic("Invalid fp_save_mech");
1444 		/*NOTREACHED*/
1445 	}
1446 
1447 	fp->fpu_regs.kfpu_status = fpsw;
1448 
1449 	if ((fpsw & FPS_ES) == 0)
1450 		return (0);		/* No exception */
1451 
1452 	/*
1453 	 * "and" the exception flags with the complement of the mask
1454 	 * bits to determine which exception occurred
1455 	 */
1456 	return (fpe_sicode(fpsw & ~fpcw & 0x3f));
1457 }
1458 
1459 /*
1460  * Handle an SSE/SSE2 precise exception.
1461  * Returns a non-zero sicode for error.
1462  */
1463 /*ARGSUSED*/
1464 int
fpsimderrflt(struct regs * rp)1465 fpsimderrflt(struct regs *rp)
1466 {
1467 	uint32_t mxcsr, xmask;
1468 	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1469 
1470 	ASSERT(fp_kind & __FP_SSE);
1471 
1472 	/*
1473 	 * NOTE: Interrupts are disabled during execution of this
1474 	 * function.  They are enabled by the caller in trap.c.
1475 	 */
1476 
1477 	/*
1478 	 * The only way we could have gotten here if there is no FP unit
1479 	 * is via a user executing an INT $19 instruction, so there is
1480 	 * no fault in that case.
1481 	 */
1482 	if (!fpu_exists)
1483 		return (0);
1484 
1485 	/*
1486 	 * Do an unconditional save of the FP state.  If it's dirty (TS=0),
1487 	 * it'll be saved into the fpu context area passed in (that of the
1488 	 * current thread).  If it's not dirty, then it's safe to just use
1489 	 * the stored values in the context save area to determine the
1490 	 * cause of the fault.
1491 	 */
1492 	fp_save(fp);		/* save the FPU state */
1493 
1494 	if (fp_save_mech == FP_XSAVE) {
1495 		mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
1496 		fp->fpu_regs.kfpu_status =
1497 		    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
1498 	} else {
1499 		mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
1500 		fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
1501 	}
1502 	fp->fpu_regs.kfpu_xstatus = mxcsr;
1503 
1504 	/*
1505 	 * compute the mask that determines which conditions can cause
1506 	 * a #xm exception, and use this to clean the status bits so that
1507 	 * we can identify the true cause of this one.
1508 	 */
1509 	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
1510 	return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
1511 }
1512 
1513 /*
1514  * In the unlikely event that someone is relying on this subcode being
1515  * FPE_FLTILL for denormalize exceptions, it can always be patched back
1516  * again to restore old behaviour.
1517  */
1518 int fpe_fltden = FPE_FLTDEN;
1519 
1520 /*
1521  * Map from the FPU status word to the FP exception si_code.
1522  */
1523 static int
fpe_sicode(uint_t sw)1524 fpe_sicode(uint_t sw)
1525 {
1526 	if (sw & FPS_IE)
1527 		return (FPE_FLTINV);
1528 	if (sw & FPS_ZE)
1529 		return (FPE_FLTDIV);
1530 	if (sw & FPS_DE)
1531 		return (fpe_fltden);
1532 	if (sw & FPS_OE)
1533 		return (FPE_FLTOVF);
1534 	if (sw & FPS_UE)
1535 		return (FPE_FLTUND);
1536 	if (sw & FPS_PE)
1537 		return (FPE_FLTRES);
1538 	return (FPE_FLTINV);	/* default si_code for other exceptions */
1539 }
1540 
1541 /*
1542  * Map from the SSE status word to the FP exception si_code.
1543  */
1544 static int
fpe_simd_sicode(uint_t sw)1545 fpe_simd_sicode(uint_t sw)
1546 {
1547 	if (sw & SSE_IE)
1548 		return (FPE_FLTINV);
1549 	if (sw & SSE_ZE)
1550 		return (FPE_FLTDIV);
1551 	if (sw & SSE_DE)
1552 		return (FPE_FLTDEN);
1553 	if (sw & SSE_OE)
1554 		return (FPE_FLTOVF);
1555 	if (sw & SSE_UE)
1556 		return (FPE_FLTUND);
1557 	if (sw & SSE_PE)
1558 		return (FPE_FLTRES);
1559 	return (FPE_FLTINV);	/* default si_code for other exceptions */
1560 }
1561 
1562 /*
1563  * This routine is invoked as part of libc's __fpstart implementation
1564  * via sysi86(2).
1565  *
1566  * It may be called -before- any context has been assigned in which case
1567  * we try and avoid touching the hardware.  Or it may be invoked well
1568  * after the context has been assigned and fiddled with, in which case
1569  * just tweak it directly.
1570  */
1571 void
fpsetcw(uint16_t fcw,uint32_t mxcsr)1572 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1573 {
1574 	struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1575 	struct fxsave_state *fx;
1576 
1577 	if (!fpu_exists || fp_kind == FP_NO)
1578 		return;
1579 
1580 	if ((fp->fpu_flags & FPU_EN) == 0) {
1581 		if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1582 			/*
1583 			 * Common case.  Floating point unit not yet
1584 			 * enabled, and kernel already intends to initialize
1585 			 * the hardware the way the caller wants.
1586 			 */
1587 			return;
1588 		}
1589 		/*
1590 		 * Hmm.  Userland wants a different default.
1591 		 * Do a fake "first trap" to establish the context, then
1592 		 * handle as if we already had a context before we came in.
1593 		 */
1594 		kpreempt_disable();
1595 		fp_seed();
1596 		kpreempt_enable();
1597 	}
1598 
1599 	/*
1600 	 * Ensure that the current hardware state is flushed back to the
1601 	 * pcb, then modify that copy.  Next use of the fp will
1602 	 * restore the context.
1603 	 */
1604 	fp_save(fp);
1605 
1606 	switch (fp_save_mech) {
1607 	case FP_FXSAVE:
1608 		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1609 		fx->fx_fcw = fcw;
1610 		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1611 		break;
1612 
1613 	case FP_XSAVE:
1614 		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1615 		fx->fx_fcw = fcw;
1616 		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1617 		/*
1618 		 * Always set LEGACY_FP as it may have been cleared by XSAVE
1619 		 * instruction
1620 		 */
1621 		fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
1622 		    XFEATURE_LEGACY_FP;
1623 		break;
1624 	default:
1625 		panic("Invalid fp_save_mech");
1626 		/*NOTREACHED*/
1627 	}
1628 }
1629 
/*
 * Initialize a kernel FPU state (kfpu_state_t) to the default initial FPU
 * image for the configured save mechanism, and mark the state initialized
 * and valid so the first kernel_fpu_ctx_restore() can load it.
 */
static void
kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
{
	struct xsave_state *xs;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		kfpu->kfpu_ctx.fpu_xsave_mask = 0;
		break;
	case FP_XSAVE:
		/*
		 * avx_initial only covers the leading portion of the xsave
		 * area, so zero the whole area first.
		 */
		xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
		bzero(xs, cpuid_get_xsave_size());
		bcopy(&avx_initial, xs, sizeof (*xs));
		xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
		break;
	default:
		panic("invalid fp_save_mech");
	}

	/*
	 * Set the corresponding flags that the system expects on the FPU state
	 * to indicate that this is our state. The FPU_EN flag is required to
	 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
	 * not set below as it represents that this state is being suppressed
	 * by the kernel.
	 */
	kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
	kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
}
1662 
1663 kfpu_state_t *
kernel_fpu_alloc(int kmflags)1664 kernel_fpu_alloc(int kmflags)
1665 {
1666 	kfpu_state_t *kfpu;
1667 
1668 	if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1669 		return (NULL);
1670 	}
1671 
1672 	kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1673 	    kmem_cache_alloc(fpsave_cachep, kmflags);
1674 	if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1675 		kmem_free(kfpu, sizeof (kfpu_state_t));
1676 		return (NULL);
1677 	}
1678 
1679 	kernel_fpu_fpstate_init(kfpu);
1680 
1681 	return (kfpu);
1682 }
1683 
1684 void
kernel_fpu_free(kfpu_state_t * kfpu)1685 kernel_fpu_free(kfpu_state_t *kfpu)
1686 {
1687 	kmem_cache_free(fpsave_cachep,
1688 	    kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1689 	kmem_free(kfpu, sizeof (kfpu_state_t));
1690 }
1691 
/*
 * ctxop save handler for kernel FPU usage: called when a thread that holds
 * the FPU via kernel_fpu_begin() is switched off-cpu.  Saves the live FPU
 * contents into either the lwp's pcb_fpu (NULL arg) or the supplied
 * kfpu_state_t, and marks the saved copy FPU_VALID.
 */
static void
kernel_fpu_ctx_save(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP and
		 * no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		fp_save(pf);
	} else {
		pf = &kfpu->kfpu_ctx;

		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		/*
		 * Note, we can't use fp_save because it assumes that we're
		 * saving to the thread's PCB and not somewhere else. Because
		 * this is a different FPU context, we instead have to do this
		 * ourselves.
		 */
		switch (fp_save_mech) {
		case FP_FXSAVE:
			fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
			break;
		case FP_XSAVE:
			xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
			break;
		default:
			panic("Invalid fp_save_mech");
		}

		/*
		 * Because we have saved context here, our save state is no
		 * longer valid and therefore needs to be reinitialized.
		 */
		kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
	}

	/* The saved image is now the authoritative copy. */
	pf->fpu_flags |= FPU_VALID;

	/*
	 * Clear KFPU flag. This allows swtch to check for improper kernel
	 * usage of the FPU (i.e. switching to a new thread while the old
	 * thread was in the kernel and using the FPU, but did not perform a
	 * context save).
	 */
	curthread->t_flag &= ~T_KFPU;
}
1749 
/*
 * ctxop restore handler for kernel FPU usage: called when a thread that
 * holds the FPU via kernel_fpu_begin() is switched back on-cpu (and also
 * directly from kernel_fpu_begin() to load the initial kfpu state).
 * Loads the saved state back into the hardware and re-flags the thread
 * as a kernel FPU user.
 */
static void
kernel_fpu_ctx_restore(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP and
		 * no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	} else {
		pf = &kfpu->kfpu_ctx;

		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	}

	fp_restore(pf);
	curthread->t_flag |= T_KFPU;
}
1775 
1776 /*
1777  * Validate that the thread is not switching off-cpu while actively using the
1778  * FPU within the kernel.
1779  */
1780 void
kernel_fpu_no_swtch(void)1781 kernel_fpu_no_swtch(void)
1782 {
1783 	if ((curthread->t_flag & T_KFPU) != 0) {
1784 		panic("curthread swtch-ing while the kernel is using the FPU");
1785 	}
1786 }
1787 
/*
 * Context-switch operations installed by kernel_fpu_begin() to save and
 * restore kernel-held FPU state across involuntary context switches.
 */
static const struct ctxop_template kfpu_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= kernel_fpu_ctx_save,
	.ct_restore	= kernel_fpu_ctx_restore,
};
1793 
/*
 * Begin kernel-level use of the FPU on the calling thread.  Depending on
 * flags, the kernel context is held in the caller-supplied kfpu, in the
 * lwp's pcb_fpu (KFPU_USE_LWP, for system threads), or nowhere at all
 * (KFPU_NO_STATE, in which case the caller must hold kpreempt_disable()
 * for the duration).  Must be paired with kernel_fpu_end() with the same
 * kfpu/flags; nesting is not permitted (T_KFPU panic below).
 */
void
kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
{
	klwp_t *pl = curthread->t_lwp;
	struct ctxop *ctx;

	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread attempting to nest kernel FPU states");
	}

	/* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
	ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
	    (KFPU_USE_LWP | KFPU_NO_STATE));

	if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
		/*
		 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
		 * hold our kernel FPU context, we depend on the caller doing
		 * kpreempt_disable for the duration of our FPU usage. This
		 * should only be done for very short periods of time.
		 */
		ASSERT(curthread->t_preempt > 0);
		ASSERT(kfpu == NULL);

		if (pl != NULL) {
			/*
			 * We might have already saved once so FPU_VALID could
			 * be set. This is handled in fp_save.
			 */
			fp_save(&pl->lwp_pcb.pcb_fpu);
			pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
		}

		curthread->t_flag |= T_KFPU;

		/* Always restore the fpu to the initial state. */
		fpinit();

		return;
	}

	/*
	 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
	 */

	if ((flags & KFPU_USE_LWP) == 0) {
		if (kfpu->kfpu_curthread != NULL)
			panic("attempting to reuse kernel FPU state at %p when "
			    "another thread already is using", kfpu);

		if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
			kernel_fpu_fpstate_init(kfpu);

		kfpu->kfpu_curthread = curthread;
	}

	/*
	 * Not all threads may have an active LWP. If they do and we're not
	 * going to re-use the LWP, then we should go ahead and save the state.
	 * We must also note that the fpu is now being used by the kernel and
	 * therefore we do not want to manage the fpu state via the user-level
	 * thread's context handlers.
	 *
	 * We might have already saved once (due to a prior use of the kernel
	 * FPU or another code path) so FPU_VALID could be set. This is handled
	 * by fp_save, as is the FPU_EN check.
	 */
	ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
	kpreempt_disable();
	if (pl != NULL) {
		if ((flags & KFPU_USE_LWP) == 0)
			fp_save(&pl->lwp_pcb.pcb_fpu);
		pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
	}

	/*
	 * Set the context operations for kernel FPU usage.  Because kernel FPU
	 * setup and ctxop attachment needs to happen under the protection of
	 * kpreempt_disable(), we allocate the ctxop outside the guard so its
	 * sleeping allocation will not cause a voluntary swtch().  This allows
	 * the rest of the initialization to proceed, ensuring valid state for
	 * the ctxop handlers.
	 */
	ctxop_attach(curthread, ctx);
	curthread->t_flag |= T_KFPU;

	if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
		/*
		 * For pure kernel threads with an LWP, we can use the LWP's
		 * pcb_fpu to save/restore context.
		 */
		fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;

		VERIFY(curthread->t_procp->p_flag & SSYS);
		VERIFY(kfpu == NULL);
		ASSERT((pf->fpu_flags & FPU_EN) == 0);

		/* Always restore the fpu to the initial state. */
		if (fp_save_mech == FP_XSAVE)
			pf->fpu_xsave_mask = XFEATURE_FP_ALL;
		fpinit();
		pf->fpu_flags = FPU_EN | FPU_KERNEL;
	} else {
		/* initialize the kfpu state */
		kernel_fpu_ctx_restore(kfpu);
	}
	kpreempt_enable();
}
1902 
/*
 * End kernel-level use of the FPU started by kernel_fpu_begin().  Must be
 * called with the same kfpu/flags as the matching begin.  See the long
 * comment below for the subtle ordering constraints around %cr0.TS,
 * preemption, and the user-level context handlers.
 */
void
kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
{
	if ((curthread->t_flag & T_KFPU) == 0) {
		panic("curthread attempting to clear kernel FPU state "
		    "without using it");
	}

	/*
	 * General comments on why the rest of this function is structured the
	 * way it is. Be aware that there is a lot of subtlety here.
	 *
	 * If a user-level thread ever uses the fpu while in the kernel, then
	 * we cannot call fpdisable since that does STTS. That will set the
	 * ts bit in %cr0 which will cause an exception if anything touches the
	 * fpu. However, the user-level context switch handler (fpsave_ctxt)
	 * needs to access the fpu to save the registers into the pcb.
	 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
	 * fprestore_ctxt when the thread context switched onto the CPU.
	 *
	 * Calling fpdisable only effects the current CPU's %cr0 register.
	 *
	 * During ctxop_remove and kpreempt_enable, we can voluntarily context
	 * switch, so the CPU we were on when we entered this function might
	 * not be the same one we're on when we return from ctxop_remove or end
	 * the function. Note there can be user-level context switch handlers
	 * still installed if this is a user-level thread.
	 *
	 * We also must be careful in the unlikely chance we're running in an
	 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
	 * incorrectly for the "real" thread to resume on this CPU.
	 */

	if ((flags & KFPU_NO_STATE) == 0) {
		kpreempt_disable();
	} else {
		/* KFPU_NO_STATE callers held preemption across the begin/end */
		ASSERT(curthread->t_preempt > 0);
	}

	curthread->t_flag &= ~T_KFPU;

	/*
	 * When we are ending things, we explicitly don't save the current
	 * kernel FPU state back to the temporary state. The kfpu API is not
	 * intended to be a permanent save location.
	 *
	 * If this is a user-level thread and we were to context switch
	 * before returning to user-land, fpsave_ctxt will be a no-op since we
	 * already saved the user-level FPU state the first time we run
	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
	 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
	 * fprestore_ctxt will be done in sys_rtt_common when the thread
	 * finally returns to user-land.
	 */

	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
	    curthread->t_intr == NULL) {
		/*
		 * A kernel thread which is not an interrupt thread, so we
		 * STTS now.
		 */
		fpdisable();
	}

	if ((flags & KFPU_NO_STATE) == 0) {
		ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);

		if (kfpu != NULL) {
			if (kfpu->kfpu_curthread != curthread) {
				panic("attempting to end kernel FPU state "
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	/*
	 * Hand the FPU back to the user-level machinery: clear FPU_KERNEL,
	 * and for KFPU_USE_LWP also clear the FPU_EN we set in begin.
	 */
	if (curthread->t_lwp != NULL) {
		uint_t f;

		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}
1995 
/*
 * Create the kmem cache (fpsave_cachep) from which all FPU save areas are
 * allocated.  The object size and alignment depend on the save mechanism:
 * a fixed 512-byte fxsave image, or the CPU-enumerated xsave area size.
 */
void
fpu_save_cache_init(void)
{
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpsave_cachep = kmem_cache_create("fxsave_cache",
		    sizeof (struct fxsave_state), FXSAVE_ALIGN,
		    NULL, NULL, NULL, NULL, NULL, 0);
		break;
	case FP_XSAVE:
		fpsave_cachep = kmem_cache_create("xsave_cache",
		    cpuid_get_xsave_size(), XSAVE_ALIGN,
		    NULL, NULL, NULL, NULL, NULL, 0);
		break;
	default:
		panic("Invalid fp_save_mech");
	}
}
2014 
2015 /*
2016  * Fill in FPU information that is required by exec.
2017  */
2018 void
fpu_auxv_info(int * typep,size_t * lenp)2019 fpu_auxv_info(int *typep, size_t *lenp)
2020 {
2021 	*typep = fp_elf;
2022 	switch (fp_save_mech) {
2023 	case FP_FXSAVE:
2024 		*lenp = sizeof (struct fxsave_state);
2025 		break;
2026 	case FP_XSAVE:
2027 		*lenp = cpuid_get_xsave_size();
2028 		break;
2029 	default:
2030 		*lenp = 0;
2031 		break;
2032 	}
2033 }
2034 
2035 /*
2036  * This function exists to transform an xsave_state into an fxsave_state. The
2037  * way that we have to do this is nuanced. We assume that callers have already
2038  * handled FPU_EN and thus we only need to consider the xsave_state and its
2039  * component vector itself. This results in the following cases that we need to
2040  * consider:
2041  *
2042  *   o Neither the x87 / XMM state bits are set. We use the hardware default and
2043  *     need to ensure to copy the xsave header.
2044  *   o Both x87 / XMM state bits are set. We can copy everything.
2045  *   o Only the x87 bit is set. We need to copy the x87 state but make the XMM
2046  *     state be in the initial case.
2047  *   o Only the XMM bit is set. The reverse of the above case.
2048  *
2049  * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
2050  * generally the same; however, the default floating point control word is
2051  * different.
2052  *
2053  * Finally, we have the complication of the MXCSR and MCXSR_MASK registers.
2054  * Because we are using xsave and xsaveopt in the kernel right now and not
2055  * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
2056  * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
2057  * is set, we must also come back and copy out the MXCSR register. Sorry, we
2058  * don't make the rules.
2059  */
2060 static void
fpu_xsave_to_fxsave(const struct xsave_state * xsave,struct fxsave_state * fx)2061 fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
2062 {
2063 	const uint64_t comps = xsave->xs_header.xsh_xstate_bv;
2064 
2065 	switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
2066 	case XFEATURE_LEGACY_FP | XFEATURE_SSE:
2067 		bcopy(xsave, fx, sizeof (*fx));
2068 		return;
2069 	case XFEATURE_LEGACY_FP:
2070 		bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
2071 		fx->fx_mxcsr = SSE_MXCSR_INIT;
2072 		fx->fx_mxcsr_mask = 0;
2073 		break;
2074 	case XFEATURE_SSE:
2075 		bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
2076 		    fx_mxcsr));
2077 
2078 		fx->fx_fcw = FPU_CW_INIT_HW;
2079 		fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2080 		fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2081 		bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
2082 		break;
2083 	default:
2084 		bcopy(&sse_initial, fx, sizeof (*fx));
2085 		fx->fx_fcw = FPU_CW_INIT_HW;
2086 		break;
2087 	}
2088 
2089 	/*
2090 	 * Account for the AVX causing MXCSR to be valid.
2091 	 */
2092 	if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
2093 	    (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
2094 		fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
2095 		fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
2096 	}
2097 }
2098 
2099 /*
2100  * This function is designed to answer the question of are we using any xsave
2101  * family of instructions in context switch and therefore we have this state.
2102  * This should still remain true if we are using xsavec or xsaves in the kernel
2103  * in the future.
2104  */
2105 boolean_t
fpu_xsave_enabled(void)2106 fpu_xsave_enabled(void)
2107 {
2108 	return (fp_save_mech == FP_XSAVE);
2109 }
2110 
2111 /*
2112  * The following structure is used to track and manage the programmatic
2113  * construction of /proc and signal stack spilling of xsave information. All
2114  * known xsave types that the kernel supports must be included here.
2115  */
2116 typedef struct xsave_proc_info {
2117 	/*
2118 	 * This matches the /proc xregs type that this data represents. This s
2119 	 * used for /proc only.
2120 	 */
2121 	uint32_t xi_type;
2122 	/*
2123 	 * This indicates the size of the /proc data that we're operating on.
2124 	 * This is only used for /proc.
2125 	 */
2126 	size_t	xi_size;
2127 	/*
2128 	 * This indicates the alignment that we want to have for the member when
2129 	 * we're writing out. This is not used when setting data. This is only
2130 	 * used for /proc.
2131 	 */
2132 	size_t	xi_align;
2133 	/*
2134 	 * This indicates whether this member must always be considered or not.
2135 	 * This is used in both /proc and context/signal handling.
2136 	 */
2137 	bool	xi_always;
2138 	/*
2139 	 * This contains the corresponding bits in the xsave bit vector that
2140 	 * corresponds to this entry. This is used for both /proc and
2141 	 * context/signal handling.
2142 	 */
2143 	uint64_t xi_bits;
2144 	/*
2145 	 * The xi_fill function pointer is used to write out the /proc regset
2146 	 * data (e.g. when a user reads xregs). This is only used for the /proc
2147 	 * handling. The xi_valid function pointer is used instead to validate a
2148 	 * given set of data that we've read in, while the xi_set pointer is
2149 	 * used to actually transform the data in the underlying fpu save area.
2150 	 */
2151 	void	(*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
2152 	    void *);
2153 	bool	(*xi_valid)(model_t, const void *);
2154 	void	(*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
2155 	    uint64_t, const void *);
2156 	/*
2157 	 * The xi_signal_in and xi_signal_out function pointers are used for
2158 	 * extended context and signal handling information. They are used when
2159 	 * reading in data from a ucontext_t and writing it out respectively.
2160 	 * These are only used for context/signal handling.
2161 	 */
2162 	int	(*xi_signal_in)(const struct xsave_proc_info *,
2163 	    const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
2164 	    const uintptr_t);
2165 	int	(*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
2166 	    uc_xsave_t *, const void *fpup, uintptr_t);
2167 } xsave_proc_info_t;
2168 
2169 static bool
fpu_proc_xregs_initial_state(const fpu_ctx_t * fpu,uint64_t feats)2170 fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
2171 {
2172 	const struct xsave_state *xs = fpu->fpu_regs.kfpu_u.kfpu_xs;
2173 
2174 	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
2175 		return (true);
2176 	}
2177 
2178 	return ((xs->xs_header.xsh_xstate_bv & feats) == 0);
2179 }
2180 
2181 static void
fpu_proc_xregs_xcr_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2182 fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2183     void *datap)
2184 {
2185 	prxregset_xcr_t *xcr = datap;
2186 
2187 	xcr->prx_xcr_xcr0 = xsave_bv_all;
2188 }
2189 
/*
 * Fill the PRX_INFO_XSAVE /proc region: the 512-byte legacy fxsave image
 * followed by the xsave header. Unlike other instruction portions, we treat
 * the xsave header and the legacy XMM section together as both are somewhat
 * tied at the instruction hip. Unlike when dealing with other xsave regions
 * like the ymm and zmm components, the initial state here is much more nuanced
 * as it has to match what we actually do in the OS and depends on the
 * components that are present.
 */
static void
fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
    void *datap)
{
	prxregset_xsave_t *prxsave = datap;
	const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
	size_t hdr_off;

	/*
	 * In the x87/XMM case, the no device vs. initial state is different
	 * because the initial state case still wants us to copy the real xsave
	 * header. It's also worth calling out that the actual illumos default
	 * fxsave state is not the same as what Intel documents. The main
	 * difference is in what the x87 FPU control word is. This results in
	 * the following different cases that we need to think about:
	 *
	 *   o FPU_EN is not set. So we use the illumos default.
	 */
	if ((fpu->fpu_flags & FPU_EN) == 0) {
		bcopy(&avx_initial, prxsave, sizeof (*prxsave));
		return;
	}

	/*
	 * Convert all the fxsave region while taking into account the validity
	 * of the xsave bits. The prxregset_xsave_t structure is the same as the
	 * xsave structure in our ABI and Intel designed the xsave header to
	 * begin with the 512-bit fxsave structure.
	 */
	fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);

	/*
	 * Now that we've dealt with the x87 and XMM state, take care of the
	 * header. hdr_off is the offset of the header in both layouts (the
	 * header begins at prx_xsh_xstate_bv, immediately after the legacy
	 * 512-byte area), so one struct xsave_header sized copy moves
	 * xstate_bv, xcomp_bv, and the reserved words together.
	 */
	hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
	bcopy((const void *)((uintptr_t)xsave + hdr_off),
	    (void *)((uintptr_t)prxsave + hdr_off),
	    sizeof (struct xsave_header));
}
2237 
2238 static void
fpu_proc_xregs_std_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2239 fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2240     void *datap)
2241 {
2242 	if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
2243 		size_t size, off;
2244 		const void *xsave_off;
2245 
2246 		cpuid_get_xsave_info(info->xi_bits, &size, &off);
2247 		ASSERT3U(size, ==, info->xi_size);
2248 		xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2249 		    off);
2250 		bcopy(xsave_off, datap, info->xi_size);
2251 	}
2252 }
2253 
2254 /*
2255  * Users are not allowed to actually set the xcr information this way. However,
2256  * to make it easier for someone to just do a read, modify, write, of the xregs
2257  * data, if it is identical, then we will accept it (and do nothing).
2258  */
2259 static bool
fpu_proc_xregs_xcr_valid(model_t model,const void * datap)2260 fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
2261 {
2262 	const prxregset_xcr_t *xcr = datap;
2263 
2264 	return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
2265 	    xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
2266 }
2267 
2268 /*
2269  * To match traditional /proc semantics, we do not error if reserved bits of
2270  * MXCSR are set, they will be masked off when writing data. We do not allow
2271  * someone to indicate that they are asking for compressed xsave data, hence the
2272  * check that prx_xsh_comp_bv is zero. Separately, in fpu_proc_xregs_set() we
2273  * check that each component that was indicated in the xstate_bv is actually
2274  * present.
2275  */
2276 static bool
fpu_proc_xregs_xsave_valid(model_t model,const void * datap)2277 fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
2278 {
2279 	const prxregset_xsave_t *xsave = datap;
2280 	uint64_t rsvd[6] = { 0 };
2281 
2282 	if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
2283 	    xsave->prx_xsh_xcomp_bv != 0) {
2284 		return (false);
2285 	}
2286 
2287 	if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
2288 		return (false);
2289 	}
2290 
2291 	return (true);
2292 }
2293 
2294 /*
2295  * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
2296  * on x86; however, when operating in ILP32, subsets are reserved. We require
2297  * that all reserved portions are set to zero.
2298  */
2299 static bool
fpu_proc_xregs_ymm_valid(model_t model,const void * datap)2300 fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
2301 {
2302 	upad128_t ymm_zero[8];
2303 	const prxregset_ymm_t *ymm = datap;
2304 
2305 	if (model == DATAMODEL_LP64) {
2306 		return (true);
2307 	}
2308 
2309 	bzero(&ymm_zero, sizeof (ymm_zero));
2310 	return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
2311 }
2312 
2313 static bool
fpu_proc_xregs_zmm_valid(model_t model,const void * datap)2314 fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
2315 {
2316 	upad256_t zmm_zero[8];
2317 	const prxregset_zmm_t *zmm = datap;
2318 
2319 	if (model == DATAMODEL_LP64) {
2320 		return (true);
2321 	}
2322 
2323 	bzero(&zmm_zero, sizeof (zmm_zero));
2324 	return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
2325 }
2326 
2327 static bool
fpu_proc_xregs_hi_zmm_valid(model_t model,const void * datap)2328 fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
2329 {
2330 	prxregset_hi_zmm_t hi_zmm_zero;
2331 	const prxregset_hi_zmm_t *hi_zmm = datap;
2332 
2333 	if (model == DATAMODEL_LP64) {
2334 		return (true);
2335 	}
2336 
2337 	bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
2338 	return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
2339 }
2340 
/*
 * The xsave state consists of the first 512 bytes of the XMM state and then the
 * xsave header itself. Because of the xsave header, this structure is marked
 * with xi_always, so we must always process and consider it.
 *
 * Semantically if either of the bits around SSE / x87 is set, then we will copy
 * the entire thing. This may mean that we end up copying a region that is not
 * valid into the save area; however, that should be OK as we still have the
 * specific bit flags that indicate what we should consider or not.
 *
 * There is one additional wrinkle we need to consider and honor here. The CPU
 * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
 * anything else. So if this is set and we do not have a valid x87/XMM bits
 * set then we will set the MXCSR to its default state in case the processor
 * tries to load it. For reference see:
 *
 *   o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
 *   o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
 *
 * Note, the behavior around this changes depending on whether using the
 * compressed xrstor or not. We are not, but it's worth being aware of. We do
 * not worry about MXCSR_MASK because the instructions ignore it.
 */
static void
fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
    uint64_t xsave_bv, const void *datap)
{
	const struct xsave_state *src_xs = datap;
	struct xsave_state *targ_xs = fpu->fpu_regs.kfpu_u.kfpu_xs;

	/* x87 and/or SSE indicated: take the caller's whole fxsave image. */
	if ((xsave_bv & info->xi_bits) != 0) {
		bcopy(&src_xs->xs_fxsave, &targ_xs->xs_fxsave,
		    sizeof (struct fxsave_state));
	} else if ((xsave_bv & XFEATURE_AVX) != 0) {
		/*
		 * No x87/XMM state, but AVX is present: pre-load the default
		 * MXCSR in case xrstor loads it anyway (see block comment).
		 */
		targ_xs->xs_fxsave.fx_mxcsr = SSE_MXCSR_INIT;
	}

	/*
	 * The header copy must happen regardless of which branch was taken
	 * above, and the reserved-bit mask of MXCSR must come last so it
	 * applies to whichever value ended up in the save area.
	 */
	bcopy(&src_xs->xs_header, &targ_xs->xs_header,
	    sizeof (struct xsave_header));
	targ_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
}
2382 
2383 static void
fpu_proc_xregs_std_set(fpu_ctx_t * fpu,const xsave_proc_info_t * info,uint64_t xsave_bv,const void * datap)2384 fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2385     uint64_t xsave_bv, const void *datap)
2386 {
2387 	size_t size, off;
2388 	void *xsave_off;
2389 
2390 	cpuid_get_xsave_info(info->xi_bits, &size, &off);
2391 	xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2392 	    off);
2393 	bcopy(datap, xsave_off, size);
2394 }
2395 
/*
 * Dealing with XMM data is a little more annoying in signal context. If UC_FPU
 * is set, the ucontext_t's fpregset_t contains a copy of the XMM region. That
 * must take priority over an XMM region that showed up in the uc_xsave_t data.
 * In the signal copyout code we do not save XMM region in the uc_xsave_t or set
 * it as a present component because of it being kept in the fpregset_t. Because
 * of this behavior, if we find the XMM (or x87) state bits present, we treat
 * that as an error.
 *
 * The system has always gone through and cleaned up the reserved bits in the
 * fxsave state when someone calls setcontext(). Therefore we need to do the
 * same thing which is why you see the masking of the mxcsr below.
 *
 * Finally, there is one last wrinkle here that we need to consider. The
 * fpregset_t has two private words which cache the status/exception
 * information. Therefore, we well...  cheat. Intel has left bytes 464 (0x1d0)
 * through 511 (0x1ff) available for us to do what we want. So we will pass this
 * through that for the moment to help us pass this state around without too
 * much extra allocation.
 */
static int
fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
    const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
    const uintptr_t max_udata)
{
	struct xsave_state *xsave = fpup;

	/*
	 * x87/XMM state never travels in the uc_xsave_t stream (it lives in
	 * the fpregset_t), so its bits being set in the copied-in bit-vector
	 * means the data is corrupt or hand-rolled incorrectly.
	 */
	if ((ucx->ucx_bv & info->xi_bits) != 0) {
		return (EINVAL);
	}

	if ((kuc->uc_flags & UC_FPU) != 0) {
		bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
		    sizeof (struct fxsave_state));
		/*
		 * Stash the fpregset_t's software-only status/xstatus words in
		 * the Intel-reserved tail of the fxsave area (the "cheat"
		 * described in the block comment above) so they travel along
		 * with this state.
		 */
		xsave->xs_fxsave.__fx_ign2[3]._l[0] =
		    kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
		xsave->xs_fxsave.__fx_ign2[3]._l[1] =
		    kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
		/* Clear reserved MXCSR bits, as setcontext() always has. */
		xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
		xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
	}

	return (0);
}
2440 
2441 static int
fpu_signal_copyin_std(const xsave_proc_info_t * info,const ucontext_t * kuc,const uc_xsave_t * ucx,void * fpup,uintptr_t * udatap,const uintptr_t max_udata)2442 fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
2443     const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2444     const uintptr_t max_udata)
2445 {
2446 	size_t len, xsave_off;
2447 	void *copy_to;
2448 	struct xsave_state *xsave = fpup;
2449 
2450 	cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2451 	if (*udatap + len > max_udata) {
2452 		return (EOVERFLOW);
2453 	}
2454 
2455 	copy_to = (void *)((uintptr_t)fpup + xsave_off);
2456 	if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
2457 		return (EFAULT);
2458 	}
2459 
2460 	xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2461 	*udatap = *udatap + len;
2462 
2463 	return (0);
2464 }
2465 
2466 static int
fpu_signal_copyout_std(const xsave_proc_info_t * info,fpu_copyout_f copyfunc,uc_xsave_t * ucx,const void * fpup,uintptr_t udatap)2467 fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
2468     uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
2469 {
2470 	size_t len, xsave_off;
2471 	const void *copy_from;
2472 	void *copy_to;
2473 	int ret;
2474 
2475 	cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2476 	copy_from = (void *)(uintptr_t)fpup + xsave_off;
2477 	copy_to = (void *)(udatap + ucx->ucx_len);
2478 
2479 	ret = copyfunc(copy_from, copy_to, len);
2480 	if (ret != 0) {
2481 		return (ret);
2482 	}
2483 
2484 	ucx->ucx_len += len;
2485 	ucx->ucx_bv |= info->xi_bits;
2486 	return (0);
2487 }
2488 
/*
 * This table contains information about the extended FPU states and synthetic
 * information we create for /proc, the ucontext_t, and signal handling. The
 * definition of the xsave_proc_info_t describes how each member is used.
 *
 * In general, this table is expected to be in the order of the xsave data
 * structure itself. Synthetic elements that we create can go anywhere and new
 * ones should be inserted at the end. This structure is walked in order to
 * produce the /proc and signal handling logic, so changing the order is
 * meaningful for those and should not be done lightly.
 */
static const xsave_proc_info_t fpu_xsave_info[] = { {
	/*
	 * Synthetic entry that reports %xcr0 (and the xfd placeholder). It is
	 * read-only from the user's perspective, hence no xi_set; the valid
	 * function only accepts an unmodified copy of what we report.
	 */
	.xi_type = PRX_INFO_XCR,
	.xi_size = sizeof (prxregset_xcr_t),
	.xi_align = alignof (prxregset_xcr_t),
	.xi_always = true,
	.xi_bits = 0,
	.xi_fill = fpu_proc_xregs_xcr_fill,
	.xi_valid = fpu_proc_xregs_xcr_valid
}, {
	/*
	 * The XSAVE entry covers both the xsave header and the %xmm registers.
	 * Note, there is no signal copyout information for the %xmm registers
	 * because it is expected that that data is already in the fpregset_t.
	 */
	.xi_type = PRX_INFO_XSAVE,
	.xi_size = sizeof (prxregset_xsave_t),
	.xi_align = FPU_ALIGN_XMM,
	.xi_always = true,
	.xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
	.xi_fill = fpu_proc_xregs_xsave_fill,
	.xi_set = fpu_proc_xregs_xsave_set,
	.xi_valid = fpu_proc_xregs_xsave_valid,
	.xi_signal_in = fpu_signal_copyin_xmm
}, {
	.xi_type = PRX_INFO_YMM,
	.xi_size = sizeof (prxregset_ymm_t),
	.xi_align = FPU_ALIGN_YMM,
	.xi_always = false,
	.xi_bits = XFEATURE_AVX,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_valid = fpu_proc_xregs_ymm_valid,
	.xi_signal_out = fpu_signal_copyout_std
}, {
	/*
	 * There is no /proc validation function for the mask registers because
	 * they are the same in ILP32 / LP64 and there is nothing for us to
	 * actually validate.
	 */
	.xi_type = PRX_INFO_OPMASK,
	.xi_size = sizeof (prxregset_opmask_t),
	.xi_align = alignof (prxregset_opmask_t),
	.xi_always = false,
	.xi_bits = XFEATURE_AVX512_OPMASK,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_signal_out = fpu_signal_copyout_std
}, {
	.xi_type = PRX_INFO_ZMM,
	.xi_size = sizeof (prxregset_zmm_t),
	.xi_align = FPU_ALIGN_ZMM,
	.xi_always = false,
	.xi_bits = XFEATURE_AVX512_ZMM,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_valid = fpu_proc_xregs_zmm_valid,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_signal_out = fpu_signal_copyout_std
}, {
	.xi_type = PRX_INFO_HI_ZMM,
	.xi_size = sizeof (prxregset_hi_zmm_t),
	.xi_align = FPU_ALIGN_ZMM,
	.xi_always = false,
	.xi_bits = XFEATURE_AVX512_HI_ZMM,
	.xi_fill = fpu_proc_xregs_std_fill,
	.xi_set = fpu_proc_xregs_std_set,
	.xi_valid = fpu_proc_xregs_hi_zmm_valid,
	.xi_signal_in = fpu_signal_copyin_std,
	.xi_signal_out = fpu_signal_copyout_std
} };
2572 
2573 static bool
fpu_proc_xregs_include(const xsave_proc_info_t * infop)2574 fpu_proc_xregs_include(const xsave_proc_info_t *infop)
2575 {
2576 	return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
2577 }
2578 
/*
 * Compute the number of /proc xregs information entries (*ninfop), the total
 * size of the xregs data (*sizep), and the offset at which component data
 * begins (*dstart). Any output pointer may be NULL. The layout computed here
 * must match byte-for-byte the one walked by fpu_proc_xregs_get().
 */
void
fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
    uint32_t *dstart)
{
	size_t ret = sizeof (prxregset_hdr_t);
	uint32_t ninfo = 0;

	ASSERT(fpu_xsave_enabled());

	/*
	 * Right now the set of flags that are enabled in the FPU is global.
	 * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the
	 * actual things that might show up and we care about are all about what
	 * is set up in %xcr0 which is stored in the global xsave_bv_all. If we
	 * move to per-process FPU enablement which is likely to come with AMX,
	 * then this will need the proc_t to look at, hence why we've set things
	 * up with the unused variable above.
	 *
	 * We take two passes through the array. The first is just to count up
	 * how many informational entries we need.
	 */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
			continue;
		ninfo++;
	}

	ASSERT3U(ninfo, >, 0);
	ret += sizeof (prxregset_info_t) * ninfo;

	/* Second pass: lay out each included component, honoring xi_align. */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		size_t curphase;
		if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
			continue;

		curphase = ret % fpu_xsave_info[i].xi_align;
		if (ret < fpu_xsave_info[i].xi_align) {
			ret = fpu_xsave_info[i].xi_align;
		} else if (curphase != 0) {
			/*
			 * NOTE(review): adding the current phase does not in
			 * general round ret up to an xi_align boundary (that
			 * would be xi_align - curphase). fpu_proc_xregs_get()
			 * performs the identical calculation, so the sizes and
			 * offsets the two produce stay consistent with each
			 * other; confirm whether the resulting alignment is
			 * what xi_align intends before changing either copy.
			 */
			ret += curphase;
		}

		/* dstart is where the first component's data begins. */
		if (i == 0 && dstart != NULL) {
			*dstart = ret;
		}

		ret += fpu_xsave_info[i].xi_size;
	}

	VERIFY3U(ret, <=, UINT32_MAX);
	if (sizep != NULL) {
		*sizep = ret;
	}

	if (ninfop != NULL) {
		*ninfop = ninfo;
	}
}
2637 
/*
 * This function supports /proc. Because /proc does not have a process locked
 * while processing a PCSXREG, this tries to establish an upper bound that we
 * will validate later in fpu_proc_xregs_set(). We basically say that if you
 * take the maximum xsave size and add 4 KiB (0x1000 -- note that an older
 * version of this comment said 1 KiB, which did not match the code) that is a
 * good enough approximation for the maximum size. The padding is us basically
 * trying to rationalize the overhead of our structures that we're adding,
 * while being cognisant of differing alignments and the fact that the full
 * xsave size is in some cases (when supervisor states or features we don't
 * support are present) going to be larger than we would need for this.
 */
size_t
fpu_proc_xregs_max_size(void)
{
	VERIFY(fpu_xsave_enabled());
	return (cpuid_get_xsave_size() + 0x1000);
}
2655 
/*
 * This function supports /proc. In particular, it's meant to perform the
 * following:
 *
 *  o Potentially save the current thread's registers.
 *  o Write out the x86 xsave /proc xregs format data from the xsave data we
 *    actually have. Note, this can be a little weird for cases where the FPU is
 *    not actually enabled, which happens for system processes.
 *
 * The caller must supply a buffer of at least the size that
 * fpu_proc_xregs_info() reports; the layout written here mirrors the one
 * computed there.
 */
void
fpu_proc_xregs_get(klwp_t *lwp, void *buf)
{
	uint32_t size, ninfo, curinfo, dstart;
	fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
	prxregset_hdr_t *hdr = buf;

	ASSERT(fpu_xsave_enabled());
	fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);

	/*
	 * Before we get going, defensively zero out all the data buffer so that
	 * the rest of the fill functions can assume a specific base.
	 */
	bzero(buf, size);

	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * This case suggests that thread in question doesn't have a
		 * valid FPU save state which should only happen when it is on
		 * CPU. If this is the case, we must ensure that we save the
		 * current FPU state before proceeding. We also sanity check
		 * several things here before doing this as using /proc on
		 * yourself is always exciting. fp_save() will ensure that the
		 * thread is flagged to go back to being an eager FPU before
		 * returning back to userland.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_save(fpu);
	}
	kpreempt_enable();

	hdr->pr_type = PR_TYPE_XSAVE;
	hdr->pr_size = size;
	hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
	    hdr->pr_pad[3] = 0;
	hdr->pr_ninfo = ninfo;

	curinfo = 0;
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		void *startp;
		uint32_t phase;

		if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
			continue;

		/*
		 * NOTE(review): this phase adjustment intentionally mirrors
		 * fpu_proc_xregs_info() exactly (including `dstart += phase`
		 * rather than `dstart += xi_align - phase`); the two must
		 * never diverge or the offsets recorded in pr_info would not
		 * match the size/dstart reported there.
		 */
		phase = dstart % fpu_xsave_info[i].xi_align;
		if (dstart < fpu_xsave_info[i].xi_align) {
			ASSERT3U(i, !=, 0);
			dstart = fpu_xsave_info[i].xi_align;
		} else if (phase != 0) {
			ASSERT3U(i, !=, 0);
			dstart += phase;
		}

		hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
		hdr->pr_info[curinfo].pri_flags = 0;
		hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
		hdr->pr_info[curinfo].pri_offset = dstart;

		/* Let the component's fill function write its data. */
		startp = (void *)((uintptr_t)buf + dstart);
		fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
		dstart += fpu_xsave_info[i].xi_size;
		ASSERT3U(curinfo, <=, ninfo);
		curinfo++;
	}
}
2734 
/*
 * We have been asked to set the data in the FPU for a given thread. Our
 * prmachdep code has already validated that the raw semantics of the data that
 * we have are valid (that is the appropriate sizes, offsets, and flags). We now
 * apply additional checking here:
 *
 *   o The xsave structure is present and only valid bits are set.
 *   o If the xsave component bit-vector is set, we have the corresponding proc
 *     info item.
 *   o Read-only items are ignored if and only if they actually match what we
 *     gave the user mostly as a courtesy to simplify things here.
 *   o ILP32 processes which can't support many of the regions are allowed to
 *     have the items here (as we likely gave them to them), but they must be
 *     zero if they are set.
 *
 * We take a first pass through all the data, validating it makes sense for the
 * FPU. Only after that point do we ensure that we have the FPU data in question
 * and then we clobber all the FPU data. Part of the semantics of setting this
 * is that we're setting the entire extended FPU.
 *
 * Returns 0 on success or EINVAL if any part of the incoming data fails
 * validation.
 */
int
fpu_proc_xregs_set(klwp_t *lwp, void *buf)
{
	prxregset_hdr_t *prx = buf;
	model_t model = lwp_getdatamodel(lwp);
	uint64_t bv_found = 0;
	const prxregset_xsave_t *xsave = NULL;
	fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;

	VERIFY(fpu_xsave_enabled());

	/*
	 * First, walk each note info header that we have from the user and
	 * proceed to validate it. The prmachdep code has already validated that
	 * the size, type, and offset information is valid, but it has not
	 * validated the semantic contents of this or if someone is trying to
	 * write something they shouldn't.
	 *
	 * While we walk this, we keep track of where the xsave header is. We
	 * also track all of the bits that we have found along the way so we can
	 * match up and ensure that everything that was set has a corresponding
	 * bit in the xsave bitmap. If we have something in the xsave bitmap,
	 * but not its corresponding data, then that is an error. However, we
	 * allow folks to write data regions without the bit set in the xsave
	 * data to make the read, modify, write process simpler.
	 */
	for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
		const prxregset_info_t *info = &prx->pr_info[i];
		bool found = false;

		for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
			void *data;
			if (info->pri_type != fpu_xsave_info[pt].xi_type)
				continue;

			found = true;
			data = (void *)((uintptr_t)buf + info->pri_offset);
			if (fpu_xsave_info[pt].xi_valid != NULL &&
			    !fpu_xsave_info[pt].xi_valid(model, data)) {
				return (EINVAL);
			}

			/* Remember the xsave region for the checks below. */
			if (info->pri_type == PRX_INFO_XSAVE) {
				xsave = data;
			}
			bv_found |= fpu_xsave_info[pt].xi_bits;
			break;
		}

		/* An info entry whose type we don't recognize is an error. */
		if (!found) {
			return (EINVAL);
		}
	}

	/*
	 * No xsave data, no dice.
	 */
	if (xsave == NULL) {
		return (EINVAL);
	}

	/*
	 * If anything is set in the xsave header that was not found as we
	 * walked structures, then that is an error. The opposite is not true as
	 * discussed above.
	 */
	if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
		return (EINVAL);
	}

	/*
	 * At this point, we consider all the data actually valid. Now we must
	 * set up this information in the save area. If this is our own lwp, we
	 * must disable it first. Otherwise, we expect that it is already valid.
	 * To try to sanitize this, we will defensively zero the entire region
	 * as we are setting everything that will result in here.
	 */
	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * This case suggests that thread in question doesn't have a
		 * valid FPU save state which should only happen when it is on
		 * CPU. If this is the case, we explicitly disable the FPU, but
		 * do not save it before proceeding. We also sanity check
		 * several things here before doing this as using /proc on
		 * yourself is always exciting. Unlike fp_save(), fp_free() does
		 * not signal that an update is required, so we unconditionally
		 * set that for all threads.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_free(fpu);
	}
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
	    cpuid_get_xsave_size());

	/* Second pass: apply each region via its xi_set function. */
	for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
		const prxregset_info_t *info = &prx->pr_info[i];
		bool found = false;

		for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
			const void *data;
			if (info->pri_type != fpu_xsave_info[pt].xi_type)
				continue;

			/*
			 * Check if we have a set function and if we should
			 * include this. We may not if this is something like
			 * PRX_INFO_XCR which is read-only.
			 *
			 * We may not include a given entry as it may not have
			 * been set in the actual xsave state that we have been
			 * asked to restore, in which case to not break the
			 * xsaveopt logic, we must leave it in its initial
			 * state, e.g. zeroed (generally). XMM data initial
			 * state is not zeroed, but is marked with xi_always to
			 * help account for this.
			 */
			found = true;
			if (fpu_xsave_info[pt].xi_set == NULL)
				break;
			if (!fpu_xsave_info[pt].xi_always &&
			    (xsave->prx_xsh_xstate_bv &
			    fpu_xsave_info[pt].xi_bits) !=
			    fpu_xsave_info[pt].xi_bits) {
				break;
			}

			data = (void *)((uintptr_t)buf + info->pri_offset);
			fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
			    xsave->prx_xsh_xstate_bv, data);
		}

		/* Pass one already vetted every type; this cannot fail. */
		VERIFY(found);
	}
	kpreempt_enable();

	return (0);
}
2895 
2896 /*
2897  * To be included in the signal copyout logic we must have a copy function and
2898  * the bit in question must be included. Note, we don't consult xi_always here
2899  * as that is really part of what is always present for xsave logic and
2900  * therefore isn't really pertinent here because of our custom format. See the
2901  * big theory statement for more info.
2902  */
2903 static bool
fpu_signal_include(const xsave_proc_info_t * infop,uint64_t xs_bv)2904 fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
2905 {
2906 	return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
2907 	    infop->xi_signal_out != NULL);
2908 }
2909 
2910 /*
2911  * We need to fill out the xsave related data into the ucontext_t that we've
2912  * been given. We should have a valid user pointer at this point in the uc_xsave
2913  * member. This is much simpler than the copyin that we have. Here are the
2914  * current assumptions:
2915  *
2916  *   o This is being called for the current thread. This is not meant to operate
2917  *     on an arbitrary thread's state.
2918  *   o We cannot assume whether the FPU is valid in the pcb or not. While most
2919  *     callers will have just called getfpregs() which saved the state, don't
2920  *     assume that.
2921  *   o We assume that the user address has the requisite required space for this
2922  *     to be copied out.
2923  *   o We assume that copyfunc() will ensure we are not copying into a kernel
2924  *     address.
2925  *
2926  * For more information on the format of the data, see the 'Signal Handling and
2927  * the ucontext_t' portion of the big theory statement. We copy out all the
2928  * constituent parts and then come back and write out the actual final header
2929  * information.
2930  */
2931 int
fpu_signal_copyout(klwp_t * lwp,uintptr_t uaddr,fpu_copyout_f copyfunc)2932 fpu_signal_copyout(klwp_t *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
2933 {
2934 	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
2935 	uint64_t xs_bv;
2936 	uc_xsave_t ucx;
2937 	int ret;
2938 
2939 	VERIFY3P(curthread, ==, lwptot(lwp));
2940 	VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2941 	VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
2942 
2943 	if (!fpu_xsave_enabled()) {
2944 		return (ENOTSUP);
2945 	}
2946 
2947 	/*
2948 	 * Unlike when we're dealing with /proc, we can unconditionally call
2949 	 * fp_save() because this is always called in the context where the lwp
2950 	 * we're operating on is always the one on CPU (which is what fp_save()
2951 	 * asserts).
2952 	 */
2953 	fp_save(fpu);
2954 
2955 	bzero(&ucx, sizeof (ucx));
2956 	ucx.ucx_vers = UC_XSAVE_VERS;
2957 	ucx.ucx_len += sizeof (uc_xsave_t);
2958 
2959 	xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
2960 	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2961 		const xsave_proc_info_t *info = &fpu_xsave_info[i];
2962 
2963 		if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
2964 			continue;
2965 		ret = info->xi_signal_out(info, copyfunc, &ucx,
2966 		    lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2967 		    uaddr);
2968 		if (ret != 0) {
2969 			kpreempt_enable();
2970 			return (ret);
2971 		}
2972 	}
2973 
2974 	/*
2975 	 * Now that everything has been copied out, we should have an accurate
2976 	 * value in the uc_xsave_t header and we can copy that out at the start
2977 	 * of the user data.
2978 	 */
2979 	ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
2980 	return (ret);
2981 }
2982 
2983 /*
2984  * Here we've been given a ucontext_t which potentially has a user pointer to
2985  * xsave state that we've copied out previously. In this case we need to do the
2986  * following, assuming UC_XSAVE is present:
2987  *
2988  *   o Copy in our header and validate it.
2989  *   o Allocate an fpu context to use as a holding ground for all this data.
2990  *   o If UC_FPU is set, override the xsave structure with the saved XMM state,
2991  *     clear UC_FPU, and make sure that the correct xsave_bv bits are set.
2992  *
2993  * Currently we always allocate the additional state as a holding ground for the
2994  * FPU. What we're copying in may not be valid and we don't want to clobber the
2995  * existing FPU state or deal with merging it until we believe it's reasonable
2996  * enough. The proc_t is here to set us up for when we have per-process settings
2997  * in the extended feature disable MSRs.
2998  */
int
fpu_signal_copyin(klwp_t *lwp, ucontext_t *kuc)
{
	uc_xsave_t ucx;
	uint64_t bv;
	uintptr_t data, max_data;
	void *fpu;
	proc_t *p = lwp->lwp_procp;
	size_t ksize;

	/*
	 * Because this has been opaque filler and the kernel has never
	 * historically looked at it, we don't really care about the uc_xsave
	 * pointer being garbage in the case that the flag is not set. While
	 * this isn't perhaps the most sporting choice in some cases, this is on
	 * the other hand, pragmatic.
	 */
	if ((kuc->uc_flags & UC_XSAVE) != 0) {
		if (kuc->uc_xsave == 0) {
			return (EINVAL);
		}

		if (!fpu_xsave_enabled()) {
			return (ENOTSUP);
		}
	} else {
		return (0);
	}

	/* Pull in the uc_xsave_t header that was previously copied out. */
	if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
	    0) {
		return (EFAULT);
	}

	/*
	 * Validate the header before trusting anything in it: the version
	 * must match, the claimed length must lie between the bare header
	 * size and the kernel's full xsave area size, only feature bits the
	 * kernel supports may be set, and the user range
	 * [uc_xsave, uc_xsave + ucx_len) must fit below the address space
	 * limit (written to also catch arithmetic wrap-around).
	 */
	ksize = cpuid_get_xsave_size();
	if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
	    ucx.ucx_len > ksize ||
	    (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
	    (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
	    (uintptr_t)kuc->uc_xsave) {
		return (EINVAL);
	}

	/*
	 * OK, our goal right now is to recreate a valid xsave_state structure
	 * that we'll ultimately end up having to merge with our existing one in
	 * the FPU save state. The reason we describe this as a merge is to help
	 * future us when we want to retain supervisor state which will never be
	 * part of userland signal state. The design of the userland signal
	 * state is basically to compress it as much as we can. This is done for
	 * two reasons:
	 *
	 *   1) We currently consider this a private interface.
	 *   2) We really want to minimize the actual amount of stack space we
	 *	use as much as possible. Most applications aren't using AVX-512
	 *	right now, so doing our own compression style is worthwhile. If
	 *	libc adopts AVX-512 routines, we may want to change this.
	 *
	 * On the allocation below, our assumption is that if a thread has taken
	 * a signal, then it is likely to take a signal again in the future (or
	 * be shortly headed to its demise). As such, when that happens we will
	 * leave the allocated signal stack around for the process. Most
	 * applications don't allow all threads to take signals, so this should
	 * hopefully help amortize the cost of the allocation.
	 */
	max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
	data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
	bv = ucx.ucx_bv;
	if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
		lwp->lwp_pcb.pcb_fpu.fpu_signal =
		    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
	}
	fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;

	/*
	 * Unconditionally initialize the memory we get in here to ensure that
	 * it is in a reasonable state for ourselves. This ensures that unused
	 * regions are mostly left in their initial state (the main exception
	 * here is the x87/XMM state, but that should be OK). We don't fill in
	 * the initial xsave state as we expect that to happen as part of our
	 * processing.
	 */
	bzero(fpu, ksize);

	/*
	 * Walk every known xsave component. Each one that is either always
	 * processed (xi_always) or present in the header's bit-vector gets a
	 * chance to copy in and validate its own state; handlers advance
	 * 'data' through the user buffer, bounded above by 'max_data'.
	 */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		int ret;
		const xsave_proc_info_t *info = &fpu_xsave_info[i];
		if (!info->xi_always && (info->xi_bits & bv) == 0)
			continue;
		bv &= ~info->xi_bits;

		if (info->xi_signal_in == NULL)
			continue;
		ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
		if (ret != 0) {
			return (ret);
		}
	}
	/* Every bit in the header must have been claimed by some component. */
	ASSERT0(bv);

	/*
	 * As described in the big theory statement section 'Signal Handling and
	 * the ucontext_t', we always remove UC_FPU from here as we've taken
	 * care of reassembling it ourselves. uc_xsave is repointed at the
	 * kernel holding buffer so our caller can complete the restore.
	 */
	kuc->uc_flags &= ~UC_FPU;
	kuc->uc_xsave = (uintptr_t)fpu;

	return (0);
}
3109 
3110 /*
3111  * This determines the size of the signal stack that we need for our custom form
3112  * of the xsave state.
3113  */
3114 size_t
fpu_signal_size(klwp_t * lwp)3115 fpu_signal_size(klwp_t *lwp)
3116 {
3117 	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3118 	size_t len = sizeof (uc_xsave_t);
3119 	uint64_t xs_bv;
3120 
3121 	VERIFY3P(curthread, ==, lwptot(lwp));
3122 	VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3123 	VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3124 
3125 	if (!fpu_xsave_enabled()) {
3126 		return (0);
3127 	}
3128 
3129 	kpreempt_disable();
3130 	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3131 		fp_save(fpu);
3132 	}
3133 
3134 	xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
3135 	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3136 		size_t comp_size;
3137 
3138 		if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
3139 			continue;
3140 
3141 		cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
3142 		    NULL);
3143 		len += comp_size;
3144 	}
3145 
3146 	kpreempt_enable();
3147 	return (len);
3148 }
3149 
3150 /*
3151  * This function is used in service of restorecontext() to set the specified
3152  * thread's extended FPU state to the passed in data. Our assumptions at this
3153  * point from the system are:
3154  *
3155  *   o Someone has already verified that the actual xsave header is correct.
3156  *   o Any traditional XMM state that causes a #gp has been clamped.
3157  *   o That data is basically the correct sized xsave state structure. Right now
3158  *     that means it is not compressed and follows the CPUID-based rules for
3159  *     constructing and laying out data.
3160  *   o That the lwp argument refers to the current thread.
3161  *
3162  * Our primary purpose here is to merge the current FPU state with what exists
3163  * here. Right now, "merge", strictly speaking is just "replace". We can get
3164  * away with just replacing everything because all we currently save are user
3165  * states. If we start saving kernel states in here, this will get more nuanced
3166  * and we will need to be more careful about how we store data here.
3167  */
void
fpu_set_xsave(klwp_t *lwp, const void *data)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
	uint32_t status, xstatus;
	struct xsave_state *dst_xsave;

	VERIFY(fpu_xsave_enabled());
	VERIFY3P(curthread, ==, lwptot(lwp));
	VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
	ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);

	/*
	 * We use fp_save() here rather than a stock fpdisable() so we can
	 * attempt to honor our invariants that when the thread state has been
	 * saved, the valid flag is set, even though we're going to be
	 * overwriting it shortly. If we just called fpdisable() then we would
	 * basically be asking for trouble.
	 *
	 * Because we are modifying the state here and we don't want the system
	 * to end up in an odd state, we are being a little paranoid and
	 * disabling preemption across this operation. In particular, once the
	 * state is properly tagged with FPU_VALID, there should be no other way
	 * that this thread can return to userland and get cleared out because
	 * we're resetting its context; however, we let paranoia win out.
	 */
	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		fp_save(fpu);
	}

	/* Replace the entire saved xsave image with the caller's data. */
	bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
	    cpuid_get_xsave_size());
	dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
	/*
	 * Extract the software status words that ride in an ignored region of
	 * the fxsave area and then zero those bytes so they never reach the
	 * hardware image we restore.
	 */
	status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
	xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
	dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
	dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;

	/*
	 * These two status words are information that the kernel itself uses to
	 * track additional information and is part of the traditional fpregset,
	 * but is not part of our xregs information. Because we are setting this
	 * state, we leave it up to the rest of the kernel to determine whether
	 * this came from an fpregset_t or is being reset to the default of 0.
	 */
	fpu->fpu_regs.kfpu_status = status;
	fpu->fpu_regs.kfpu_xstatus = xstatus;

	/* Mark the save area valid and force a reload on return to userland. */
	fpu->fpu_flags |= FPU_VALID;
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	kpreempt_enable();
}
3221 
3222 /*
3223  * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
3224  * kernel, this is just an fxsave_state with additional values for the status
3225  * and xstatus members.
3226  *
3227  * This has the same nuance as the xregs cases discussed above, but is simpler
3228  * in that we only need to handle the fxsave state, but more complicated because
3229  * we need to check our save mechanism.
3230  */
void
fpu_get_fpregset(klwp_t *lwp, fpregset_t *fp)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;

	/* Keep the FPU state from changing out from under us while we copy. */
	kpreempt_disable();
	fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
	fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;

	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * If we're requesting the fpregs of a thread that isn't
		 * currently valid and isn't the one that we're executing, then
		 * we consider getting this information to be a best-effort and
		 * we will not stop the thread in question to serialize it,
		 * which means possibly getting stale data. This is the
		 * traditional semantics that the system has used to service
		 * this for /proc.
		 */
		if (curthread == lwptot(lwp)) {
			VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
			fp_save(fpu);
		}
	}

	/*
	 * If the FPU is not enabled and the state isn't valid (due to someone
	 * else setting it), just copy the initial state.
	 */
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
		bcopy(&sse_initial, fp, sizeof (sse_initial));
		kpreempt_enable();
		return;
	}

	/*
	 * Given that we have an enabled FPU, we must look at the type of FPU
	 * save mechanism to clean this up. In particular, while we can just
	 * copy the save area with FXSAVE, with XSAVE we must carefully copy
	 * only the bits that are valid and reset the rest to their default
	 * state.
	 */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
		    sizeof (struct fxsave_state));
		break;
	case FP_XSAVE:
		/* Expands the saved xsave components into fxsave layout. */
		fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
		    (struct fxsave_state *)fp);
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	kpreempt_enable();
}
3288 
3289 /*
3290  * This is a request to set the ABI fpregset_t into our actual hardware state.
3291  * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
3292  * 512-byte fxsave area.
3293  */
void
fpu_set_fpregset(klwp_t *lwp, const fpregset_t *fp)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;

	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * We always save the entire FPU. This is required if we're
		 * using xsave. If we're using fxsave, we could skip the
		 * 512-byte write and instead just disable the FPU since we'd be
		 * replacing it all. For now we don't bother with more
		 * conditional logic.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_save(fpu);
	}

	/* The kernel-tracked software status words come straight across. */
	fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
	fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		break;
	case FP_XSAVE:
		/*
		 * Only the leading fxsave-sized legacy region is replaced, so
		 * make sure the x87 and SSE components are flagged as present
		 * in the xsave header or the restore would ignore them.
		 */
		bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
		    sizeof (struct fxsave_state));
		fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
		    XFEATURE_LEGACY_FP | XFEATURE_SSE;
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	/* Mark the save area valid and force a reload on return to userland. */
	fpu->fpu_flags |= FPU_VALID;
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	kpreempt_enable();
}
3334