1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2021 Joyent, Inc.
24 * Copyright 2021 RackTop Systems, Inc.
25 * Copyright 2023 Oxide Computer Company
26 */
27
28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
30 /* All Rights Reserved */
31
32 /* Copyright (c) 1987, 1988 Microsoft Corporation */
33 /* All Rights Reserved */
34
35 /*
36 * Copyright (c) 2009, Intel Corporation.
37 * All rights reserved.
38 */
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/signal.h>
43 #include <sys/regset.h>
44 #include <sys/privregs.h>
45 #include <sys/psw.h>
46 #include <sys/trap.h>
47 #include <sys/fault.h>
48 #include <sys/systm.h>
49 #include <sys/user.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/pcb.h>
53 #include <sys/lwp.h>
54 #include <sys/cpuvar.h>
55 #include <sys/thread.h>
56 #include <sys/disp.h>
57 #include <sys/fp.h>
58 #include <sys/siginfo.h>
59 #include <sys/archsystm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <sys/x86_archext.h>
63 #include <sys/sysmacros.h>
64 #include <sys/cmn_err.h>
65 #include <sys/kfpu.h>
66 #include <sys/stdbool.h>
67 #include <sys/stdalign.h>
68 #include <sys/procfs_isa.h>
69 #include <sys/sunddi.h>
70
71 /*
72 * FPU Management Overview
73 * -----------------------
74 *
75 * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
76 * however, many aspects of its life as a coprocessor are still around in x86.
77 *
78 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
79 * While that state still exists, there is much more that is covered by the FPU.
80 * Today, this includes not just traditional FPU state, but also supervisor only
81 * state. The following state is currently managed and covered logically by the
82 * idea of the FPU registers and more generally is called the Extended Processor
83 * States:
84 *
85 * o Traditional x87 FPU
86 * o Vector Registers (%xmm, %ymm, %zmm)
87 * o Memory Protection Extensions (MPX) Bounds Registers
88 * o Protected Key Rights Registers (PKRU)
89 * o Processor Trace data
90 * o Control-Flow Enforcement state
91 * o Hardware Duty Cycle
92 * o Hardware P-states
93 *
94 * The rest of this covers how the FPU is managed and controlled, how state is
95 * saved and restored between threads, interactions with hypervisors, and other
96 * information exported to userland through aux vectors. A lot of background
97 * information is here to synthesize major parts of the Intel SDM, but
98 * unfortunately, it is not a replacement for reading it.
99 *
100 * FPU Control Registers
101 * ---------------------
102 *
103 * Because the x87 FPU began its life as a co-processor and the FPU was
104 * optional there are several bits that show up in %cr0 that we have to
105 * manipulate when dealing with the FPU. These are:
106 *
107 * o CR0.ET The 'extension type' bit. This was used originally to indicate
108 * that the FPU co-processor was present. Now it is forced on for
109 * compatibility. This is often used to verify whether or not the
110 * FPU is present.
111 *
112 * o CR0.NE The 'native error' bit. Used to indicate that native error
113 * mode should be enabled. This indicates that we should take traps
114 * on FPU errors. The OS enables this early in boot.
115 *
116 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
117 * wait/fwait instructions generate a #NM if CR0.TS is set.
118 *
119 * o CR0.EM The 'Emulation' bit. This is used to cause floating point
120 * operations (x87 through SSE4) to trap with a #UD so they can be
121 * emulated. The system never sets this bit, but makes sure it is
122 * clear on processor start up.
123 *
124 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
125 * point operation will generate a #NM. An fwait will as well,
126 * depending on the value in CR0.MP.
127 *
128 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
129 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
130 * complicated role. Historically it has been used to allow running systems to
131 * restore the FPU registers lazily. This will be discussed in greater depth
132 * later on.
133 *
134 * %cr4 is also used as part of the FPU control. Specifically we need to worry
135 * about the following bits in the system:
136 *
137 * o CR4.OSFXSR This bit is used to indicate that the OS understands and
138 * supports the execution of the fxsave and fxrstor
139 * instructions. This bit is required to be set to enable
140 * the use of the SSE->SSE4 instructions.
141 *
142 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
143 * and take a SIMD floating point exception (#XM). This bit
144 * is always enabled by the system.
145 *
146 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
147 * supports the execution of the xsave and xrstor family of
148 * instructions. This bit is required to use any of the AVX
149 * and newer feature sets.
150 *
151 * Because all supported processors are 64-bit, they'll always support the XMM
152 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
153 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
154 *
155 * %xcr0 is used to manage the behavior of the xsave feature set and is only
156 * present on the system if xsave is supported. %xcr0 is read and written
157 * through the xgetbv and xsetbv instructions. This register is present
158 * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
159 * different component of the xsave state and controls whether or not that
160 * information is saved and restored. For newer feature sets like AVX and MPX,
161 * it also controls whether or not the corresponding instructions can be
162 * executed (much like CR4.OSFXSR does for the SSE feature sets).
163 *
164 * Everything in %xcr0 is around features available to users. There is also the
165 * IA32_XSS MSR which is used to control supervisor-only features that are still
166 * part of the xsave state. Bits that can be set in %xcr0 are reserved in
167 * IA32_XSS and vice versa. This is an important property that is particularly
168 * relevant to how the xsave instructions operate.
169 *
170 * Save Mechanisms
171 * ---------------
172 *
173 * When switching between running threads the FPU state needs to be saved and
174 * restored by the OS. If this state was not saved, users would rightfully
175 * complain about corrupt state. There are three mechanisms that exist on the
176 * processor for saving and restoring these state images:
177 *
178 * o fsave
179 * o fxsave
180 * o xsave
181 *
182 * fsave saves and restores only the x87 FPU and is the oldest of these
183 * mechanisms. This mechanism is never used in the kernel today because we are
184 * always running on systems that support fxsave.
185 *
186 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
187 * state to be saved and restored to and from a struct fxsave_state. This is the
188 * default mechanism that is used to save and restore the FPU on amd64. An
189 * important aspect of fxsave that was different from the original i386 fsave
190 * mechanism is that the restoring of FPU state with pending exceptions will not
191 * generate an exception, it will be deferred to the next use of the FPU.
192 *
193 * The final and by far the most complex mechanism is that of the xsave set.
194 * xsave allows for saving and restoring all of the traditional x86 pieces (x87
195 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
196 * registers.
197 *
198 * Data is saved and restored into and out of a struct xsave_state. The first
199 * part of the struct xsave_state is equivalent to the struct fxsave_state.
200 * After that, there is a header which is used to describe the remaining
201 * portions of the state. The header is a 64-byte value of which the first two
202 * uint64_t values are defined and the rest are reserved and must be zero. The
203 * first uint64_t is the xstate_bv member. This describes which values in the
204 * xsave_state are actually valid and present. This is updated on a save and
205 * used on restore. The second member is the xcomp_bv member. Its last bit
206 * determines whether or not a compressed version of the structure is used.
207 *
208 * When the uncompressed structure is used (currently the only format we
209 * support), then each state component is at a fixed offset in the structure,
210 * even if it is not being used. For example, if you only saved the AVX related
211 * state, but did not save the MPX related state, the offset would not change
212 * for any component. With the compressed format, components that aren't used
213 * are all elided (though the x87 and SSE state are always there).
214 *
215 * Unlike fxsave which saves all state, the xsave family does not always save
216 * and restore all the state that could be covered by the xsave_state. The
217 * instructions all take an argument which is a mask of what to consider. This
218 * is the same mask that will be used in the xstate_bv vector and it is also the
219 * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only
220 * considered with the xsaves and xrstors instructions.
221 *
222 * When a save or restore is requested, a bitwise and is performed between the
223 * requested bits and those that have been enabled in %xcr0. Only the bits that
224 * match that are then saved or restored. Others will be silently ignored by
225 * the processor. This idea is used often in the OS. We will always request that
226 * we save and restore all of the state, but only those portions that are
227 * actually enabled in %xcr0 will be touched.
228 *
229 * If a feature has been asked to be restored that is not set in the xstate_bv
230 * feature vector of the save state, then it will be set to its initial state by
231 * the processor (usually zeros). Also, when asked to save state, the processor
232 * may not write out data that is in its initial state as an optimization. This
233 * optimization only applies to saving data and not to restoring data.
234 *
235 * There are a few different variants of the xsave and xrstor instruction. They
236 * are:
237 *
238 * o xsave This is the original save instruction. It will save all of the
239 * requested data in the xsave state structure. It only saves data
240 * in the uncompressed (xcomp_bv[63] is zero) format. It may be
241 * executed at all privilege levels.
242 *
243 * o xrstor This is the original restore instruction. It will restore all of
244 * the requested data. The xrstor function can handle both the
245 * compressed and uncompressed formats. It may be executed at all
246 * privilege levels.
247 *
248 * o xsaveopt This is a variant of the xsave instruction that employs
249 * optimizations to try and only write out state that has been
250 * modified since the last time an xrstor instruction was called.
251 * The processor tracks a tuple of information about the last
252 * xrstor and tries to ensure that the same buffer is being used
253 * when this optimization is being used. However, because of the
254 * way that it tracks the xrstor buffer based on the address of it,
255 * it is not suitable for use if that buffer can be easily reused.
256 * The most common case is trying to save data to the stack in
257 * rtld. It may be executed at all privilege levels.
258 *
259 * o xsavec This is a variant of the xsave instruction that writes out the
260 * compressed form of the xsave_state. Otherwise it behaves as
261 * xsave. It may be executed at all privilege levels.
262 *
263 * o xsaves This is a variant of the xsave instruction. It is similar to
264 * xsavec in that it always writes the compressed form of the
265 * buffer. Unlike all the other forms, this instruction looks at
266 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
267 * what to save and restore. xsaves also implements the same
268 * optimization that xsaveopt does around modified pieces. User
269 * land may not execute the instruction.
270 *
271 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
272 * it can save and restore both the user and privileged states.
273 * Unlike xrstor it can only operate on the compressed form.
274 * User land may not execute the instruction.
275 *
276 * Based on all of these, the kernel has a precedence for what it will use.
277 * Basically, xsaves (not supported) is preferred to xsaveopt, which is
278 * preferred to xsave. A similar scheme is used when informing rtld (more later)
279 * about what it should use. xsavec is preferred to xsave. xsaveopt is not
280 * recommended due to the modified optimization not being appropriate for this
281 * use.
282 *
283 * Finally, there is one last gotcha with the xsave state. Importantly some AMD
284 * processors did not always save and restore some of the FPU exception state in
285 * some cases like Intel did. In those cases the OS will make up for this fact
286 * itself.
287 *
288 * FPU Initialization
289 * ------------------
290 *
291 * One difference with the FPU registers is that not all threads have FPU state,
292 * only those that have an lwp. Generally this means kernel threads, which all
293 * share p0 and its lwp, do not have FPU state. Though there are definitely
294 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
295 * and lwp interchangeably, just think of thread meaning a thread that has a
296 * lwp.
297 *
298 * Each lwp has its FPU state allocated in its pcb (process control block). The
299 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
300 * dynamically at start up based on the save mechanism that we're using and the
301 * amount of memory required for it. This is dynamic because the xsave_state
302 * size varies based on the supported feature set.
303 *
304 * The hardware side of the FPU is initialized early in boot before we mount the
305 * root file system. This is effectively done in fpu_probe(). This is where we
306 * make the final decision about what the save and restore mechanisms we should
307 * use are, create the fpsave_cachep kmem cache, and initialize a number of
308 * function pointers that use save and restoring logic.
309 *
310 * The thread/lwp side is a little more involved. There are two different
311 * things that we need to concern ourselves with. The first is how the FPU
312 * resources are allocated and the second is how the FPU state is initialized
313 * for a given lwp.
314 *
315 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
316 * This is always called unconditionally by the system as part of creating an
317 * LWP.
318 *
319 * There are three different initialization paths that we deal with. The first
320 * is when we are executing a new process. As part of exec all of the register
321 * state is reset. The exec case is particularly important because init is born
322 * like Athena, sprouting from the head of the kernel, without any true parent
323 * to fork from. The second is used whenever we fork or create a new lwp. The
324 * third is to deal with special lwps like the agent lwp.
325 *
326 * During exec, we will call fp_exec() which will initialize and set up the FPU
327 * state for the process. That will fill in the initial state for the FPU and
328 * also set that state in the FPU itself. As part of fp_exec() we also install a
329 * thread context operations vector that takes care of dealing with the saving
330 * and restoring of the FPU. These context handlers will also be called whenever
331 * an lwp is created or forked. In those cases, to initialize the FPU we will
332 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
333 * operations vector for the new thread.
334 *
335 * Next we'll end up in the context operation fp_new_lwp(). This saves the
336 * current thread's state, initializes the new thread's state, and copies over
337 * the relevant parts of the originating thread's state. It's at this point that
338 * we also install the FPU context operations into the new thread, which ensures
339 * that all future threads that are descendants of the current one get the
340 * thread context operations (unless they call exec).
341 *
342 * To deal with some things like the agent lwp, we double check the state of the
343 * FPU in sys_rtt_common() to make sure that it has been enabled before
344 * returning to userland. In general, this path should be rare, but it's useful
345 * for the odd lwp here and there.
346 *
347 * The FPU state will remain valid most of the time. There are times that
348 * the state will be rewritten. For example in restorecontext, due to /proc, or
349 * the lwp calls exec(). Whether the context is being freed or we are resetting
350 * the state, we will call fp_free() to disable the FPU and our context.
351 *
352 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
353 * state by calling fp_lwp_cleanup().
354 *
355 * Kernel FPU Multiplexing
356 * -----------------------
357 *
358 * Just as the kernel has to maintain all of the general purpose registers when
359 * switching between scheduled threads, the same is true of the FPU registers.
360 *
361 * When a thread has FPU state, it also has a set of context operations
362 * installed. These context operations take care of making sure that the FPU is
363 * properly saved and restored during a context switch (fpsave_ctxt and
364 * fprestore_ctxt respectively). This means that the current implementation of
365 * the FPU is 'eager', when a thread is running the CPU will have its FPU state
366 * loaded. While this is always true when executing in userland, there are a few
367 * cases where this is not true in the kernel.
368 *
369 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
370 * employed. This meant that the FPU would be saved on a context switch and the
371 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
372 * then take a #NM trap, at which point we would restore the FPU from the save
373 * area and return to userland. Given the frequency of use of the FPU alone by
374 * libc, there's no point returning to userland just to trap again.
375 *
376 * There are a few cases though where the FPU state may need to be changed for a
377 * thread on its behalf. The most notable cases are in the case of processes
378 * using /proc, restorecontext, forking, etc. In all of these cases the kernel
379 * will force a thread's FPU state to be saved into the PCB through the fp_save()
380 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
381 * pcb. This indicates that the save state holds currently valid data. As a side
382 * effect of this, CR0.TS will be set. To make sure that all of the state is
383 * updated before returning to userland, in these cases, we set a flag on the
384 * PCB that says the FPU needs to be updated. This will make sure that we take
385 * the slow path out of a system call to fix things up for the thread. Due to
386 * the fact that this is a rather rare case, effectively setting the equivalent
387 * of t_postsys is acceptable.
388 *
389 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
390 * Generally this means it will be cleared immediately by the new thread that is
391 * running in a context switch. However, this isn't the case for kernel threads.
392 * They currently operate with CR0.TS set as no kernel state is restored for
393 * them. This means that using the FPU will cause a #NM and panic.
394 *
395 * The FPU_VALID flag on the currently executing thread's pcb is meant to track
396 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
397 * However, because we eagerly restore, the only time that CR0.TS should be set
398 * for a non-kernel thread is during operations where it will be cleared before
399 * returning to userland and importantly, the only data that is in it is its
400 * own.
401 *
402 * Kernel FPU Usage
403 * ----------------
404 *
405 * Traditionally the kernel never used the FPU since it had no need for
406 * floating point operations. However, modern FPU hardware supports a variety
407 * of SIMD extensions which can speed up code such as parity calculations or
408 * encryption.
409 *
410 * To allow the kernel to take advantage of these features, the
411 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
412 * around any usage of the FPU by the kernel to ensure that user-level context
413 * is properly saved/restored, as well as to properly setup the FPU for use by
414 * the kernel. There are a variety of ways this wrapping can be used, as
415 * discussed in this section below.
416 *
417 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
418 * operations, the kernel_fpu_alloc() function should be used to allocate a
419 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
420 * state. This structure is not tied to any thread. That is, different threads
421 * can reuse the same kfpu_state_t structure, although not concurrently. A
422 * kfpu_state_t structure is freed by the kernel_fpu_free() function.
423 *
424 * In some cases, the kernel may need to use the FPU for a short operation
425 * without the overhead to manage a kfpu_state_t structure and without
426 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
427 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
428 * parameter. This indicates that there is no kfpu_state_t. When used this way,
429 * kernel preemption should be disabled by the caller (kpreempt_disable) before
430 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
431 * For this usage, it is important to limit the kernel's FPU use to short
432 * operations. The tradeoff between using the FPU without a kfpu_state_t
433 * structure vs. the overhead of allowing a context switch while using the FPU
434 * should be carefully considered on a case by case basis.
435 *
436 * In other cases, kernel threads have an LWP, but never execute in user space.
437 * In this situation, the LWP's pcb_fpu area can be used to save/restore the
438 * kernel's FPU state if the thread is context switched, instead of having to
439 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
440 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
441 * enable this behavior. It is the caller's responsibility to ensure that this
442 * is only used for a kernel thread which never executes in user space.
443 *
444 * FPU Exceptions
445 * --------------
446 *
447 * Certain operations can cause the kernel to take traps due to FPU activity.
448 * Generally these events will cause a user process to receive a SIGFPE and if
449 * the kernel receives it in kernel context, we will die. Traditionally the #NM
450 * (Device Not Available / No Math) exception generated by CR0.TS would have
451 * caused us to restore the FPU. Now it is a fatal event regardless of whether
452 * or not userland causes it.
453 *
454 * While there are some cases where the kernel uses the FPU, it is up to the
455 * kernel to use the FPU in a way such that it cannot receive a trap or to use
456 * the appropriate trap protection mechanisms.
457 *
458 * Hypervisors
459 * -----------
460 *
461 * When providing support for hypervisors things are a little bit more
462 * complicated because the FPU is not virtualized at all. This means that they
463 * need to save and restore the FPU and %xcr0 across entry and exit to the
464 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
465 * allow us to use the full native state to make sure that we are always saving
466 * and restoring the full FPU that the host sees, even when the guest is using a
467 * subset.
468 *
469 * One tricky aspect of this is that the guest may be using a subset of %xcr0
470 * and therefore changing our %xcr0 on the fly. It is vital that when we're
471 * saving and restoring the FPU that we always use the largest %xcr0 contents
472 * otherwise we will end up leaving behind data in it.
473 *
474 * ELF PLT Support
475 * ---------------
476 *
477 * rtld has to preserve a subset of the FPU when it is saving and restoring
478 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
479 * more information. As a result, we set up an aux vector that contains
480 * information about what save and restore mechanisms it should be using and
481 * the sizing thereof based on what the kernel supports. This is passed down in
482 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
483 * initialized in fpu_subr.c.
484 *
485 * Signal Handling and the ucontext_t
486 * ----------------------------------
487 *
488 * One of the many gifts that signals give us is the twofold fact that when a
489 * signal occurs, the signal handler is allowed to change the CPU's state
490 * arbitrarily and when the signal handler is done executing, we must restore it
491 * back to the original state. However, the second part of this is that the
492 * signal handler is actually allowed to modify the state that the thread will
493 * return to! To create this facade, the kernel will create a full ucontext_t
494 * state, effectively calling getcontext(2) on the thread's behalf, and a
495 * pointer to that is given to the signal handler (the void * argument for the
496 * sa_sigaction function pointer in sigaction(2)). When libc is done with a
497 * signal, it will call setcontext(2) with that same ucontext_t.
498 *
499 * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and
500 * it's often declared on the stack itself, with the signal handler spilling all
501 * this state to the stack. The ucontext_t machine portion was broken into the
502 * general purpose and floating point registers. In 64-bit code, the floating
503 * point registers were mostly the same as the results of the fxsave instruction
504 * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent
505 * starting point for information, it is transformed into a different shape to
506 * deal with the history of the 32-bit SYS V ABI.
507 *
508 * While this worked, if you're reading this, you're aware that the x86 FPU and
509 * extended register states didn't stop at the initial 16 128-bit %xmm
510 * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k
511 * opmask registers. None of these fit inside the standard ucontext_t; however,
512 * they must all be preserved and restored across a signal. While the various
513 * x86 platform-specific ABIs all suggest that these registers are not preserved
514 * across a function call, receiving a signal is not a function call and must be
515 * thought of like a process receiving an interrupt. In other words, this
516 * extended state must be preserved.
517 *
518 * To facilitate this, we have extended the ucontext_t structure with an
519 * additional flag, UC_XSAVE, which indicates that the traditional padding
520 * member, uc_xsave, actually is a pointer to the extended state. While this is
521 * accessible outside of a signal handling context through the combination of
522 * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this
523 * state is focused on signal handling. Signal handling spills all this state to
524 * the stack and if we cannot spill the entire state to the stack then our
525 * inability to deliver the signal results in the process being killed! While
526 * there are separate efforts to ensure that the signal stack sizing that is
527 * used for the minimum and maximum signal sizes are sufficient, we still need
528 * to do our part to minimize the likelihood here.
529 *
530 * In designing this, we make the following observations which have helped us
531 * focus our design:
532 *
533 * o While the start of an xsave area is the traditional 512-byte fxsave XMM
534 * region, we already have that in the fpregs. Thus there is no reason to
535 * duplicate it. This not only saves 512 bytes of additional stack space,
536 * but it also means we don't have to ask which of the version of it to take
537 * if they were to differ.
538 *
539 * o Many applications out there aren't necessarily using the extended vectors
540 * and even when we do make libc and others take advantage of it, it will
541 * behoove us to ensure that they are put back into their initial state
542 * after use. This leads us to expect that in a number of cases, the actual
543 * extended register state will be in its initial state.
544 *
545 * o While the signal handler does allow contents to be modified, we are
546 * starting with making the interface private and thus allowing us to excise
547 * components that are in their initial state.
548 *
549 * o There are similarities to what we want to create with the compressed
550 * xsave format; however, because we don't always have support for the
551 * compressed format, we can't just arbitrarily say let's do a compressed
552 * save to the user stack.
553 *
554 * o Because we are not handing this state directly to and from hardware, we
555 * don't need to meet some of the constraints of the compressed xsave format
556 * around wanting alignment for the initial save or additional components.
557 *
558 * All of the above lead us to our own unique format for this data. When the
559 * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a
560 * uc_xsave_t structure which has a magic version number, a 32-bit length of the
561 * overall structure, and the 64-bit state bit-vector to represent which
562 * components are valid. Following this 8-byte header, each component that is
563 * present in the bit vector is immediately written out in roughly ascending bit
564 * order (the order is determined based on the order of the fpu_xsave_info
565 * array).
566 *
567 * This makes the rough logic that we have here when taking a signal and writing
568 * out this state as:
569 *
570 * 1. Ensure that the FPU is saved and that the contents of the pcb save area
571 * are valid. That is, call fp_save() if the state is not already flagged
572 * with FPU_VALID.
573 *
574 * 2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP
575 * and XFEATURE_SSE bits as these will be placed in the xsave area.
576 *
577 * 3. Initialize the uc_xsave_t by setting our version field, initializing the
578 * length to the length of the current structure, and then setting the
579 * modified bit vector above.
580 *
581 * 4. Walk each remaining bit of the bit-vector. For each set bit, copy out
582 * its extended state starting at the current length in the header and then
583 * increase the header size by that length.
584 *
585 * 5. Finally write out the final uc_xsave_t structure.
586 *
587 * The above process is also used when someone manually calls getcontext_extd(2)
588 * to get this state. The main difference between the two is which copyout
589 * function we use. This deserves some explanation. Our main starting point for
590 * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows
591 * the signal handling context to operate with a different copyout than we
592 * normally use in say getcontext_extd(2).
593 *
594 * When we've received a signal, we're at the intersection of several different
595 * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is,
596 * the watchpoints effectively set a copyout override function (t_copyops) that
597 * we end up vectoring to rather than a normal copyout. This allows the data to
598 * be modified and for the watchpoint to fire. While this is all well and good
599 * normally, it is problematic if we are trying to handle a signal. The signal
600 * delivery logic, sendsig(), goes through and disables the watchpoint for the
601 * region of the stack that we are copying out to. However, disabling
602 * watchpoints is not sufficient, we also need to use the copyout_noerr
603 * variants.
604 *
605 * These variants also require the use of on_fault() and no_fault() for error
606 * handling. While it is tempting to try and on_fault() the entire
607 * fpu_signal_copyout() operation, that is actually fraught for a few reasons.
608 * The first is that we don't want to disable faults during the entire operation
609 * as if the kernel messes up we will treat that as a user error. That isn't
610 * theoretical and happened during development. The second and perhaps more
611 * important issue is that correctly bounding the on_fault() / no_fault() means
612 * being careful about state. For example, kernel pre-emption is often disabled
613 * during parts of these operations, but it needs to be re-enabled when we're
614 * done. This would require tracking in some volatile variable that this had
615 * been enabled and disabled and tracking that.
616 *
617 * Instead, this is why fpu_signal_copyout() takes a copy out function as an
618 * argument. When we're in signal handling context, the function will use
619 * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms.
620 *
621 * RESTORING STATE
622 *
623 * Copying out our current state is the easier half of this problem. When the
624 * kernel is done with a signal it calls setcontext(2) with the ucontext_t we
625 * assembled for it as described above. setcontext(2) isn't just used for
626 * returning from signals.
627 *
628 * The process for this goes in two steps. The first step is to copy in,
629 * validate, and transform the ucontext_t UC_XSAVE that we created above into an
630 * equivalent xsave format that we can use the appropriate xrstor function on.
631 * This first phase is implemented in fpu_signal_copyin(). Once that is done, we
632 * come back through a second phase that is driven out of restorecontext() and
633 * is implemented in fpu_set_xsave().
634 *
635 * Let's start by discussing the second part of this, which is more
636 * straightforward. In particular, the second phase assumes that all of the
637 * validation and error handling has been done by the first phase. This means
638 * here, we have a buffer that is already the appropriate size
639 * (cpuid_get_xsave_size()) and all we need to do is make sure that we can
640 * replace the actual save state with the current one.
641 *
642 * The only piece of shenanigans we have to do is around the kernel provided
643 * notion of 'status' and 'xstatus', which are cached versions of the x87 and
644 * SSE exception vectors. These are part of the fpregset ABI and therefore we
645 * need to propagate them from the temporary storage that part 1 sets up in the
646 * ignored region of the fxsave data. We use that because it is not persisted by
647 * the CPU, so clobbering it is generally alright.
648 *
649 * Once that is done, we simply note that we need a PCB update to occur to
650 * refresh the FPU state before we return to userland. Given that someone has
651 * called setcontext(2), this was always going to happen because we have to
652 * update segment registers and related, so this isn't so bad. With that, let's
653 * move onto the more nuanced part (1).
654 *
655 * When we're handling a setcontext(2) we have, in userland, a data structure
656 * that should match one we serialized out, though we cannot assume that a user
657 * has not modified it either accidentally or maliciously. Our goal is to set up
658 * the appropriate xsave state that can be passed to the CPU's xrstor. The first
659 * problem we have to deal with is where do we actually put this state?
660 *
661 * While not many programs actually call setcontext(2) of their own volition,
662 * this is going to get hit every time we take a signal. The first thought was
663 * to re-use the existing thread's save area; however, that's a bit challenging
664 * for a few reasons. In particular, we would need to ensure that we don't go
665 * off-CPU for any reason, which we cannot assume with a copyin from a user
666 * address space. In particular, it is trivial for us to hit a case where the
667 * stack has been paged out for some reason, which eschews that path.
668 *
669 * Instead, whenever a thread first calls setcontext(2), generally from signal
670 * context, we will at that time allocate another entry from the 'fpsave_cachep'
671 * kmem cache, giving us a buffer of the appropriate space to handle this. Once
672 * this buffer has been allocated, we leave it assigned to the thread's pcb and
673 * only tear it down when the thread itself finally exits. We reason that a
674 * thread that takes a signal once is either going to have the process exit
675 * shortly thereafter or is much more likely to take a signal again in the
676 * future. Many daemons and other processes set things up so signals are
677 * dispatched via one location, masking signals in other threads, using
678 * sigsuspend(2), signalfd(3C), or something similar.
679 *
680 * With this buffer in hand, we begin our task of reassembling state. Note, all
681 * of this is conditional on UC_XSAVE being set in the uc_flags member of the
682 * ucontext_t. If it is not set, then we assume that there is no extended state
683 * and will use the traditional path of setting the fpregset_t into the system
684 * via setfpregs().
685 *
686 * We first will copyin and validate the uc_xsave_t. In particular, we need to
687 * make sure the version makes sense, that the xsave component bit-vector
688 * doesn't have anything unexpected and more importantly unsupported in it, and
689 * that the addresses we've been given are within the user address space. At
690 * this point we can walk through our table of implemented bits and process
691 * them.
692 *
693 * For most components in here, the processing is straightforward. We continue
694 * walking our cursor and copy data into the kernel and place it in the
695 * appropriate place in our xsave state. If a xsave state component bit-vector
696 * isn't set, then we must ensure that we have the item in the initial state,
697 * which for everything other than the x87/SSE state is the memory being zeroed.
698 *
699 * The most unique case in the copyin state is that of the x87/SSE state. You
700 * might recall that we didn't copy it out explicitly as part of the uc_xsave_t,
701 * but instead have opted to use the single definition in the fpregset_t. Thus
702 * here, we copy it out of the fpregset_t, which the kernel has helpfully
703 * already unified into the 64-bit fxsave version prior to calling us, and
704 * install that into the save area we're building up.
705 *
706 * As part of this, there are two important pieces to be aware of. The first is
707 * that because the fpregset_t has both the status and xstatus members
708 * mentioned earlier, we temporarily copy them to the software-usable ignored
709 * areas of the fxsave state so we can corral this extra state into part (2)
710 * without needing to allocate additional space. The second piece is that when
711 * we're done processing this we explicitly remove the UC_FPU flag that would
712 * tell the kernel to proceed with updating that region. The problem is that
713 * that goes directly into the pcb's save area and not to the intermediate
714 * buffer as it uses the same entry point as /proc, mainly setfpregs().
715 *
716 * We don't do much validation of the actual contents of the registers that are
717 * being set with the exception of ensuring that no reserved bits of the mxcsr
718 * are used. This is not as strict as /proc, but failure here means the process
719 * is likely going to die (returning from setcontext() in a signal handler is
720 * fatal).
721 *
722 * /proc xregs
723 * -----------
724 *
725 * Observability of the state of the extended registers is important for
726 * understanding the system. While on the surface this is similar to signal
727 * handling, it is crucially different in a number of ways:
728 *
729 * o In signal handling, we're trying to conserve every byte of stack that we
730 * can.
731 * o The /proc xregs file will end up in core files, which means that we need
732 * a way of knowing what components are present and not present in it,
733 * because this will vary from CPU to CPU due to the addition of
734 * architectural features. For example, some CPUs support AVX-512, but
735 * others do not.
736 *
737 * o The signal handling structure (uc_xsave_t) is private and we're not
738 * trying to have software modify it, on the other hand, the /proc
739 * interfaces that we support we do want software to be able to interrogate
740 * and manipulate. These need to be something that we can introduce
741 * additional components into and make other changes that still allow it to
742 * work.
743 *
744 * The x86 xregs format is documented in proc(5). The short form is that the
745 * prxregset_hdr_t has a number of information entries, which are of the type
746 * prxregset_info_t. Each of the information headers has a type, size, and
747 * offset which indicate where to find the additional data.
748 *
749 * Each entry is described as one of the entries in the fpu_xsave_info[]. These
750 * items either are a 1:1 correspondence with a xsave related feature (e.g.
751 * there is one entry for each of the three AVX-512 components) or it is
752 * something synthetic that we provide as additional information such as the
753 * PRX_INFO_XCR, which is a way of getting information about the system such as
754 * what is enabled in %xcr0 out there.
755 *
756 * Unlike signal handling, we are given the buffer to place everything that
757 * needs to be written out. This is partially the design of the /proc APIs. That
758 * is, we will always assemble everything into the entire buffer that /proc asks
759 * us to, and then it will use as much or as little of it as is required.
760 * Similarly, when setting things, we don't have to worry about copying in
761 * information in the same way as signal handling does, because /proc takes care
762 * of it and always hands us a full buffer. Sizing that is a little nuanced, but
763 * is all handled in prmachdep.c.
764 *
765 * When someone performs a read of the xregs and thus is asking us for the
766 * current state, there is a little bit of nuance that we need to deal with.
767 * The first, is whether or not the FPU is enabled and the second is if the FPU
768 * is enabled, whether a given component is noted as being in its initial state.
769 * This basically gives us three possible states for a given component:
770 *
771 * 1. FPU_EN is not set and FPU_VALID is not set. This means we need to take
772 * the illumos FPU default for an item. More on that in a moment.
773 * 2. The saved xsave state indicates that the bit for a given component is
774 * zero -- specifically the xsh_xstate_bv member of the struct xsave_state.
775 * In this case, we must take the CPU's default for an item. This is
776 * usually the same as illumos, but not always.
777 * 3. The saved xsave state indicates that a given component's state bit is
778 * valid. The simplest of our cases. We can just take what we have from the
779 * xsave state.
780 *
781 * The CPU's default state for most components other than the x87/SSE state is
782 * to have it be zeroed. This is what we treat as our default state as well. The
783 * primary difference is in the initialization of the x87/SSE state. The SYS V
784 * ABI requires that we enable a different floating point control word than the
785 * hardware default. This means that when we're dealing with case (1) for
786 * x87/SSE we have to be more careful than the other components. Thankfully for
787 * everything else this is just keeping it zeroed.
788 *
789 * A reasonable question would be why not just skip components that aren't
790 * marked as present. There are a few reasons we take a different approach and
791 * always include them. Both of these are to make lives simpler for consumers.
792 * In the first case, when someone is performing a read and wants to reassemble
793 * and answer the question of 'what is the value of %ymm0 or %zmm15', they have
794 * to combine multiple disparate parts. If one knows that the data we put into
795 * there is always valid and represents what is in hardware and doesn't have to
796 * keep track of what are the defaults in different circumstances, then that
797 * greatly simplifies consumers lives. It also helps us for core files and other
798 * observability cases because the answer to what is the operating system's
799 * default may change over time.
800 *
801 * Similarly, including all the possible structures means that we have
802 * simplified writes. Writes are always setting the full state of a thread,
803 * meaning that if someone wants to modify only a single register they must do a
804 * read, modify, and write. By including everything that they might need, it
805 * makes it easier for consumers to do this and not have to cons up the whole
806 * structure on their own.
807 *
808 * When we're setting state, things change around a little bit. We have a few
809 * constraints that are laid out in proc(5). In particular, we require that the
810 * PRX_INFO_XSAVE component always be present to tell us which other components
811 * we expect to be here and which ones we don't. We also are much stricter about
812 * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only
813 * and may not be modified by a calling process. In addition, when we have
814 * 32-bit applications which have reserved registers in the %ymm, %zmm, etc.
815 * components, if they are being written to and have modifications, then we will
816 * indicate an error there.
817 *
818 * Because we are given the entire buffer from userland and don't need to have
819 * an intermediate place to copy it in, we will validate the entire thing in
820 * advance. Once it has been validated and we consider it legal, then we will
821 * translate each entry into its corresponding entry in pcb's normal floating
822 * point state. This is different from signal handling mostly because of the
823 * fact that we are not using copyin, and once we get to this point, there is
824 * no more validation, so we don't have the same concerns around blocking while
825 * pre-emption is disabled.
826 *
827 * The Wrinkle with fpregs
828 * -----------------------
829 *
830 * When we instead turn our attention to the fpregs, whether we're gathering
831 * them as part of the ucontext_t or as part of /proc, there are a few
832 * complications that we need to be aware of when we're operating on a kernel
833 * that is using xsave as the save mechanism. When we're using fxsave as the
834 * save mechanism, the CPU will always save the entire 512-byte fxsave region.
835 * The fpregs ABI that the kernel expects is basically this structure itself,
836 * which is transformed into a 32-bit compatible form in archdep.c.
837 *
838 * But xsave makes this much more complex and has historically been a source of
839 * bugs in the system. In particular, unlike fxsave, xsave has its component bit
840 * vector that is written out to indicate validity. This means that blindly
841 * copying the fxsave area without checking those bits will lead us to do the
842 * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers,
843 * while the x87 legacy fp flag covers the rest of the state. This is all good,
844 * aside from the MXCSR.
845 *
846 * One of the more complicated pieces of xsave state management is correctly
847 * answering the question of when the MXCSR is written out to xsave_state. In
848 * practice, this is rather convoluted and varies. If either the XMM or AVX
849 * feature bits are set then the CPU will write out the MXCSR and its mask
850 * register into the traditional fxsave state region. This behavior is dependent
851 * on the type of save function that we use. xsave and xsaveopt will look at the
852 * AVX feature bit; however, xsavec does not and only considers the SSE feature
853 * bit. This means that when we're retrieving things, we need to check both of
854 * those bits to determine if we should use the initial state or the value
855 * written out.
856 *
857 * When we come to someone trying to set the fpregs through /proc, the main
858 * question we have is what happens to the extended registers. We have opted to
859 * implement and document it such that a write to the fpregs only impacts the
860 * fpregs. Put differently, we will save the FPU state with fp_save() ahead of
861 * copying the data into the save area, set the state bits for x87 and XMM
862 * state, and then set the FPU to be restored. All in all, this basically means
863 * that writing to fpregs does not touch any of the %ymm, %zmm, or other state
864 * that we might have present.
865 *
866 * Forward Looking: Adding Intel AMX Support
867 * -----------------------------------------
868 *
869 * Nothing can stop the march of features being added into the FPU. One of the
870 * larger chunks that we will need to wrangle with is Intel's Advanced Matrix
871 * Extensions (AMX), which add a large chunk of xsave state to each process.
872 * While things like AVX and AVX-512 have been enabled by default, the broader
873 * OS community has not been wanting to do this for AMX, because of the size of
874 * the state which exceeds 8 KiB. While the signal handling state went out of
875 * its way to minimize the size it wrote to the stack, if this is used, it would
876 * need to be preserved.
877 *
878 * To deal with this reality and the fact that folks don't really want to
879 * enable it by default for all purposes when its use will be quite special
880 * purpose, Intel has also added a MSR around extended feature disable or xfd.
881 * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting
882 * assumption, and the reason that so much of the /proc and signal logic ensures
883 * that we have the thread and process around, taking as an example the unused
884 * process argument in fpu_proc_xregs_info(), is that we will follow suit and
885 * default to having support disabled, but that a process will be able to opt
886 * into it, which will result in several different assumptions around signal
887 * stack sizing and cause us to reallocate and extend the pcb's FPU save state.
888 *
889 * The following is a list of items to pay attention to for future folks who
890 * work on this:
891 *
892 * o We will want to confirm whether other systems have opted to make this
893 * process-wide or thread-wide. Assuming process-wide, we will need to do a
894 * hold of all lwps while making a change. The interface for that probably
895 * doesn't want to be /proc, as a process probably doesn't want to write to
896 * its own control file. Changing it for another process could be done
897 * through the agent-lwp.
898 * o Opting into this should probably be a one-way street.
899 * o Opting into this will need to evaluate all threads and in particular
900 * stack sizes to confirm they adhere to the new minimum.
901 * o We will need to make sure that setting and clearing the xfd MSR is part
902 * of the FPU context ops and something we set by default on every CPU.
903 * o We will need to add a new interface to allow opting into this feature.
904 * o We will need to ensure that all subsequently created signal stacks adhere
905 * to a required minimum size that we communicate through libc.
906 * o We will need to make sure that both rtld and libc no longer rely on a
907 * static value of the AT_SUN_FPSIZE, but rather realize that this can be
908 * dynamic. At that time, we should evaluate if we can get away with not
909 * needing to save this for rtld, even though signal handlers should assume
910 * they will.
911 * o The various components (because there is more than one) will want to be
912 * added to the fpu_xsave_info[]. Consulting the process's xfd will be
913 * required and probably require logic changes.
914 *
915 * The above is not exhaustive. We'll probably have some other issues and fun
916 * while doing this.
917 */
918
/*
 * The kind of FPU we advertise to rtld so it knows what to do when working
 * through the PLT.
 */
int fp_elf = AT_386_FPINFO_FXSAVE;

/*
 * Mechanism to save FPU state: FP_FXSAVE or FP_XSAVE. Presumably switched to
 * FP_XSAVE during CPU probing on xsave-capable hardware -- TODO confirm
 * against fpu_probe().
 */
int fp_save_mech = FP_FXSAVE;

/*
 * kmem cache from which per-LWP FPU save areas are allocated; see
 * fp_lwp_init() / fp_lwp_cleanup() below for the allocation lifecycle.
 */
kmem_cache_t *fpsave_cachep;

/* Legacy fxsave layout (512) + xsave header (64) + ymm component (256) */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)
934
/*
 * Various sanity checks.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);	/* architectural fxsave size */
CTASSERT(sizeof (struct fnsave_state) == 108);	/* legacy x87 fnsave size */
/* The %xmm registers within the fxsave image must be 16-byte aligned. */
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
/* xsave_state must at least cover the legacy + header + ymm layout. */
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);

/*
 * Basic architectural alignment information.
 */
#define	FPU_ALIGN_XMM	16
#define	FPU_ALIGN_YMM	32
#define	FPU_ALIGN_ZMM	64
949
/*
 * This structure is the x86 implementation of the kernel FPU that is defined in
 * uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
	/*
	 * This indicates that the save state has initial FPU data.
	 */
	KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
	fpu_ctx_t	kfpu_ctx;	/* FPU context (save area, flags) */
	kfpu_flags_t	kfpu_flags;	/* KFPU_F_* flags */
	kthread_t	*kfpu_curthread; /* presumably the thread currently */
					 /* bound to this kernel FPU state -- */
					 /* confirm against kernel_fpu_begin() */
};
967
/*
 * Initial kfpu state for SSE/SSE2 used by fpinit()
 */
const struct fxsave_state sse_initial = {
	FPU_CW_INIT,	/* fx_fcw: SYS V ABI default x87 control word */
	0,		/* fx_fsw */
	0,		/* fx_fctw */
	0,		/* fx_fop */
	0,		/* fx_rip */
	0,		/* fx_rdp */
	SSE_MXCSR_INIT	/* fx_mxcsr */
	/* rest of structure is zero */
};

/*
 * Initial kfpu state for AVX used by fpinit()
 */
const struct xsave_state avx_initial = {
	/*
	 * The definition below needs to be identical with sse_initial
	 * defined above.
	 */
	.xs_fxsave = {
		.fx_fcw = FPU_CW_INIT,
		.fx_mxcsr = SSE_MXCSR_INIT,
	},
	.xs_header = {
		/*
		 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
		 * valid, and CPU should initialize XMM/YMM.
		 */
		.xsh_xstate_bv = 1,
		.xsh_xcomp_bv = 0,	/* standard (uncompressed) format */
	},
};
1003
/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 * have an XSAVE-capable chip in fpu_probe.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

/* Forward declarations for static helpers defined later in this file. */
static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);
static void fp_new_lwp(void *, void *);
static void fp_free_ctx(void *, int);
1027
1028 static struct ctxop *
fp_ctxop_allocate(struct fpu_ctx * fp)1029 fp_ctxop_allocate(struct fpu_ctx *fp)
1030 {
1031 const struct ctxop_template tpl = {
1032 .ct_rev = CTXOP_TPL_REV,
1033 .ct_save = fpsave_ctxt,
1034 .ct_restore = fprestore_ctxt,
1035 .ct_fork = fp_new_lwp,
1036 .ct_lwp_create = fp_new_lwp,
1037 .ct_free = fp_free_ctx,
1038 };
1039 return (ctxop_allocate(&tpl, fp));
1040 }
1041
1042 /*
1043 * Copy the state of parent lwp's floating point context into the new lwp.
1044 * Invoked for both fork() and lwp_create().
1045 *
1046 * Note that we inherit -only- the control state (e.g. exception masks,
1047 * rounding, precision control, etc.); the FPU registers are otherwise
1048 * reset to their initial state.
1049 */
1050 static void
fp_new_lwp(void * parent,void * child)1051 fp_new_lwp(void *parent, void *child)
1052 {
1053 kthread_id_t t = parent, ct = child;
1054 struct fpu_ctx *fp; /* parent fpu context */
1055 struct fpu_ctx *cfp; /* new fpu context */
1056 struct fxsave_state *fx, *cfx;
1057 struct xsave_state *cxs;
1058
1059 ASSERT(fp_kind != FP_NO);
1060
1061 fp = &t->t_lwp->lwp_pcb.pcb_fpu;
1062 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
1063
1064 /*
1065 * If the parent FPU state is still in the FPU hw then save it;
1066 * conveniently, fp_save() already does this for us nicely.
1067 */
1068 fp_save(fp);
1069
1070 cfp->fpu_flags = FPU_EN | FPU_VALID;
1071 cfp->fpu_regs.kfpu_status = 0;
1072 cfp->fpu_regs.kfpu_xstatus = 0;
1073
1074 /*
1075 * Make sure that the child's FPU is cleaned up and made ready for user
1076 * land.
1077 */
1078 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
1079
1080 switch (fp_save_mech) {
1081 case FP_FXSAVE:
1082 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1083 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
1084 bcopy(&sse_initial, cfx, sizeof (*cfx));
1085 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
1086 cfx->fx_fcw = fx->fx_fcw;
1087 break;
1088
1089 case FP_XSAVE:
1090 cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
1091
1092 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
1093
1094 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1095 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
1096 cfx = &cxs->xs_fxsave;
1097
1098 bcopy(&avx_initial, cxs, sizeof (*cxs));
1099 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
1100 cfx->fx_fcw = fx->fx_fcw;
1101 cxs->xs_header.xsh_xstate_bv |=
1102 (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL);
1103 break;
1104 default:
1105 panic("Invalid fp_save_mech");
1106 /*NOTREACHED*/
1107 }
1108
1109 /*
1110 * Mark that both the parent and child need to have the FPU cleaned up
1111 * before returning to userland.
1112 */
1113
1114 ctxop_attach(ct, fp_ctxop_allocate(cfp));
1115 }
1116
1117 /*
1118 * Free any state associated with floating point context.
1119 * Fp_free can be called in three cases:
1120 * 1) from reaper -> thread_free -> freectx-> fp_free
1121 * fp context belongs to a thread on deathrow
1122 * nothing to do, thread will never be resumed
1123 * thread calling ctxfree is reaper
1124 *
1125 * 2) from exec -> freectx -> fp_free
1126 * fp context belongs to the current thread
1127 * must disable fpu, thread calling ctxfree is curthread
1128 *
1129 * 3) from restorecontext -> setfpregs -> fp_free
1130 * we have a modified context in the memory (lwp->pcb_fpu)
1131 * disable fpu and release the fp context for the CPU
1132 *
1133 */
1134 void
fp_free(struct fpu_ctx * fp)1135 fp_free(struct fpu_ctx *fp)
1136 {
1137 ASSERT(fp_kind != FP_NO);
1138
1139 if (fp->fpu_flags & FPU_VALID)
1140 return;
1141
1142 kpreempt_disable();
1143 /*
1144 * We want to do fpsave rather than fpdisable so that we can
1145 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
1146 */
1147 fp->fpu_flags |= FPU_VALID;
1148 /* If for current thread disable FP to track FPU_VALID */
1149 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
1150 /* Clear errors if any to prevent frstor from complaining */
1151 (void) fperr_reset();
1152 if (fp_kind & __FP_SSE)
1153 (void) fpxerr_reset();
1154 fpdisable();
1155 }
1156 kpreempt_enable();
1157 }
1158
1159 /*
1160 * Wrapper for freectx to make the types line up for fp_free()
1161 */
1162 static void
fp_free_ctx(void * arg,int isexec __unused)1163 fp_free_ctx(void *arg, int isexec __unused)
1164 {
1165 fp_free((struct fpu_ctx *)arg);
1166 }
1167
1168 /*
1169 * Store the floating point state and disable the floating point unit.
1170 */
1171 void
fp_save(struct fpu_ctx * fp)1172 fp_save(struct fpu_ctx *fp)
1173 {
1174 ASSERT(fp_kind != FP_NO);
1175
1176 kpreempt_disable();
1177 if (!fp || fp->fpu_flags & FPU_VALID ||
1178 (fp->fpu_flags & FPU_EN) == 0) {
1179 kpreempt_enable();
1180 return;
1181 }
1182 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
1183
1184 switch (fp_save_mech) {
1185 case FP_FXSAVE:
1186 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
1187 break;
1188
1189 case FP_XSAVE:
1190 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
1191 break;
1192 default:
1193 panic("Invalid fp_save_mech");
1194 /*NOTREACHED*/
1195 }
1196
1197 fp->fpu_flags |= FPU_VALID;
1198
1199 /*
1200 * We save the FPU as part of forking, execing, modifications via /proc,
1201 * restorecontext, etc. As such, we need to make sure that we return to
1202 * userland with valid state in the FPU. If we're context switched out
1203 * before we hit sys_rtt_common() we'll end up having restored the FPU
1204 * as part of the context ops operations. The restore logic always makes
1205 * sure that FPU_VALID is set before doing a restore so we don't restore
1206 * it a second time.
1207 */
1208 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
1209
1210 kpreempt_enable();
1211 }
1212
1213 /*
1214 * Restore the FPU context for the thread:
1215 * The possibilities are:
1216 * 1. No active FPU context: Load the new context into the FPU hw
1217 * and enable the FPU.
1218 */
1219 void
fp_restore(struct fpu_ctx * fp)1220 fp_restore(struct fpu_ctx *fp)
1221 {
1222 switch (fp_save_mech) {
1223 case FP_FXSAVE:
1224 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
1225 break;
1226
1227 case FP_XSAVE:
1228 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
1229 break;
1230 default:
1231 panic("Invalid fp_save_mech");
1232 /*NOTREACHED*/
1233 }
1234
1235 fp->fpu_flags &= ~FPU_VALID;
1236 }
1237
1238 /*
1239 * Reset the FPU such that it is in a valid state for a new thread that is
1240 * coming out of exec. The FPU will be in a usable state at this point. At this
1241 * point we know that the FPU state has already been allocated and if this
1242 * wasn't an init process, then it will have had fp_free() previously called.
1243 */
1244 void
fp_exec(void)1245 fp_exec(void)
1246 {
1247 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1248
1249 if (fp_save_mech == FP_XSAVE) {
1250 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
1251 }
1252
1253 struct ctxop *ctx = fp_ctxop_allocate(fp);
1254 /*
1255 * Make sure that we're not preempted in the middle of initializing the
1256 * FPU on CPU.
1257 */
1258 kpreempt_disable();
1259 ctxop_attach(curthread, ctx);
1260 fpinit();
1261 fp->fpu_flags = FPU_EN;
1262 kpreempt_enable();
1263 }
1264
1265
1266 /*
1267 * Seeds the initial state for the current thread. The possibilities are:
1268 * 1. Another process has modified the FPU state before we have done any
1269 * initialization: Load the FPU state from the LWP state.
1270 * 2. The FPU state has not been externally modified: Load a clean state.
1271 */
1272 void
fp_seed(void)1273 fp_seed(void)
1274 {
1275 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
1276
1277 ASSERT(curthread->t_preempt >= 1);
1278 ASSERT((fp->fpu_flags & FPU_EN) == 0);
1279
1280 /*
1281 * Always initialize a new context and initialize the hardware.
1282 */
1283 if (fp_save_mech == FP_XSAVE) {
1284 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
1285 }
1286
1287 ctxop_attach(curthread, fp_ctxop_allocate(fp));
1288 fpinit();
1289
1290 /*
1291 * If FPU_VALID is set, it means someone has modified registers via
1292 * /proc. In this case, restore the current lwp's state.
1293 */
1294 if (fp->fpu_flags & FPU_VALID)
1295 fp_restore(fp);
1296
1297 ASSERT((fp->fpu_flags & FPU_VALID) == 0);
1298 fp->fpu_flags = FPU_EN;
1299 }
1300
1301 /*
1302 * When using xsave/xrstor, these three functions are used by the lwp code to
1303 * manage the memory for the xsave area.
1304 */
void
fp_lwp_init(klwp_t *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	/*
	 * We keep a copy of the pointer in lwp_fpu so that we can restore the
	 * value in forklwp() after we duplicate the parent's LWP state.
	 */
	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
	fp->fpu_signal = NULL;

	if (fp_save_mech == FP_XSAVE) {
		/*
		 * We bzero since the fpinit() code path will only
		 * partially initialize the xsave area using avx_initial.
		 */
		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
	}
}
1328
1329 void
fp_lwp_cleanup(klwp_t * lwp)1330 fp_lwp_cleanup(klwp_t *lwp)
1331 {
1332 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
1333
1334 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
1335 kmem_cache_free(fpsave_cachep,
1336 fp->fpu_regs.kfpu_u.kfpu_generic);
1337 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
1338 }
1339
1340 if (fp->fpu_signal != NULL) {
1341 kmem_cache_free(fpsave_cachep, fp->fpu_signal);
1342 fp->fpu_signal = NULL;
1343 }
1344 }
1345
1346 /*
1347 * Called during the process of forklwp(). The kfpu_u pointer will have been
1348 * overwritten while copying the parent's LWP structure. We have a valid copy
1349 * stashed in the child's lwp_fpu which we use to restore the correct value.
1350 */
1351 void
fp_lwp_dup(klwp_t * lwp)1352 fp_lwp_dup(klwp_t *lwp)
1353 {
1354 void *xp = lwp->lwp_fpu;
1355 size_t sz;
1356
1357 switch (fp_save_mech) {
1358 case FP_FXSAVE:
1359 sz = sizeof (struct fxsave_state);
1360 break;
1361 case FP_XSAVE:
1362 sz = cpuid_get_xsave_size();
1363 break;
1364 default:
1365 panic("Invalid fp_save_mech");
1366 /*NOTREACHED*/
1367 }
1368
1369 /* copy the parent's values into the new lwp's struct */
1370 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
1371 /* now restore the pointer */
1372 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
1373 /* Ensure that we don't inherit our parent's signal state */
1374 lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL;
1375 }
1376
1377 /*
1378 * Handle a processor extension error fault
1379 * Returns non zero for error.
1380 */
1381
/*ARGSUSED*/
int
fpexterrflt(struct regs *rp)
{
	uint32_t fpcw, fpsw;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind != FP_NO);

	/*
	 * Now we can enable the interrupts.
	 * (NOTE: x87 fp exceptions come thru interrupt gate)
	 */
	sti();

	if (!fpu_exists)
		return (FPE_FLTINV);

	/*
	 * Do an unconditional save of the FP state. If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty (it may not be, due to an
	 * intervening save caused by a context switch between the sti()
	 * above and here), then it's safe to just use the stored values in
	 * the context save area to determine the cause of the fault.
	 */
	fp_save(fp);

	/* clear exception flags in saved state, as if by fnclex */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
		break;

	case FP_XSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
		/*
		 * Always set LEGACY_FP as it may have been cleared by XSAVE
		 * instruction
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
		    XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/* Record the pre-clear status word for debuggers / core dumps. */
	fp->fpu_regs.kfpu_status = fpsw;

	if ((fpsw & FPS_ES) == 0)
		return (0);	/* No exception */

	/*
	 * "and" the exception flags with the complement of the mask
	 * bits to determine which exception occurred
	 */
	return (fpe_sicode(fpsw & ~fpcw & 0x3f));
}
1445
1446 /*
1447 * Handle an SSE/SSE2 precise exception.
1448 * Returns a non-zero sicode for error.
1449 */
/*ARGSUSED*/
int
fpsimderrflt(struct regs *rp)
{
	uint32_t mxcsr, xmask;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind & __FP_SSE);

	/*
	 * NOTE: Interrupts are disabled during execution of this
	 * function. They are enabled by the caller in trap.c.
	 */

	/*
	 * The only way we could have gotten here if there is no FP unit
	 * is via a user executing an INT $19 instruction, so there is
	 * no fault in that case.
	 */
	if (!fpu_exists)
		return (0);

	/*
	 * Do an unconditional save of the FP state. If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty, then it's safe to just use
	 * the stored values in the context save area to determine the
	 * cause of the fault.
	 */
	fp_save(fp);		/* save the FPU state */

	/* Pull MXCSR and the x87 status word out of the saved image. */
	if (fp_save_mech == FP_XSAVE) {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
		fp->fpu_regs.kfpu_status =
		    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
	} else {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
		fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
	}
	fp->fpu_regs.kfpu_xstatus = mxcsr;

	/*
	 * compute the mask that determines which conditions can cause
	 * a #xm exception, and use this to clean the status bits so that
	 * we can identify the true cause of this one.
	 */
	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
	return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
}
1499
1500 /*
1501 * In the unlikely event that someone is relying on this subcode being
1502 * FPE_FLTILL for denormalize exceptions, it can always be patched back
1503 * again to restore old behaviour.
1504 */
1505 int fpe_fltden = FPE_FLTDEN;
1506
1507 /*
1508 * Map from the FPU status word to the FP exception si_code.
1509 */
1510 static int
fpe_sicode(uint_t sw)1511 fpe_sicode(uint_t sw)
1512 {
1513 if (sw & FPS_IE)
1514 return (FPE_FLTINV);
1515 if (sw & FPS_ZE)
1516 return (FPE_FLTDIV);
1517 if (sw & FPS_DE)
1518 return (fpe_fltden);
1519 if (sw & FPS_OE)
1520 return (FPE_FLTOVF);
1521 if (sw & FPS_UE)
1522 return (FPE_FLTUND);
1523 if (sw & FPS_PE)
1524 return (FPE_FLTRES);
1525 return (FPE_FLTINV); /* default si_code for other exceptions */
1526 }
1527
1528 /*
1529 * Map from the SSE status word to the FP exception si_code.
1530 */
1531 static int
fpe_simd_sicode(uint_t sw)1532 fpe_simd_sicode(uint_t sw)
1533 {
1534 if (sw & SSE_IE)
1535 return (FPE_FLTINV);
1536 if (sw & SSE_ZE)
1537 return (FPE_FLTDIV);
1538 if (sw & SSE_DE)
1539 return (FPE_FLTDEN);
1540 if (sw & SSE_OE)
1541 return (FPE_FLTOVF);
1542 if (sw & SSE_UE)
1543 return (FPE_FLTUND);
1544 if (sw & SSE_PE)
1545 return (FPE_FLTRES);
1546 return (FPE_FLTINV); /* default si_code for other exceptions */
1547 }
1548
1549 /*
1550 * This routine is invoked as part of libc's __fpstart implementation
1551 * via sysi86(2).
1552 *
1553 * It may be called -before- any context has been assigned in which case
1554 * we try and avoid touching the hardware. Or it may be invoked well
1555 * after the context has been assigned and fiddled with, in which case
1556 * just tweak it directly.
1557 */
void
fpsetcw(uint16_t fcw, uint32_t mxcsr)
{
	struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
	struct fxsave_state *fx;

	if (!fpu_exists || fp_kind == FP_NO)
		return;

	if ((fp->fpu_flags & FPU_EN) == 0) {
		if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
			/*
			 * Common case. Floating point unit not yet
			 * enabled, and kernel already intends to initialize
			 * the hardware the way the caller wants.
			 */
			return;
		}
		/*
		 * Hmm. Userland wants a different default.
		 * Do a fake "first trap" to establish the context, then
		 * handle as if we already had a context before we came in.
		 */
		kpreempt_disable();
		fp_seed();
		kpreempt_enable();
	}

	/*
	 * Ensure that the current hardware state is flushed back to the
	 * pcb, then modify that copy. Next use of the fp will
	 * restore the context.
	 */
	fp_save(fp);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		fx->fx_fcw = fcw;
		/* sse_mxcsr_mask filters out unsupported MXCSR bits. */
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		break;

	case FP_XSAVE:
		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		fx->fx_fcw = fcw;
		/* sse_mxcsr_mask filters out unsupported MXCSR bits. */
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		/*
		 * Always set LEGACY_FP as it may have been cleared by XSAVE
		 * instruction
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
		    XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}
}
1616
/*
 * Reset a kernel FPU state to the system's initial FPU image so it is ready
 * for kernel_fpu_begin() to restore it onto the hardware.
 */
static void
kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
{
	struct xsave_state *xs;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		kfpu->kfpu_ctx.fpu_xsave_mask = 0;
		break;
	case FP_XSAVE:
		xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
		/* Clear the whole (variable-sized) area before seeding it. */
		bzero(xs, cpuid_get_xsave_size());
		bcopy(&avx_initial, xs, sizeof (*xs));
		xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
		break;
	default:
		panic("invalid fp_save_mech");
	}

	/*
	 * Set the corresponding flags that the system expects on the FPU state
	 * to indicate that this is our state. The FPU_EN flag is required to
	 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
	 * not set below as it represents that this state is being suppressed
	 * by the kernel.
	 */
	kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
	kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
}
1649
1650 kfpu_state_t *
kernel_fpu_alloc(int kmflags)1651 kernel_fpu_alloc(int kmflags)
1652 {
1653 kfpu_state_t *kfpu;
1654
1655 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
1656 return (NULL);
1657 }
1658
1659 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
1660 kmem_cache_alloc(fpsave_cachep, kmflags);
1661 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
1662 kmem_free(kfpu, sizeof (kfpu_state_t));
1663 return (NULL);
1664 }
1665
1666 kernel_fpu_fpstate_init(kfpu);
1667
1668 return (kfpu);
1669 }
1670
1671 void
kernel_fpu_free(kfpu_state_t * kfpu)1672 kernel_fpu_free(kfpu_state_t *kfpu)
1673 {
1674 kmem_cache_free(fpsave_cachep,
1675 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
1676 kmem_free(kfpu, sizeof (kfpu_state_t));
1677 }
1678
/*
 * ctxop save handler: preserve the in-kernel FPU state when a thread that is
 * between kernel_fpu_begin() and kernel_fpu_end() switches off-CPU.
 */
static void
kernel_fpu_ctx_save(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP and
		 * no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		fp_save(pf);
	} else {
		pf = &kfpu->kfpu_ctx;

		/* Only the thread that owns this state may save it. */
		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		/*
		 * Note, we can't use fp_save because it assumes that we're
		 * saving to the thread's PCB and not somewhere else. Because
		 * this is a different FPU context, we instead have to do this
		 * ourselves.
		 */
		switch (fp_save_mech) {
		case FP_FXSAVE:
			fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
			break;
		case FP_XSAVE:
			xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
			break;
		default:
			panic("Invalid fp_save_mech");
		}

		/*
		 * Because we have saved context here, our save state is no
		 * longer valid and therefore needs to be reinitialized.
		 */
		kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
	}

	pf->fpu_flags |= FPU_VALID;

	/*
	 * Clear KFPU flag. This allows swtch to check for improper kernel
	 * usage of the FPU (i.e. switching to a new thread while the old
	 * thread was in the kernel and using the FPU, but did not perform a
	 * context save).
	 */
	curthread->t_flag &= ~T_KFPU;
}
1736
/*
 * ctxop restore handler: reload the in-kernel FPU state when a thread that is
 * between kernel_fpu_begin() and kernel_fpu_end() switches back on-CPU.
 */
static void
kernel_fpu_ctx_restore(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP and
		 * no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	} else {
		pf = &kfpu->kfpu_ctx;

		/* Only the thread that owns this state may restore it. */
		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	}

	/* Reload the registers and mark kernel FPU usage active again. */
	fp_restore(pf);
	curthread->t_flag |= T_KFPU;
}
1762
1763 /*
1764 * Validate that the thread is not switching off-cpu while actively using the
1765 * FPU within the kernel.
1766 */
1767 void
kernel_fpu_no_swtch(void)1768 kernel_fpu_no_swtch(void)
1769 {
1770 if ((curthread->t_flag & T_KFPU) != 0) {
1771 panic("curthread swtch-ing while the kernel is using the FPU");
1772 }
1773 }
1774
/*
 * Context operations installed by kernel_fpu_begin() so kernel FPU state is
 * saved/restored across context switches; removed in kernel_fpu_end().
 */
static const struct ctxop_template kfpu_ctxop_tpl = {
	.ct_rev = CTXOP_TPL_REV,
	.ct_save = kernel_fpu_ctx_save,
	.ct_restore = kernel_fpu_ctx_restore,
};
1780
/*
 * Begin a section of kernel FPU usage. 'kfpu' supplies the save area for the
 * kernel's FPU state (may be NULL with KFPU_USE_LWP or KFPU_NO_STATE); see the
 * flag handling below for the three supported modes.
 */
void
kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
{
	klwp_t *pl = curthread->t_lwp;
	struct ctxop *ctx;

	/* Nested kernel FPU sections are not supported. */
	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread attempting to nest kernel FPU states");
	}

	/* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
	ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
	    (KFPU_USE_LWP | KFPU_NO_STATE));

	if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
		/*
		 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
		 * hold our kernel FPU context, we depend on the caller doing
		 * kpreempt_disable for the duration of our FPU usage. This
		 * should only be done for very short periods of time.
		 */
		ASSERT(curthread->t_preempt > 0);
		ASSERT(kfpu == NULL);

		if (pl != NULL) {
			/*
			 * We might have already saved once so FPU_VALID could
			 * be set. This is handled in fp_save.
			 */
			fp_save(&pl->lwp_pcb.pcb_fpu);
			pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
		}

		curthread->t_flag |= T_KFPU;

		/* Always restore the fpu to the initial state. */
		fpinit();

		return;
	}

	/*
	 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
	 */

	if ((flags & KFPU_USE_LWP) == 0) {
		if (kfpu->kfpu_curthread != NULL)
			panic("attempting to reuse kernel FPU state at %p when "
			    "another thread already is using", kfpu);

		if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
			kernel_fpu_fpstate_init(kfpu);

		kfpu->kfpu_curthread = curthread;
	}

	/*
	 * Not all threads may have an active LWP. If they do and we're not
	 * going to re-use the LWP, then we should go ahead and save the state.
	 * We must also note that the fpu is now being used by the kernel and
	 * therefore we do not want to manage the fpu state via the user-level
	 * thread's context handlers.
	 *
	 * We might have already saved once (due to a prior use of the kernel
	 * FPU or another code path) so FPU_VALID could be set. This is handled
	 * by fp_save, as is the FPU_EN check.
	 */
	ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
	kpreempt_disable();
	if (pl != NULL) {
		if ((flags & KFPU_USE_LWP) == 0)
			fp_save(&pl->lwp_pcb.pcb_fpu);
		pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
	}

	/*
	 * Set the context operations for kernel FPU usage. Because kernel FPU
	 * setup and ctxop attachment needs to happen under the protection of
	 * kpreempt_disable(), we allocate the ctxop outside the guard so its
	 * sleeping allocation will not cause a voluntary swtch(). This allows
	 * the rest of the initialization to proceed, ensuring valid state for
	 * the ctxop handlers.
	 */
	ctxop_attach(curthread, ctx);
	curthread->t_flag |= T_KFPU;

	if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
		/*
		 * For pure kernel threads with an LWP, we can use the LWP's
		 * pcb_fpu to save/restore context.
		 */
		fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;

		VERIFY(curthread->t_procp->p_flag & SSYS);
		VERIFY(kfpu == NULL);
		ASSERT((pf->fpu_flags & FPU_EN) == 0);

		/* Always restore the fpu to the initial state. */
		if (fp_save_mech == FP_XSAVE)
			pf->fpu_xsave_mask = XFEATURE_FP_ALL;
		fpinit();
		pf->fpu_flags = FPU_EN | FPU_KERNEL;
	} else {
		/* initialize the kfpu state */
		kernel_fpu_ctx_restore(kfpu);
	}
	kpreempt_enable();
}
1889
/*
 * End a section of kernel FPU usage started by kernel_fpu_begin(). 'flags'
 * must match those passed to the corresponding kernel_fpu_begin() call.
 */
void
kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
{
	if ((curthread->t_flag & T_KFPU) == 0) {
		panic("curthread attempting to clear kernel FPU state "
		    "without using it");
	}

	/*
	 * General comments on why the rest of this function is structured the
	 * way it is. Be aware that there is a lot of subtlety here.
	 *
	 * If a user-level thread ever uses the fpu while in the kernel, then
	 * we cannot call fpdisable since that does STTS. That will set the
	 * ts bit in %cr0 which will cause an exception if anything touches the
	 * fpu. However, the user-level context switch handler (fpsave_ctxt)
	 * needs to access the fpu to save the registers into the pcb.
	 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
	 * fprestore_ctxt when the thread context switched onto the CPU.
	 *
	 * Calling fpdisable only effects the current CPU's %cr0 register.
	 *
	 * During ctxop_remove and kpreempt_enable, we can voluntarily context
	 * switch, so the CPU we were on when we entered this function might
	 * not be the same one we're on when we return from ctxop_remove or end
	 * the function. Note there can be user-level context switch handlers
	 * still installed if this is a user-level thread.
	 *
	 * We also must be careful in the unlikely chance we're running in an
	 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
	 * incorrectly for the "real" thread to resume on this CPU.
	 */

	if ((flags & KFPU_NO_STATE) == 0) {
		kpreempt_disable();
	} else {
		ASSERT(curthread->t_preempt > 0);
	}

	curthread->t_flag &= ~T_KFPU;

	/*
	 * When we are ending things, we explicitly don't save the current
	 * kernel FPU state back to the temporary state. The kfpu API is not
	 * intended to be a permanent save location.
	 *
	 * If this is a user-level thread and we were to context switch
	 * before returning to user-land, fpsave_ctxt will be a no-op since we
	 * already saved the user-level FPU state the first time we run
	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
	 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
	 * fprestore_ctxt will be done in sys_rtt_common when the thread
	 * finally returns to user-land.
	 */

	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
	    curthread->t_intr == NULL) {
		/*
		 * A kernel thread which is not an interrupt thread, so we
		 * STTS now.
		 */
		fpdisable();
	}

	if ((flags & KFPU_NO_STATE) == 0) {
		ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);

		if (kfpu != NULL) {
			if (kfpu->kfpu_curthread != curthread) {
				panic("attempting to end kernel FPU state "
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	if (curthread->t_lwp != NULL) {
		uint_t f;

		/*
		 * The KFPU_USE_LWP path in kernel_fpu_begin() also set
		 * FPU_EN on the pcb, so clear it here in that case.
		 */
		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}
1982
1983 /*
1984 * Fill in FPU information that is required by exec.
1985 */
1986 void
fpu_auxv_info(int * typep,size_t * lenp)1987 fpu_auxv_info(int *typep, size_t *lenp)
1988 {
1989 *typep = fp_elf;
1990 switch (fp_save_mech) {
1991 case FP_FXSAVE:
1992 *lenp = sizeof (struct fxsave_state);
1993 break;
1994 case FP_XSAVE:
1995 *lenp = cpuid_get_xsave_size();
1996 break;
1997 default:
1998 *lenp = 0;
1999 break;
2000 }
2001 }
2002
2003 /*
2004 * This function exists to transform an xsave_state into an fxsave_state. The
2005 * way that we have to do this is nuanced. We assume that callers have already
2006 * handled FPU_EN and thus we only need to consider the xsave_state and its
2007 * component vector itself. This results in the following cases that we need to
2008 * consider:
2009 *
2010 * o Neither the x87 / XMM state bits are set. We use the hardware default and
2011 * need to ensure to copy the xsave header.
2012 * o Both x87 / XMM state bits are set. We can copy everything.
2013 * o Only the x87 bit is set. We need to copy the x87 state but make the XMM
2014 * state be in the initial case.
2015 * o Only the XMM bit is set. The reverse of the above case.
2016 *
2017 * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are
2018 * generally the same; however, the default floating point control word is
2019 * different.
2020 *
2021 * Finally, we have the complication of the MXCSR and MCXSR_MASK registers.
2022 * Because we are using xsave and xsaveopt in the kernel right now and not
2023 * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the
2024 * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX
2025 * is set, we must also come back and copy out the MXCSR register. Sorry, we
2026 * don't make the rules.
2027 */
static void
fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx)
{
	const uint64_t comps = xsave->xs_header.xsh_xstate_bv;

	switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) {
	case XFEATURE_LEGACY_FP | XFEATURE_SSE:
		/* Both components valid: the whole fxsave image is live. */
		bcopy(xsave, fx, sizeof (*fx));
		return;
	case XFEATURE_LEGACY_FP:
		/*
		 * x87 only: copy everything up to the %xmm registers, then
		 * force the MXCSR fields to their initial values.
		 */
		bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm));
		fx->fx_mxcsr = SSE_MXCSR_INIT;
		fx->fx_mxcsr_mask = 0;
		break;
	case XFEATURE_SSE:
		/*
		 * XMM only: start from the illumos default image, then layer
		 * the live MXCSR and %xmm registers on top. The hardware
		 * default control word (FPU_CW_INIT_HW) is used here, per the
		 * block comment above.
		 */
		bcopy(&sse_initial, fx, offsetof(struct fxsave_state,
		    fx_mxcsr));

		fx->fx_fcw = FPU_CW_INIT_HW;
		fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
		fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
		bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm));
		break;
	default:
		/* Neither component in use: hardware-default everything. */
		bcopy(&sse_initial, fx, sizeof (*fx));
		fx->fx_fcw = FPU_CW_INIT_HW;
		break;
	}

	/*
	 * Account for the AVX causing MXCSR to be valid.
	 */
	if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 &&
	    (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) {
		fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr;
		fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask;
	}
}
2066
2067 /*
2068 * This function is designed to answer the question of are we using any xsave
2069 * family of instructions in context switch and therefore we have this state.
2070 * This should still remain true if we are using xsavec or xsaves in the kernel
2071 * in the future.
2072 */
2073 boolean_t
fpu_xsave_enabled(void)2074 fpu_xsave_enabled(void)
2075 {
2076 return (fp_save_mech == FP_XSAVE);
2077 }
2078
2079 /*
2080 * The following structure is used to track and manage the programmatic
2081 * construction of /proc and signal stack spilling of xsave information. All
2082 * known xsave types that the kernel supports must be included here.
2083 */
typedef struct xsave_proc_info {
	/*
	 * This matches the /proc xregs type that this data represents. This is
	 * used for /proc only.
	 */
	uint32_t xi_type;
	/*
	 * This indicates the size of the /proc data that we're operating on.
	 * This is only used for /proc.
	 */
	size_t xi_size;
	/*
	 * This indicates the alignment that we want to have for the member when
	 * we're writing out. This is not used when setting data. This is only
	 * used for /proc.
	 */
	size_t xi_align;
	/*
	 * This indicates whether this member must always be considered or not.
	 * This is used in both /proc and context/signal handling.
	 */
	bool xi_always;
	/*
	 * This contains the corresponding bits in the xsave bit vector that
	 * corresponds to this entry. This is used for both /proc and
	 * context/signal handling.
	 */
	uint64_t xi_bits;
	/*
	 * The xi_fill function pointer is used to write out the /proc regset
	 * data (e.g. when a user reads xregs). This is only used for the /proc
	 * handling. The xi_valid function pointer is used instead to validate a
	 * given set of data that we've read in, while the xi_set pointer is
	 * used to actually transform the data in the underlying fpu save area.
	 */
	void (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *,
	    void *);
	bool (*xi_valid)(model_t, const void *);
	void (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *,
	    uint64_t, const void *);
	/*
	 * The xi_signal_in and xi_signal_out function pointers are used for
	 * extended context and signal handling information. They are used when
	 * reading in data from a ucontext_t and writing it out respectively.
	 * These are only used for context/signal handling.
	 */
	int (*xi_signal_in)(const struct xsave_proc_info *,
	    const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *,
	    const uintptr_t);
	int (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f,
	    uc_xsave_t *, const void *fpup, uintptr_t);
} xsave_proc_info_t;
2136
2137 static bool
fpu_proc_xregs_initial_state(const fpu_ctx_t * fpu,uint64_t feats)2138 fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats)
2139 {
2140 const struct xsave_state *xs = fpu->fpu_regs.kfpu_u.kfpu_xs;
2141
2142 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
2143 return (true);
2144 }
2145
2146 return ((xs->xs_header.xsh_xstate_bv & feats) == 0);
2147 }
2148
2149 static void
fpu_proc_xregs_xcr_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2150 fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2151 void *datap)
2152 {
2153 prxregset_xcr_t *xcr = datap;
2154
2155 xcr->prx_xcr_xcr0 = xsave_bv_all;
2156 }
2157
2158 /*
2159 * Unlike other instruction portions, we treat the xsave header and the legacy
2160 * XMM section together as both are somewhat tied at the instruction hip. Unlike
2161 * the when dealing with other xsave regions like the ymm and zmm components,
2162 * the initial state here is much more nuanced as it has to match what we actual
2163 * do in the OS and depends on the components that are present.
2164 */
static void
fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
    void *datap)
{
	prxregset_xsave_t *prxsave = datap;
	const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs;
	size_t hdr_off;

	/*
	 * In the x87/XMM case, the no device vs. initial state is different
	 * because the initial state case still wants us to copy the real xsave
	 * header. It's also worth calling out that the actual illumos default
	 * fxsave state is not the same as what Intel documents. The main
	 * difference is in what the x87 FPU control word is. This results in
	 * the following different cases that we need to think about:
	 *
	 * o FPU_EN is not set. So we use the illumos default.
	 * o FPU_EN is set: fpu_xsave_to_fxsave() below handles the
	 *   per-component validity bits in the saved image.
	 */
	if ((fpu->fpu_flags & FPU_EN) == 0) {
		bcopy(&avx_initial, prxsave, sizeof (*prxsave));
		return;
	}

	/*
	 * Convert all the fxsave region while taking into account the validity
	 * of the xsave bits. The prxregset_xsave_t structure is the same as the
	 * xsave structure in our ABI and Intel designed the xsave header to
	 * begin with the 512-bit fxsave structure.
	 */
	fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave);

	/*
	 * Now that we've dealt with the x87 and XMM state, take care of the
	 * header.
	 */
	hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv);
	bcopy((const void *)((uintptr_t)xsave + hdr_off),
	    (void *)((uintptr_t)prxsave + hdr_off),
	    sizeof (struct xsave_header));
}
2205
2206 static void
fpu_proc_xregs_std_fill(const fpu_ctx_t * fpu,const xsave_proc_info_t * info,void * datap)2207 fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2208 void *datap)
2209 {
2210 if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) {
2211 size_t size, off;
2212 const void *xsave_off;
2213
2214 cpuid_get_xsave_info(info->xi_bits, &size, &off);
2215 ASSERT3U(size, ==, info->xi_size);
2216 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2217 off);
2218 bcopy(xsave_off, datap, info->xi_size);
2219 }
2220 }
2221
2222 /*
2223 * Users are not allowed to actually set the xcr information this way. However,
2224 * to make it easier for someone to just do a read, modify, write, of the xregs
2225 * data, if it is identical, then we will accept it (and do nothing).
2226 */
2227 static bool
fpu_proc_xregs_xcr_valid(model_t model,const void * datap)2228 fpu_proc_xregs_xcr_valid(model_t model, const void *datap)
2229 {
2230 const prxregset_xcr_t *xcr = datap;
2231
2232 return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 &&
2233 xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0);
2234 }
2235
2236 /*
2237 * To match traditional /proc semantics, we do not error if reserved bits of
2238 * MXCSR are set, they will be masked off when writing data. We do not allow
2239 * someone to indicate that they are asking for compressed xsave data, hence the
2240 * check that prx_xsh_comp_bv is zero. Separately, in fpu_proc_xregs_set() we
2241 * check that each component that was indicated in the xstate_bv is actually
2242 * present.
2243 */
2244 static bool
fpu_proc_xregs_xsave_valid(model_t model,const void * datap)2245 fpu_proc_xregs_xsave_valid(model_t model, const void *datap)
2246 {
2247 const prxregset_xsave_t *xsave = datap;
2248 uint64_t rsvd[6] = { 0 };
2249
2250 if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 ||
2251 xsave->prx_xsh_xcomp_bv != 0) {
2252 return (false);
2253 }
2254
2255 if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) {
2256 return (false);
2257 }
2258
2259 return (true);
2260 }
2261
2262 /*
2263 * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment
2264 * on x86; however, when operating in ILP32, subsets are reserved. We require
2265 * that all reserved portions are set to zero.
2266 */
2267 static bool
fpu_proc_xregs_ymm_valid(model_t model,const void * datap)2268 fpu_proc_xregs_ymm_valid(model_t model, const void *datap)
2269 {
2270 upad128_t ymm_zero[8];
2271 const prxregset_ymm_t *ymm = datap;
2272
2273 if (model == DATAMODEL_LP64) {
2274 return (true);
2275 }
2276
2277 bzero(&ymm_zero, sizeof (ymm_zero));
2278 return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0);
2279 }
2280
2281 static bool
fpu_proc_xregs_zmm_valid(model_t model,const void * datap)2282 fpu_proc_xregs_zmm_valid(model_t model, const void *datap)
2283 {
2284 upad256_t zmm_zero[8];
2285 const prxregset_zmm_t *zmm = datap;
2286
2287 if (model == DATAMODEL_LP64) {
2288 return (true);
2289 }
2290
2291 bzero(&zmm_zero, sizeof (zmm_zero));
2292 return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0);
2293 }
2294
2295 static bool
fpu_proc_xregs_hi_zmm_valid(model_t model,const void * datap)2296 fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap)
2297 {
2298 prxregset_hi_zmm_t hi_zmm_zero;
2299 const prxregset_hi_zmm_t *hi_zmm = datap;
2300
2301 if (model == DATAMODEL_LP64) {
2302 return (true);
2303 }
2304
2305 bzero(&hi_zmm_zero, sizeof (hi_zmm_zero));
2306 return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0);
2307 }
2308
2309 /*
2310 * The xsave state consists of the first 512 bytes of the XMM state and then the
2311 * xsave header itself. Because of the xsave header, this structure is marked
2312 * with xi_always, so we must always process and consider it.
2313 *
2314 * Semantically if either of the bits around SSE / x87 is set, then we will copy
2315 * the entire thing. This may mean that we end up copying a region that is not
2316 * valid into the save area; however, that should be OK as we still have the
2317 * specific bit flags that indicate what we should consider or not.
2318 *
2319 * There is one additional wrinkle we need to consider and honor here. The CPU
2320 * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
2321 * anything else. So if this is set and we do not have a valid x87/XMM bits
2322 * set then we will set the MXCSR to its default state in case the processor
2323 * tries to load it. For reference see:
2324 *
2325 * o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
2326 * o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
2327 *
2328 * Note, the behavior around this changes depending on whether using the
2329 * compressed xrstor or not. We are not, but it's worth being aware of. We do
2330 * not worry about MXCSR_MASK because the instructions ignore it.
2331 */
2332 static void
fpu_proc_xregs_xsave_set(fpu_ctx_t * fpu,const xsave_proc_info_t * info,uint64_t xsave_bv,const void * datap)2333 fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2334 uint64_t xsave_bv, const void *datap)
2335 {
2336 const struct xsave_state *src_xs = datap;
2337 struct xsave_state *targ_xs = fpu->fpu_regs.kfpu_u.kfpu_xs;
2338
2339 if ((xsave_bv & info->xi_bits) != 0) {
2340 bcopy(&src_xs->xs_fxsave, &targ_xs->xs_fxsave,
2341 sizeof (struct fxsave_state));
2342 } else if ((xsave_bv & XFEATURE_AVX) != 0) {
2343 targ_xs->xs_fxsave.fx_mxcsr = SSE_MXCSR_INIT;
2344 }
2345
2346 bcopy(&src_xs->xs_header, &targ_xs->xs_header,
2347 sizeof (struct xsave_header));
2348 targ_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2349 }
2350
2351 static void
fpu_proc_xregs_std_set(fpu_ctx_t * fpu,const xsave_proc_info_t * info,uint64_t xsave_bv,const void * datap)2352 fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info,
2353 uint64_t xsave_bv, const void *datap)
2354 {
2355 size_t size, off;
2356 void *xsave_off;
2357
2358 cpuid_get_xsave_info(info->xi_bits, &size, &off);
2359 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs +
2360 off);
2361 bcopy(datap, xsave_off, size);
2362 }
2363
2364 /*
2365 * Dealing with XMM data is a little more annoying in signal context. If UC_FPU
2366 * is set, the ucontext_t's fpregset_t contains a copy of the XMM region. That
2367 * must take priority over an XMM region that showed up in the uc_xsave_t data.
2368 * In the signal copyout code we do not save XMM region in the uc_xsave_t or set
2369 * it as a present component because of it being kept in the fpregset_t. Because
2370 * of this behavior, if we find the XMM (or x87) state bits present, we treat
2371 * that as an error.
2372 *
2373 * The system has always gone through and cleaned up the reserved bits in the
2374 * fxsave state when someone calls setcontext(). Therefore we need to do the
2375 * same thing which is why you see the masking of the mxcsr below.
2376 *
2377 * Finally, there is one last wrinkle here that we need to consider. The
2378 * fpregset_t has two private words which cache the status/exception
2379 * information. Therefore, we well... cheat. Intel has left bytes 464 (0x1d0)
2380 * through 511 (0x1ff) available for us to do what we want. So we will pass this
2381 * through that for the moment to help us pass this state around without too
2382 * much extra allocation.
2383 */
2384 static int
fpu_signal_copyin_xmm(const xsave_proc_info_t * info,const ucontext_t * kuc,const uc_xsave_t * ucx,void * fpup,uintptr_t * udatap,const uintptr_t max_udata)2385 fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc,
2386 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2387 const uintptr_t max_udata)
2388 {
2389 struct xsave_state *xsave = fpup;
2390
2391 if ((ucx->ucx_bv & info->xi_bits) != 0) {
2392 return (EINVAL);
2393 }
2394
2395 if ((kuc->uc_flags & UC_FPU) != 0) {
2396 bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave,
2397 sizeof (struct fxsave_state));
2398 xsave->xs_fxsave.__fx_ign2[3]._l[0] =
2399 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status;
2400 xsave->xs_fxsave.__fx_ign2[3]._l[1] =
2401 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus;
2402 xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask;
2403 xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2404 }
2405
2406 return (0);
2407 }
2408
2409 static int
fpu_signal_copyin_std(const xsave_proc_info_t * info,const ucontext_t * kuc,const uc_xsave_t * ucx,void * fpup,uintptr_t * udatap,const uintptr_t max_udata)2410 fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc,
2411 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap,
2412 const uintptr_t max_udata)
2413 {
2414 size_t len, xsave_off;
2415 void *copy_to;
2416 struct xsave_state *xsave = fpup;
2417
2418 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2419 if (*udatap + len > max_udata) {
2420 return (EOVERFLOW);
2421 }
2422
2423 copy_to = (void *)((uintptr_t)fpup + xsave_off);
2424 if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) {
2425 return (EFAULT);
2426 }
2427
2428 xsave->xs_header.xsh_xstate_bv |= info->xi_bits;
2429 *udatap = *udatap + len;
2430
2431 return (0);
2432 }
2433
2434 static int
fpu_signal_copyout_std(const xsave_proc_info_t * info,fpu_copyout_f copyfunc,uc_xsave_t * ucx,const void * fpup,uintptr_t udatap)2435 fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc,
2436 uc_xsave_t *ucx, const void *fpup, uintptr_t udatap)
2437 {
2438 size_t len, xsave_off;
2439 const void *copy_from;
2440 void *copy_to;
2441 int ret;
2442
2443 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off);
2444 copy_from = (void *)(uintptr_t)fpup + xsave_off;
2445 copy_to = (void *)(udatap + ucx->ucx_len);
2446
2447 ret = copyfunc(copy_from, copy_to, len);
2448 if (ret != 0) {
2449 return (ret);
2450 }
2451
2452 ucx->ucx_len += len;
2453 ucx->ucx_bv |= info->xi_bits;
2454 return (0);
2455 }
2456
2457 /*
2458 * This table contains information about the extended FPU states and synthetic
2459 * information we create for /proc, the ucontext_t, and signal handling. The
2460 * definition of the xsave_proc_info_t describes how each member is used.
2461 *
2462 * In general, this table is expected to be in the order of the xsave data
2463 * structure itself. Synthetic elements that we create can go anywhere and new
2464 * ones should be inserted at the end. This structure is walked in order to
2465 * produce the /proc and signal handling logic, so changing the order is
2466 * meaningful for those and should not be done lightly.
2467 */
2468 static const xsave_proc_info_t fpu_xsave_info[] = { {
2469 .xi_type = PRX_INFO_XCR,
2470 .xi_size = sizeof (prxregset_xcr_t),
2471 .xi_align = alignof (prxregset_xcr_t),
2472 .xi_always = true,
2473 .xi_bits = 0,
2474 .xi_fill = fpu_proc_xregs_xcr_fill,
2475 .xi_valid = fpu_proc_xregs_xcr_valid
2476 }, {
2477 /*
2478 * The XSAVE entry covers both the xsave header and the %xmm registers.
2479 * Note, there is no signal copyout information for the %xmm registers
2480 * because it is expected that that data is already in the fpregset_t.
2481 */
2482 .xi_type = PRX_INFO_XSAVE,
2483 .xi_size = sizeof (prxregset_xsave_t),
2484 .xi_align = FPU_ALIGN_XMM,
2485 .xi_always = true,
2486 .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE,
2487 .xi_fill = fpu_proc_xregs_xsave_fill,
2488 .xi_set = fpu_proc_xregs_xsave_set,
2489 .xi_valid = fpu_proc_xregs_xsave_valid,
2490 .xi_signal_in = fpu_signal_copyin_xmm
2491 }, {
2492 .xi_type = PRX_INFO_YMM,
2493 .xi_size = sizeof (prxregset_ymm_t),
2494 .xi_align = FPU_ALIGN_YMM,
2495 .xi_always = false,
2496 .xi_bits = XFEATURE_AVX,
2497 .xi_fill = fpu_proc_xregs_std_fill,
2498 .xi_set = fpu_proc_xregs_std_set,
2499 .xi_signal_in = fpu_signal_copyin_std,
2500 .xi_valid = fpu_proc_xregs_ymm_valid,
2501 .xi_signal_out = fpu_signal_copyout_std
2502 }, {
2503 /*
2504 * There is no /proc validation function for the mask registers because
2505 * they are the same in ILP32 / LP64 and there is nothing for us to
2506 * actually validate.
2507 */
2508 .xi_type = PRX_INFO_OPMASK,
2509 .xi_size = sizeof (prxregset_opmask_t),
2510 .xi_align = alignof (prxregset_opmask_t),
2511 .xi_always = false,
2512 .xi_bits = XFEATURE_AVX512_OPMASK,
2513 .xi_fill = fpu_proc_xregs_std_fill,
2514 .xi_set = fpu_proc_xregs_std_set,
2515 .xi_signal_in = fpu_signal_copyin_std,
2516 .xi_signal_out = fpu_signal_copyout_std
2517 }, {
2518 .xi_type = PRX_INFO_ZMM,
2519 .xi_size = sizeof (prxregset_zmm_t),
2520 .xi_align = FPU_ALIGN_ZMM,
2521 .xi_always = false,
2522 .xi_bits = XFEATURE_AVX512_ZMM,
2523 .xi_fill = fpu_proc_xregs_std_fill,
2524 .xi_set = fpu_proc_xregs_std_set,
2525 .xi_valid = fpu_proc_xregs_zmm_valid,
2526 .xi_signal_in = fpu_signal_copyin_std,
2527 .xi_signal_out = fpu_signal_copyout_std
2528 }, {
2529 .xi_type = PRX_INFO_HI_ZMM,
2530 .xi_size = sizeof (prxregset_hi_zmm_t),
2531 .xi_align = FPU_ALIGN_ZMM,
2532 .xi_always = false,
2533 .xi_bits = XFEATURE_AVX512_HI_ZMM,
2534 .xi_fill = fpu_proc_xregs_std_fill,
2535 .xi_set = fpu_proc_xregs_std_set,
2536 .xi_valid = fpu_proc_xregs_hi_zmm_valid,
2537 .xi_signal_in = fpu_signal_copyin_std,
2538 .xi_signal_out = fpu_signal_copyout_std
2539 } };
2540
2541 static bool
fpu_proc_xregs_include(const xsave_proc_info_t * infop)2542 fpu_proc_xregs_include(const xsave_proc_info_t *infop)
2543 {
2544 return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0);
2545 }
2546
2547 void
fpu_proc_xregs_info(struct proc * p __unused,uint32_t * ninfop,uint32_t * sizep,uint32_t * dstart)2548 fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep,
2549 uint32_t *dstart)
2550 {
2551 size_t ret = sizeof (prxregset_hdr_t);
2552 uint32_t ninfo = 0;
2553
2554 ASSERT(fpu_xsave_enabled());
2555
2556 /*
2557 * Right now the set of flags that are enabled in the FPU is global.
2558 * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the
2559 * actual things that might show up and we care about are all about what
2560 * is set up in %xcr0 which is stored in the global xsave_bv_all. If we
2561 * move to per-process FPU enablement which is likely to come with AMX,
2562 * then this will need the proc_t to look at, hence why we've set things
2563 * up with the unused variable above.
2564 *
2565 * We take two passes through the array. The first is just to count up
2566 * how many informational entries we need.
2567 */
2568 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2569 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2570 continue;
2571 ninfo++;
2572 }
2573
2574 ASSERT3U(ninfo, >, 0);
2575 ret += sizeof (prxregset_info_t) * ninfo;
2576
2577 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2578 size_t curphase;
2579 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2580 continue;
2581
2582 curphase = ret % fpu_xsave_info[i].xi_align;
2583 if (ret < fpu_xsave_info[i].xi_align) {
2584 ret = fpu_xsave_info[i].xi_align;
2585 } else if (curphase != 0) {
2586 ret += curphase;
2587 }
2588
2589 if (i == 0 && dstart != NULL) {
2590 *dstart = ret;
2591 }
2592
2593 ret += fpu_xsave_info[i].xi_size;
2594 }
2595
2596 VERIFY3U(ret, <=, UINT32_MAX);
2597 if (sizep != NULL) {
2598 *sizep = ret;
2599 }
2600
2601 if (ninfop != NULL) {
2602 *ninfop = ninfo;
2603 }
2604 }
2605
2606 /*
2607 * This function supports /proc. Because /proc does not have a process locked
2608 * while processing a PCSXREG, this tries to establish an upper bound that we
2609 * will validate later in fpu_proc_xregs_set(). We basically say that if you
2610 * take the maximum xsave size and add 1 KiB that is a good enough approximation
2611 * for the maximum size. The 1 KiB is us basically trying to rationalize the
2612 * overhead of our structures that we're adding right, while being cognisant of
2613 * differing alignments and the fact that the full xsave size is in some cases
2614 * (when supervisor states or features we don't support are present) going to be
2615 * larger than we would need for this.
2616 */
2617 size_t
fpu_proc_xregs_max_size(void)2618 fpu_proc_xregs_max_size(void)
2619 {
2620 VERIFY(fpu_xsave_enabled());
2621 return (cpuid_get_xsave_size() + 0x1000);
2622 }
2623
2624 /*
2625 * This functions supports /proc. In particular, it's meant to perform the
2626 * following:
2627 *
2628 * o Potentially save the current thread's registers.
2629 * o Write out the x86 xsave /proc xregs format data from the xsave data we
2630 * actually have. Note, this can be a little weird for cases where the FPU is
2631 * not actually enabled, which happens for system processes.
2632 */
2633 void
fpu_proc_xregs_get(klwp_t * lwp,void * buf)2634 fpu_proc_xregs_get(klwp_t *lwp, void *buf)
2635 {
2636 uint32_t size, ninfo, curinfo, dstart;
2637 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;
2638 prxregset_hdr_t *hdr = buf;
2639
2640 ASSERT(fpu_xsave_enabled());
2641 fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart);
2642
2643 /*
2644 * Before we get going, defensively zero out all the data buffer so that
2645 * the rest of the fill functions can assume a specific base.
2646 */
2647 bzero(buf, size);
2648
2649 kpreempt_disable();
2650 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
2651 /*
2652 * This case suggests that thread in question doesn't have a
2653 * valid FPU save state which should only happen when it is on
2654 * CPU. If this is the case, we must ensure that we save the
2655 * current FPU state before proceeding. We also sanity check
2656 * several things here before doing this as using /proc on
2657 * yourself is always exciting. fp_save() will ensure that the
2658 * thread is flagged to go back to being an eager FPU before
2659 * returning back to userland.
2660 */
2661 VERIFY3P(curthread, ==, lwptot(lwp));
2662 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2663 fp_save(fpu);
2664 }
2665 kpreempt_enable();
2666
2667 hdr->pr_type = PR_TYPE_XSAVE;
2668 hdr->pr_size = size;
2669 hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] =
2670 hdr->pr_pad[3] = 0;
2671 hdr->pr_ninfo = ninfo;
2672
2673 curinfo = 0;
2674 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2675 void *startp;
2676 uint32_t phase;
2677
2678 if (!fpu_proc_xregs_include(&fpu_xsave_info[i]))
2679 continue;
2680
2681 phase = dstart % fpu_xsave_info[i].xi_align;
2682 if (dstart < fpu_xsave_info[i].xi_align) {
2683 ASSERT3U(i, !=, 0);
2684 dstart = fpu_xsave_info[i].xi_align;
2685 } else if (phase != 0) {
2686 ASSERT3U(i, !=, 0);
2687 dstart += phase;
2688 }
2689
2690 hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type;
2691 hdr->pr_info[curinfo].pri_flags = 0;
2692 hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size;
2693 hdr->pr_info[curinfo].pri_offset = dstart;
2694
2695 startp = (void *)((uintptr_t)buf + dstart);
2696 fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp);
2697 dstart += fpu_xsave_info[i].xi_size;
2698 ASSERT3U(curinfo, <=, ninfo);
2699 curinfo++;
2700 }
2701 }
2702
2703 /*
2704 * We have been asked to set the data in the FPU for a given thread. Our
2705 * prmachdep code has already validated that the raw semantics of the data that
2706 * we have are valid (that is the appropriate sizes, offsets, and flags). We now
2707 * apply additional checking here:
2708 *
2709 * o The xsave structure is present and only valid bits are set.
2710 * o If the xsave component bit-vector is set, we have the corresponding proc
2711 * info item.
2712 * o Read-only items are ignored if and only if they actually match what we
2713 * gave the user mostly as a courtesy to simplify things here.
2714 * o ILP32 processes which can't support many of the regions are allowed to
2715 * have the items here (as we likely gave them to them), but they must be
2716 * zero if they are set.
2717 *
2718 * We take a first pass through all the data, validating it makes sense for the
2719 * FPU. Only after that point do we ensure that we have the FPU data in question
2720 * and then we clobber all the FPU data. Part of the semantics of setting this
2721 * is that we're setting the entire extended FPU.
2722 */
int
fpu_proc_xregs_set(klwp_t *lwp, void *buf)
{
	prxregset_hdr_t *prx = buf;
	model_t model = lwp_getdatamodel(lwp);
	uint64_t bv_found = 0;
	const prxregset_xsave_t *xsave = NULL;
	fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu;

	VERIFY(fpu_xsave_enabled());

	/*
	 * First, walk each note info header that we have from the user and
	 * proceed to validate it. The prmachdep code has already validated that
	 * the size, type, and offset information is valid, but it has not
	 * validated the semantic contents of this or if someone is trying to
	 * write something they shouldn't.
	 *
	 * While we walk this, we keep track of where the xsave header is. We
	 * also track all of the bits that we have found along the way so we can
	 * match up and ensure that everything that was set has a corresponding
	 * bit in the xsave bitmap. If we have something in the xsave bitmap,
	 * but not its corresponding data, then that is an error. However, we
	 * allow folks to write data regions without the bit set in the xsave
	 * data to make the read, modify, write process simpler.
	 */
	for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
		const prxregset_info_t *info = &prx->pr_info[i];
		bool found = false;

		for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
			void *data;
			if (info->pri_type != fpu_xsave_info[pt].xi_type)
				continue;

			found = true;
			data = (void *)((uintptr_t)buf + info->pri_offset);
			/*
			 * Per-component semantic validation (reserved bits,
			 * ILP32 restrictions, etc.). Not every component has
			 * a validation callback.
			 */
			if (fpu_xsave_info[pt].xi_valid != NULL &&
			    !fpu_xsave_info[pt].xi_valid(model, data)) {
				return (EINVAL);
			}

			if (info->pri_type == PRX_INFO_XSAVE) {
				xsave = data;
			}
			bv_found |= fpu_xsave_info[pt].xi_bits;
			break;
		}

		/* An entry whose type matches nothing in our table is bad. */
		if (!found) {
			return (EINVAL);
		}
	}

	/*
	 * No xsave data, no dice.
	 */
	if (xsave == NULL) {
		return (EINVAL);
	}

	/*
	 * If anything is set in the xsave header that was not found as we
	 * walked structures, then that is an error. The opposite is not true as
	 * discussed above.
	 */
	if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) {
		return (EINVAL);
	}

	/*
	 * At this point, we consider all the data actually valid. Now we must
	 * set up this information in the save area. If this is our own lwp, we
	 * must disable it first. Otherwise, we expect that it is already valid.
	 * To try to sanitize this, we will defensively zero the entire region
	 * as we are setting everything that will result in here.
	 */
	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * This case suggests that thread in question doesn't have a
		 * valid FPU save state which should only happen when it is on
		 * CPU. If this is the case, we explicitly disable the FPU, but
		 * do not save it before proceeding. We also sanity check
		 * several things here before doing this as using /proc on
		 * yourself is always exciting. Unlike fp_save(), fp_free() does
		 * not signal that an update is required, so we unconditionally
		 * set that for all threads.
		 */
		VERIFY3P(curthread, ==, lwptot(lwp));
		VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
		fp_free(fpu);
	}
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
	    cpuid_get_xsave_size());

	for (uint32_t i = 0; i < prx->pr_ninfo; i++) {
		const prxregset_info_t *info = &prx->pr_info[i];
		bool found = false;

		for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) {
			const void *data;
			if (info->pri_type != fpu_xsave_info[pt].xi_type)
				continue;

			/*
			 * Check if we have a set function and if we should
			 * include this. We may not if this is something like
			 * PRX_INFO_XCR which is read-only.
			 *
			 * We may not include a given entry as it may not have
			 * been set in the actual xsave state that we have been
			 * asked to restore, in which case to not break the
			 * xsaveopt logic, we must leave it in its initial
			 * state, e.g. zeroed (generally). XMM data initial
			 * state is not zeroed, but is marked with xi_always to
			 * help account for this.
			 */
			found = true;
			if (fpu_xsave_info[pt].xi_set == NULL)
				break;
			if (!fpu_xsave_info[pt].xi_always &&
			    (xsave->prx_xsh_xstate_bv &
			    fpu_xsave_info[pt].xi_bits) !=
			    fpu_xsave_info[pt].xi_bits) {
				break;
			}

			data = (void *)((uintptr_t)buf + info->pri_offset);
			fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt],
			    xsave->prx_xsh_xstate_bv, data);
		}

		/* Pass one matched every entry to a type; it must still. */
		VERIFY(found);
	}
	kpreempt_enable();

	return (0);
}
2863
2864 /*
2865 * To be included in the signal copyout logic we must have a copy function and
2866 * the bit in question must be included. Note, we don't consult xi_always here
2867 * as that is really part of what is always present for xsave logic and
2868 * therefore isn't really pertinent here because of our custom format. See the
2869 * big theory statement for more info.
2870 */
2871 static bool
fpu_signal_include(const xsave_proc_info_t * infop,uint64_t xs_bv)2872 fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv)
2873 {
2874 return ((infop->xi_bits & xs_bv) == infop->xi_bits &&
2875 infop->xi_signal_out != NULL);
2876 }
2877
2878 /*
2879 * We need to fill out the xsave related data into the ucontext_t that we've
2880 * been given. We should have a valid user pointer at this point in the uc_xsave
2881 * member. This is much simpler than the copyin that we have. Here are the
2882 * current assumptions:
2883 *
2884 * o This is being called for the current thread. This is not meant to operate
2885 * on an arbitrary thread's state.
2886 * o We cannot assume whether the FPU is valid in the pcb or not. While most
2887 * callers will have just called getfpregs() which saved the state, don't
2888 * assume that.
2889 * o We assume that the user address has the requisite required space for this
2890 * to be copied out.
2891 * o We assume that copyfunc() will ensure we are not copying into a kernel
2892 * address.
2893 *
2894 * For more information on the format of the data, see the 'Signal Handling and
2895 * the ucontext_t' portion of the big theory statement. We copy out all the
2896 * constituent parts and then come back and write out the actual final header
2897 * information.
2898 */
2899 int
fpu_signal_copyout(klwp_t * lwp,uintptr_t uaddr,fpu_copyout_f copyfunc)2900 fpu_signal_copyout(klwp_t *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc)
2901 {
2902 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
2903 uint64_t xs_bv;
2904 uc_xsave_t ucx;
2905 int ret;
2906
2907 VERIFY3P(curthread, ==, lwptot(lwp));
2908 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
2909 VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
2910
2911 if (!fpu_xsave_enabled()) {
2912 return (ENOTSUP);
2913 }
2914
2915 /*
2916 * Unlike when we're dealing with /proc, we can unconditionally call
2917 * fp_save() because this is always called in the context where the lwp
2918 * we're operating on is always the one on CPU (which is what fp_save()
2919 * asserts).
2920 */
2921 fp_save(fpu);
2922
2923 bzero(&ucx, sizeof (ucx));
2924 ucx.ucx_vers = UC_XSAVE_VERS;
2925 ucx.ucx_len += sizeof (uc_xsave_t);
2926
2927 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
2928 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
2929 const xsave_proc_info_t *info = &fpu_xsave_info[i];
2930
2931 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
2932 continue;
2933 ret = info->xi_signal_out(info, copyfunc, &ucx,
2934 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
2935 uaddr);
2936 if (ret != 0) {
2937 kpreempt_enable();
2938 return (ret);
2939 }
2940 }
2941
2942 /*
2943 * Now that everything has been copied out, we should have an accurate
2944 * value in the uc_xsave_t header and we can copy that out at the start
2945 * of the user data.
2946 */
2947 ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx));
2948 return (ret);
2949 }
2950
2951 /*
2952 * Here we've been given a ucontext_t which potentially has a user pointer to
2953 * xsave state that we've copied out previously. In this case we need to do the
2954 * following, assuming UC_XSAVE is present:
2955 *
2956 * o Copy in our header and validate it.
2957 * o Allocate an fpu context to use as a holding ground for all this data.
2958 * o If UC_FPU is set, override the xsave structure with the saved XMM state,
2959 * clear UC_FPU, and make sure that the correct xsave_bv bits are set.
2960 *
2961 * Currently we always allocate the additional state as a holding ground for the
2962 * FPU. What we're copying in may not be valid and we don't want to clobber the
2963 * existing FPU state or deal with merging it until we believe it's reasonable
2964 * enough. The proc_t is here to set us up for when we have per-process settings
2965 * in the extended feature disable MSRs.
2966 */
int
fpu_signal_copyin(klwp_t *lwp, ucontext_t *kuc)
{
	uc_xsave_t ucx;
	uint64_t bv;
	uintptr_t data, max_data;
	void *fpu;
	proc_t *p = lwp->lwp_procp;
	size_t ksize;

	/*
	 * Because this has been opaque filler and the kernel has never
	 * historically looked at it, we don't really care about the uc_xsave
	 * pointer being garbage in the case that the flag is not set. While
	 * this isn't perhaps the most sporting choice in some cases, this is on
	 * the other hand, pragmatic.
	 */
	if ((kuc->uc_flags & UC_XSAVE) != 0) {
		if (kuc->uc_xsave == 0) {
			return (EINVAL);
		}

		if (!fpu_xsave_enabled()) {
			return (ENOTSUP);
		}
	} else {
		return (0);
	}

	/* Pull in just the fixed-size header first so we can validate it. */
	if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) !=
	    0) {
		return (EFAULT);
	}

	/*
	 * Sanity check the header: a known version, a length that covers at
	 * least the header but no more than the kernel xsave area, only
	 * component bits that are enabled in %xcr0, and a user buffer that
	 * does not run past the address-space limit.
	 */
	ksize = cpuid_get_xsave_size();
	if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) ||
	    ucx.ucx_len > ksize ||
	    (ucx.ucx_bv & ~xsave_bv_all) != 0 ||
	    (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len <
	    (uintptr_t)kuc->uc_xsave) {
		return (EINVAL);
	}

	/*
	 * OK, our goal right now is to recreate a valid xsave_state structure
	 * that we'll ultimately end up having to merge with our existing one in
	 * the FPU save state. The reason we describe this as a merge is to help
	 * future us when we want to retain supervisor state which will never be
	 * part of userland signal state. The design of the userland signal
	 * state is basically to compress it as much as we can. This is done for
	 * two reasons:
	 *
	 *   1) We currently consider this a private interface.
	 *   2) We really want to minimize the actual amount of stack space we
	 *	use as much as possible. Most applications aren't using AVX-512
	 *	right now, so doing our own compression style is worthwhile. If
	 *	libc adopts AVX-512 routines, we may want to change this.
	 *
	 * On the allocation below, our assumption is that if a thread has taken
	 * a signal, then it is likely to take a signal again in the future (or
	 * be shortly headed to its demise). As such, when that happens we will
	 * leave the allocated signal stack around for the process. Most
	 * applications don't allow all threads to take signals, so this should
	 * hopefully help amortize the cost of the allocation.
	 */
	max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len;
	data = (uintptr_t)kuc->uc_xsave + sizeof (ucx);
	bv = ucx.ucx_bv;
	if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) {
		lwp->lwp_pcb.pcb_fpu.fpu_signal =
		    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
	}
	fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal;

	/*
	 * Unconditionally initialize the memory we get in here to ensure that
	 * it is in a reasonable state for ourselves. This ensures that unused
	 * regions are mostly left in their initial state (the main exception
	 * here is the x87/XMM state, but that should be OK). We don't fill in
	 * the initial xsave state as we expect that to happen as part of our
	 * processing.
	 */
	bzero(fpu, ksize);

	/*
	 * Walk the table in order, letting each component's copyin hook
	 * consume its slice of the user data and advance the running cursor.
	 */
	for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
		int ret;
		const xsave_proc_info_t *info = &fpu_xsave_info[i];
		if (!info->xi_always && (info->xi_bits & bv) == 0)
			continue;
		bv &= ~info->xi_bits;

		if (info->xi_signal_in == NULL)
			continue;
		ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data);
		if (ret != 0) {
			return (ret);
		}
	}
	/* Every advertised component bit must have been consumed above. */
	ASSERT0(bv);

	/*
	 * As described in the big theory statement section 'Signal Handling and
	 * the ucontext_t', we always remove UC_FPU from here as we've taken
	 * care of reassembling it ourselves.
	 */
	kuc->uc_flags &= ~UC_FPU;
	kuc->uc_xsave = (uintptr_t)fpu;

	return (0);
}
3077
3078 /*
3079 * This determines the size of the signal stack that we need for our custom form
3080 * of the xsave state.
3081 */
3082 size_t
fpu_signal_size(klwp_t * lwp)3083 fpu_signal_size(klwp_t *lwp)
3084 {
3085 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3086 size_t len = sizeof (uc_xsave_t);
3087 uint64_t xs_bv;
3088
3089 VERIFY3P(curthread, ==, lwptot(lwp));
3090 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3091 VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);
3092
3093 if (!fpu_xsave_enabled()) {
3094 return (0);
3095 }
3096
3097 kpreempt_disable();
3098 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3099 fp_save(fpu);
3100 }
3101
3102 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv;
3103 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) {
3104 size_t comp_size;
3105
3106 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv))
3107 continue;
3108
3109 cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size,
3110 NULL);
3111 len += comp_size;
3112 }
3113
3114 kpreempt_enable();
3115 return (len);
3116 }
3117
3118 /*
3119 * This function is used in service of restorecontext() to set the specified
3120 * thread's extended FPU state to the passed in data. Our assumptions at this
3121 * point from the system are:
3122 *
3123 * o Someone has already verified that the actual xsave header is correct.
3124 * o Any traditional XMM state that causes a #gp has been clamped.
3125 * o That data is basically the correct sized xsave state structure. Right now
3126 * that means it is not compressed and follows the CPUID-based rules for
3127 * constructing and laying out data.
3128 * o That the lwp argument refers to the current thread.
3129 *
3130 * Our primary purpose here is to merge the current FPU state with what exists
3131 * here. Right now, "merge", strictly speaking is just "replace". We can get
3132 * away with just replacing everything because all we currently save are user
3133 * states. If we start saving kernel states in here, this will get more nuanced
3134 * and we will need to be more careful about how we store data here.
3135 */
void
fpu_set_xsave(klwp_t *lwp, const void *data)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
	uint32_t status, xstatus;
	struct xsave_state *dst_xsave;

	VERIFY(fpu_xsave_enabled());
	VERIFY3P(curthread, ==, lwptot(lwp));
	VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
	ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN);

	/*
	 * We use fp_save() here rather than a stock fpdisable() so we can
	 * attempt to honor our invariants that when the thread state has been
	 * saved, the valid flag is set, even though we're going to be
	 * overwriting it shortly. If we just called fpdisable() then we would
	 * basically be asking for trouble.
	 *
	 * Because we are modifying the state here and we don't want the system
	 * to end up in an odd state, we are being a little paranoid and
	 * disabling preemption across this operation. In particular, once the
	 * state is properly tagged with FPU_VALID, there should be no other way
	 * that this thread can return to userland and get cleared out because
	 * we're resetting its context; however, we let paranoia win out.
	 */
	kpreempt_disable();
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		fp_save(fpu);
	}

	/*
	 * Replace the entire saved xsave image with the caller's data, then
	 * pull the two software status words out of the fxsave area the
	 * hardware treats as ignored (__fx_ign2[3]) and zero them in the
	 * copied image so they are not left masquerading as hardware state.
	 */
	bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic,
	    cpuid_get_xsave_size());
	dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic;
	status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0];
	xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1];
	dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0;
	dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0;

	/*
	 * These two status words are information that the kernel itself uses to
	 * track additional information and is part of the traditional fpregset,
	 * but is not part of our xregs information. Because we are setting this
	 * state, we leave it up to the rest of the kernel to determine whether
	 * this came from an fpregset_t or is being reset to the default of 0.
	 */
	fpu->fpu_regs.kfpu_status = status;
	fpu->fpu_regs.kfpu_xstatus = xstatus;

	/*
	 * Tag the saved image valid and flag the pcb so the new state is
	 * restored to the hardware on the way back to userland.
	 */
	fpu->fpu_flags |= FPU_VALID;
	PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
	kpreempt_enable();
}
3189
3190 /*
3191 * Convert the current FPU state to the traditional fpregset_t. In the 64-bit
3192 * kernel, this is just an fxsave_state with additional values for the status
3193 * and xstatus members.
3194 *
3195 * This has the same nuance as the xregs cases discussed above, but is simpler
3196 * in that we only need to handle the fxsave state, but more complicated because
3197 * we need to check our save mechanism.
3198 */
void
fpu_get_fpregset(klwp_t *lwp, fpregset_t *fp)
{
	struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;

	kpreempt_disable();
	/*
	 * Copy out the kernel-maintained status words first.
	 * NOTE(review): the sse_initial bcopy below starts at the front of
	 * *fp; this ordering assumes these two fields live past the region
	 * that copy covers -- confirm against the fpregset_t layout.
	 */
	fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
	fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus;

	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
		/*
		 * If we're requesting the fpregs of a thread that isn't
		 * currently valid and isn't the one that we're executing, then
		 * we consider getting this information to be a best-effort and
		 * we will not stop the thread in question to serialize it,
		 * which means possibly getting stale data. This is the
		 * traditional semantics that the system has used to service
		 * this for /proc.
		 */
		if (curthread == lwptot(lwp)) {
			VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
			fp_save(fpu);
		}
	}

	/*
	 * If the FPU is not enabled and the state isn't valid (due to someone
	 * else setting it), just copy the initial state.
	 */
	if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) {
		bcopy(&sse_initial, fp, sizeof (sse_initial));
		kpreempt_enable();
		return;
	}

	/*
	 * Given that we have an enabled FPU, we must look at the type of FPU
	 * save mechanism to clean this up. In particular, while we can just
	 * copy the save area with FXSAVE, with XSAVE we must carefully copy
	 * only the bits that are valid and reset the rest to their default
	 * state.
	 */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp,
		    sizeof (struct fxsave_state));
		break;
	case FP_XSAVE:
		fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs,
		    (struct fxsave_state *)fp);
		break;
	default:
		panic("Invalid fp_save_mech");
	}

	kpreempt_enable();
}
3256
3257 /*
3258 * This is a request to set the ABI fpregset_t into our actual hardware state.
3259 * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the
3260 * 512-byte fxsave area.
3261 */
3262 void
fpu_set_fpregset(klwp_t * lwp,const fpregset_t * fp)3263 fpu_set_fpregset(klwp_t *lwp, const fpregset_t *fp)
3264 {
3265 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
3266
3267 kpreempt_disable();
3268 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) {
3269 /*
3270 * We always save the entire FPU. This is required if we're
3271 * using xsave. If we're using fxsave, we could skip the
3272 * 512-byte write and instead just disable the FPU since we'd be
3273 * replacing it all. For now we don't bother with more
3274 * conditional logic.
3275 */
3276 VERIFY3P(curthread, ==, lwptot(lwp));
3277 VERIFY0(lwptot(lwp)->t_flag & T_KFPU);
3278 fp_save(fpu);
3279 }
3280
3281 fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus;
3282 fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
3283 switch (fp_save_mech) {
3284 case FP_FXSAVE:
3285 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx,
3286 sizeof (struct fxsave_state));
3287 break;
3288 case FP_XSAVE:
3289 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs,
3290 sizeof (struct fxsave_state));
3291 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |=
3292 XFEATURE_LEGACY_FP | XFEATURE_SSE;
3293 break;
3294 default:
3295 panic("Invalid fp_save_mech");
3296 }
3297
3298 fpu->fpu_flags |= FPU_VALID;
3299 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
3300 kpreempt_enable();
3301 }
3302