1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2021 Joyent, Inc. 24 * Copyright 2021 RackTop Systems, Inc. 25 * Copyright 2023 Oxide Computer Company 26 */ 27 28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 30 /* All Rights Reserved */ 31 32 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 33 /* All Rights Reserved */ 34 35 /* 36 * Copyright (c) 2009, Intel Corporation. 37 * All rights reserved. 38 */ 39 40 #include <sys/types.h> 41 #include <sys/param.h> 42 #include <sys/signal.h> 43 #include <sys/regset.h> 44 #include <sys/privregs.h> 45 #include <sys/psw.h> 46 #include <sys/trap.h> 47 #include <sys/fault.h> 48 #include <sys/systm.h> 49 #include <sys/user.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/pcb.h> 53 #include <sys/lwp.h> 54 #include <sys/cpuvar.h> 55 #include <sys/thread.h> 56 #include <sys/disp.h> 57 #include <sys/fp.h> 58 #include <sys/siginfo.h> 59 #include <sys/archsystm.h> 60 #include <sys/kmem.h> 61 #include <sys/debug.h> 62 #include <sys/x86_archext.h> 63 #include <sys/sysmacros.h> 64 #include <sys/cmn_err.h> 65 #include <sys/kfpu.h> 66 #include <sys/stdbool.h> 67 #include <sys/stdalign.h> 68 #include <sys/procfs_isa.h> 69 #include <sys/sunddi.h> 70 71 /* 72 * FPU Management Overview 73 * ----------------------- 74 * 75 * The x86 FPU has evolved substantially since its days as the x87 coprocessor; 76 * however, many aspects of its life as a coprocessor are still around in x86. 77 * 78 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU. 79 * While that state still exists, there is much more that is covered by the FPU. 80 * Today, this includes not just traditional FPU state, but also supervisor only 81 * state. The following state is currently managed and covered logically by the 82 * idea of the FPU registers and more generally is called the Extended Processor 83 * States: 84 * 85 * o Traditional x87 FPU 86 * o Vector Registers (%xmm, %ymm, %zmm) 87 * o Memory Protection Extensions (MPX) Bounds Registers 88 * o Protected Key Rights Registers (PKRU) 89 * o Processor Trace data 90 * o Control-Flow Enforcement state 91 * o Hardware Duty Cycle 92 * o Hardware P-states 93 * 94 * The rest of this covers how the FPU is managed and controlled, how state is 95 * saved and restored between threads, interactions with hypervisors, and other 96 * information exported to userland through aux vectors. A lot of background 97 * information is here to synthesize major parts of the Intel SDM, but 98 * unfortunately, it is not a replacement for reading it. 99 * 100 * FPU Control Registers 101 * --------------------- 102 * 103 * Because the x87 FPU began its life as a co-processor and the FPU was 104 * optional there are several bits that show up in %cr0 that we have to 105 * manipulate when dealing with the FPU. These are: 106 * 107 * o CR0.ET The 'extension type' bit. This was used originally to indicate 108 * that the FPU co-processor was present. Now it is forced on for 109 * compatibility. This is often used to verify whether or not the 110 * FPU is present. 111 * 112 * o CR0.NE The 'native error' bit. Used to indicate that native error 113 * mode should be enabled. This indicates that we should take traps 114 * on FPU errors. The OS enables this early in boot. 115 * 116 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not 117 * wait/fwait instructions generate a #NM if CR0.TS is set. 118 * 119 * o CR0.EM The 'Emulation' bit. This is used to cause floating point 120 * operations (x87 through SSE4) to trap with a #UD so they can be 121 * emulated. The system never sets this bit, but makes sure it is 122 * clear on processor start up. 123 * 124 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating 125 * point operation will generate a #NM. An fwait will as well, 126 * depending on the value in CR0.MP. 127 * 128 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by 129 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more 130 * complicated role. Historically it has been used to allow running systems to 131 * restore the FPU registers lazily. This will be discussed in greater depth 132 * later on. 133 * 134 * %cr4 is also used as part of the FPU control. Specifically we need to worry 135 * about the following bits in the system: 136 * 137 * o CR4.OSFXSR This bit is used to indicate that the OS understands and 138 * supports the execution of the fxsave and fxrstor 139 * instructions. This bit is required to be set to enable 140 * the use of the SSE->SSE4 instructions. 141 * 142 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand 143 * and take a SIMD floating point exception (#XM). This bit 144 * is always enabled by the system. 145 * 146 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and 147 * supports the execution of the xsave and xrstor family of 148 * instructions. This bit is required to use any of the AVX 149 * and newer feature sets. 150 * 151 * Because all supported processors are 64-bit, they'll always support the XMM 152 * extensions and we will enable both CR4.OXFXSR and CR4.OSXMMEXCPT in boot. 153 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid. 154 * 155 * %xcr0 is used to manage the behavior of the xsave feature set and is only 156 * present on the system if xsave is supported. %xcr0 is read and written to 157 * through by the xgetbv and xsetbv instructions. This register is present 158 * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a 159 * different component of the xsave state and controls whether or not that 160 * information is saved and restored. For newer feature sets like AVX and MPX, 161 * it also controls whether or not the corresponding instructions can be 162 * executed (much like CR0.OSFXSR does for the SSE feature sets). 163 * 164 * Everything in %xcr0 is around features available to users. There is also the 165 * IA32_XSS MSR which is used to control supervisor-only features that are still 166 * part of the xsave state. Bits that can be set in %xcr0 are reserved in 167 * IA32_XSS and vice versa. This is an important property that is particularly 168 * relevant to how the xsave instructions operate. 169 * 170 * Save Mechanisms 171 * --------------- 172 * 173 * When switching between running threads the FPU state needs to be saved and 174 * restored by the OS. If this state was not saved, users would rightfully 175 * complain about corrupt state. There are three mechanisms that exist on the 176 * processor for saving and restoring these state images: 177 * 178 * o fsave 179 * o fxsave 180 * o xsave 181 * 182 * fsave saves and restores only the x87 FPU and is the oldest of these 183 * mechanisms. This mechanism is never used in the kernel today because we are 184 * always running on systems that support fxsave. 185 * 186 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register 187 * state to be saved and restored to and from a struct fxsave_state. This is the 188 * default mechanism that is used to save and restore the FPU on amd64. An 189 * important aspect of fxsave that was different from the original i386 fsave 190 * mechanism is that the restoring of FPU state with pending exceptions will not 191 * generate an exception, it will be deferred to the next use of the FPU. 192 * 193 * The final and by far the most complex mechanism is that of the xsave set. 194 * xsave allows for saving and restoring all of the traditional x86 pieces (x87 195 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc. 196 * registers. 197 * 198 * Data is saved and restored into and out of a struct xsave_state. The first 199 * part of the struct xsave_state is equivalent to the struct fxsave_state. 200 * After that, there is a header which is used to describe the remaining 201 * portions of the state. The header is a 64-byte value of which the first two 202 * uint64_t values are defined and the rest are reserved and must be zero. The 203 * first uint64_t is the xstate_bv member. This describes which values in the 204 * xsave_state are actually valid and present. This is updated on a save and 205 * used on restore. The second member is the xcomp_bv member. Its last bit 206 * determines whether or not a compressed version of the structure is used. 207 * 208 * When the uncompressed structure is used (currently the only format we 209 * support), then each state component is at a fixed offset in the structure, 210 * even if it is not being used. For example, if you only saved the AVX related 211 * state, but did not save the MPX related state, the offset would not change 212 * for any component. With the compressed format, components that aren't used 213 * are all elided (though the x87 and SSE state are always there). 214 * 215 * Unlike fxsave which saves all state, the xsave family does not always save 216 * and restore all the state that could be covered by the xsave_state. The 217 * instructions all take an argument which is a mask of what to consider. This 218 * is the same mask that will be used in the xstate_bv vector and it is also the 219 * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only 220 * considered with the xsaves and xrstors instructions. 221 * 222 * When a save or restore is requested, a bitwise and is performed between the 223 * requested bits and those that have been enabled in %xcr0. Only the bits that 224 * match that are then saved or restored. Others will be silently ignored by 225 * the processor. This idea is used often in the OS. We will always request that 226 * we save and restore all of the state, but only those portions that are 227 * actually enabled in %xcr0 will be touched. 228 * 229 * If a feature has been asked to be restored that is not set in the xstate_bv 230 * feature vector of the save state, then it will be set to its initial state by 231 * the processor (usually zeros). Also, when asked to save state, the processor 232 * may not write out data that is in its initial state as an optimization. This 233 * optimization only applies to saving data and not to restoring data. 234 * 235 * There are a few different variants of the xsave and xrstor instruction. They 236 * are: 237 * 238 * o xsave This is the original save instruction. It will save all of the 239 * requested data in the xsave state structure. It only saves data 240 * in the uncompressed (xcomp_bv[63] is zero) format. It may be 241 * executed at all privilege levels. 242 * 243 * o xrstor This is the original restore instruction. It will restore all of 244 * the requested data. The xrstor function can handle both the 245 * compressed and uncompressed formats. It may be executed at all 246 * privilege levels. 247 * 248 * o xsaveopt This is a variant of the xsave instruction that employs 249 * optimizations to try and only write out state that has been 250 * modified since the last time an xrstor instruction was called. 251 * The processor tracks a tuple of information about the last 252 * xrstor and tries to ensure that the same buffer is being used 253 * when this optimization is being used. However, because of the 254 * way that it tracks the xrstor buffer based on the address of it, 255 * it is not suitable for use if that buffer can be easily reused. 256 * The most common case is trying to save data to the stack in 257 * rtld. It may be executed at all privilege levels. 258 * 259 * o xsavec This is a variant of the xsave instruction that writes out the 260 * compressed form of the xsave_state. Otherwise it behaves as 261 * xsave. It may be executed at all privilege levels. 262 * 263 * o xsaves This is a variant of the xsave instruction. It is similar to 264 * xsavec in that it always writes the compressed form of the 265 * buffer. Unlike all the other forms, this instruction looks at 266 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine 267 * what to save and restore. xsaves also implements the same 268 * optimization that xsaveopt does around modified pieces. User 269 * land may not execute the instruction. 270 * 271 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves 272 * it can save and restore both the user and privileged states. 273 * Unlike xrstor it can only operate on the compressed form. 274 * User land may not execute the instruction. 275 * 276 * Based on all of these, the kernel has a precedence for what it will use. 277 * Basically, xsaves (not supported) is preferred to xsaveopt, which is 278 * preferred to xsave. A similar scheme is used when informing rtld (more later) 279 * about what it should use. xsavec is preferred to xsave. xsaveopt is not 280 * recommended due to the modified optimization not being appropriate for this 281 * use. 282 * 283 * Finally, there is one last gotcha with the xsave state. Importantly some AMD 284 * processors did not always save and restore some of the FPU exception state in 285 * some cases like Intel did. In those cases the OS will make up for this fact 286 * itself. 287 * 288 * FPU Initialization 289 * ------------------ 290 * 291 * One difference with the FPU registers is that not all threads have FPU state, 292 * only those that have an lwp. Generally this means kernel threads, which all 293 * share p0 and its lwp, do not have FPU state. Though there are definitely 294 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread 295 * and lwp interchangeably, just think of thread meaning a thread that has a 296 * lwp. 297 * 298 * Each lwp has its FPU state allocated in its pcb (process control block). The 299 * actual storage comes from the fpsave_cachep kmem cache. This cache is sized 300 * dynamically at start up based on the save mechanism that we're using and the 301 * amount of memory required for it. This is dynamic because the xsave_state 302 * size varies based on the supported feature set. 303 * 304 * The hardware side of the FPU is initialized early in boot before we mount the 305 * root file system. This is effectively done in fpu_probe(). This is where we 306 * make the final decision about what the save and restore mechanisms we should 307 * use are, create the fpsave_cachep kmem cache, and initialize a number of 308 * function pointers that use save and restoring logic. 309 * 310 * The thread/lwp side is a a little more involved. There are two different 311 * things that we need to concern ourselves with. The first is how the FPU 312 * resources are allocated and the second is how the FPU state is initialized 313 * for a given lwp. 314 * 315 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init(). 316 * This is always called unconditionally by the system as part of creating an 317 * LWP. 318 * 319 * There are three different initialization paths that we deal with. The first 320 * is when we are executing a new process. As part of exec all of the register 321 * state is reset. The exec case is particularly important because init is born 322 * like Athena, sprouting from the head of the kernel, without any true parent 323 * to fork from. The second is used whenever we fork or create a new lwp. The 324 * third is to deal with special lwps like the agent lwp. 325 * 326 * During exec, we will call fp_exec() which will initialize and set up the FPU 327 * state for the process. That will fill in the initial state for the FPU and 328 * also set that state in the FPU itself. As part of fp_exec() we also install a 329 * thread context operations vector that takes care of dealing with the saving 330 * and restoring of the FPU. These context handlers will also be called whenever 331 * an lwp is created or forked. In those cases, to initialize the FPU we will 332 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context 333 * operations vector for the new thread. 334 * 335 * Next we'll end up in the context operation fp_new_lwp(). This saves the 336 * current thread's state, initializes the new thread's state, and copies over 337 * the relevant parts of the originating thread's state. It's as this point that 338 * we also install the FPU context operations into the new thread, which ensures 339 * that all future threads that are descendants of the current one get the 340 * thread context operations (unless they call exec). 341 * 342 * To deal with some things like the agent lwp, we double check the state of the 343 * FPU in sys_rtt_common() to make sure that it has been enabled before 344 * returning to userland. In general, this path should be rare, but it's useful 345 * for the odd lwp here and there. 346 * 347 * The FPU state will remain valid most of the time. There are times that 348 * the state will be rewritten. For example in restorecontext, due to /proc, or 349 * the lwp calls exec(). Whether the context is being freed or we are resetting 350 * the state, we will call fp_free() to disable the FPU and our context. 351 * 352 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU 353 * state by calling fp_lwp_cleanup(). 354 * 355 * Kernel FPU Multiplexing 356 * ----------------------- 357 * 358 * Just as the kernel has to maintain all of the general purpose registers when 359 * switching between scheduled threads, the same is true of the FPU registers. 360 * 361 * When a thread has FPU state, it also has a set of context operations 362 * installed. These context operations take care of making sure that the FPU is 363 * properly saved and restored during a context switch (fpsave_ctxt and 364 * fprestore_ctxt respectively). This means that the current implementation of 365 * the FPU is 'eager', when a thread is running the CPU will have its FPU state 366 * loaded. While this is always true when executing in userland, there are a few 367 * cases where this is not true in the kernel. 368 * 369 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was 370 * employed. This meant that the FPU would be saved on a context switch and the 371 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would 372 * then take a #NM trap, at which point we would restore the FPU from the save 373 * area and return to userland. Given the frequency of use of the FPU alone by 374 * libc, there's no point returning to userland just to trap again. 375 * 376 * There are a few cases though where the FPU state may need to be changed for a 377 * thread on its behalf. The most notable cases are in the case of processes 378 * using /proc, restorecontext, forking, etc. In all of these cases the kernel 379 * will force a threads FPU state to be saved into the PCB through the fp_save() 380 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the 381 * pcb. This indicates that the save state holds currently valid data. As a side 382 * effect of this, CR0.TS will be set. To make sure that all of the state is 383 * updated before returning to userland, in these cases, we set a flag on the 384 * PCB that says the FPU needs to be updated. This will make sure that we take 385 * the slow path out of a system call to fix things up for the thread. Due to 386 * the fact that this is a rather rare case, effectively setting the equivalent 387 * of t_postsys is acceptable. 388 * 389 * CR0.TS will be set after a save occurs and cleared when a restore occurs. 390 * Generally this means it will be cleared immediately by the new thread that is 391 * running in a context switch. However, this isn't the case for kernel threads. 392 * They currently operate with CR0.TS set as no kernel state is restored for 393 * them. This means that using the FPU will cause a #NM and panic. 394 * 395 * The FPU_VALID flag on the currently executing thread's pcb is meant to track 396 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set. 397 * However, because we eagerly restore, the only time that CR0.TS should be set 398 * for a non-kernel thread is during operations where it will be cleared before 399 * returning to userland and importantly, the only data that is in it is its 400 * own. 401 * 402 * Kernel FPU Usage 403 * ---------------- 404 * 405 * Traditionally the kernel never used the FPU since it had no need for 406 * floating point operations. However, modern FPU hardware supports a variety 407 * of SIMD extensions which can speed up code such as parity calculations or 408 * encryption. 409 * 410 * To allow the kernel to take advantage of these features, the 411 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped 412 * around any usage of the FPU by the kernel to ensure that user-level context 413 * is properly saved/restored, as well as to properly setup the FPU for use by 414 * the kernel. There are a variety of ways this wrapping can be used, as 415 * discussed in this section below. 416 * 417 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended 418 * operations, the kernel_fpu_alloc() function should be used to allocate a 419 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU 420 * state. This structure is not tied to any thread. That is, different threads 421 * can reuse the same kfpu_state_t structure, although not concurrently. A 422 * kfpu_state_t structure is freed by the kernel_fpu_free() function. 423 * 424 * In some cases, the kernel may need to use the FPU for a short operation 425 * without the overhead to manage a kfpu_state_t structure and without 426 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE 427 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags 428 * parameter. This indicates that there is no kfpu_state_t. When used this way, 429 * kernel preemption should be disabled by the caller (kpreempt_disable) before 430 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end(). 431 * For this usage, it is important to limit the kernel's FPU use to short 432 * operations. The tradeoff between using the FPU without a kfpu_state_t 433 * structure vs. the overhead of allowing a context switch while using the FPU 434 * should be carefully considered on a case by case basis. 435 * 436 * In other cases, kernel threads have an LWP, but never execute in user space. 437 * In this situation, the LWP's pcb_fpu area can be used to save/restore the 438 * kernel's FPU state if the thread is context switched, instead of having to 439 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the 440 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to 441 * enable this behavior. It is the caller's responsibility to ensure that this 442 * is only used for a kernel thread which never executes in user space. 443 * 444 * FPU Exceptions 445 * -------------- 446 * 447 * Certain operations can cause the kernel to take traps due to FPU activity. 448 * Generally these events will cause a user process to receive a SIGFPU and if 449 * the kernel receives it in kernel context, we will die. Traditionally the #NM 450 * (Device Not Available / No Math) exception generated by CR0.TS would have 451 * caused us to restore the FPU. Now it is a fatal event regardless of whether 452 * or not userland causes it. 453 * 454 * While there are some cases where the kernel uses the FPU, it is up to the 455 * kernel to use the FPU in a way such that it cannot receive a trap or to use 456 * the appropriate trap protection mechanisms. 457 * 458 * Hypervisors 459 * ----------- 460 * 461 * When providing support for hypervisors things are a little bit more 462 * complicated because the FPU is not virtualized at all. This means that they 463 * need to save and restore the FPU and %xcr0 across entry and exit to the 464 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These 465 * allow us to use the full native state to make sure that we are always saving 466 * and restoring the full FPU that the host sees, even when the guest is using a 467 * subset. 468 * 469 * One tricky aspect of this is that the guest may be using a subset of %xcr0 470 * and therefore changing our %xcr0 on the fly. It is vital that when we're 471 * saving and restoring the FPU that we always use the largest %xcr0 contents 472 * otherwise we will end up leaving behind data in it. 473 * 474 * ELF PLT Support 475 * --------------- 476 * 477 * rtld has to preserve a subset of the FPU when it is saving and restoring 478 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for 479 * more information. As a result, we set up an aux vector that contains 480 * information about what save and restore mechanisms it should be using and 481 * the sizing thereof based on what the kernel supports. This is passed down in 482 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is 483 * initialized in fpu_subr.c. 484 * 485 * Signal Handling and the ucontext_t 486 * ---------------------------------- 487 * 488 * One of the many gifts that signals give us is the twofold fact that when a 489 * signal occurs, the signal handler is allowed to change the CPU's state 490 * arbitrarily and when the signal handler is done executing, we must restore it 491 * back to the original state. However, the second part of this is that the 492 * signal handler is actually allowed to modify the state that the thread will 493 * return to! To create this facade, the kernel will create a full ucontext_t 494 * state, effectively calling getcontext(2) on the thread's behalf, and a 495 * pointer to that is given to the signal handler (the void * argument for the 496 * sa_sigaction function pointer in sigaction(2)). When libc is done with a 497 * signal, it will call setcontext(2) with that same ucontext_t. 498 * 499 * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and 500 * it's often declared on the stack itself, with the signal handler spilling all 501 * this state to the stack. The ucontext_t machine portion was broken into the 502 * general purpose and floating point registers. In 64-bit code, the floating 503 * point registers were mostly the same as the results of the fxsave instruction 504 * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent 505 * starting point for information, it is transformed into a different shape to 506 * deal with the history of the 32-bit SYS V ABI. 507 * 508 * While this worked, if you're reading this, you're aware that the x86 FPU and 509 * extended register states didn't stop at the initial 16 128-bit %xmm 510 * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k 511 * opmask registers. None of these fit inside the standard ucontext_t; however, 512 * they must all be preserved and restored across a signal. While the various 513 * x86 platform-specific ABIs all suggest that these registers are not preserved 514 * across a function call, receiving a signal is not a function call and must be 515 * thought of like a process receiving an interrupt. In other words, this 516 * extended state must be preserved. 517 * 518 * To facilitate this, we have extended the ucontext_t structure with an 519 * additional flag, UC_XSAVE, which indicates that the traditional padding 520 * member, uc_xsave, actually is a pointer to the extended state. While this is 521 * accessible outside of a signal handling context through the combination of 522 * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this 523 * state is focused on signal handling. Signal handling spills all this state to 524 * the stack and if we cannot spill the entire state to the stack then our 525 * inability to deliver the signal results in the process being killed! While 526 * there are separate efforts to ensure that the signal stack sizing that is 527 * used for the minimum and maximum signal sizes are sufficient, we still need 528 * to do our part to minimize the likelihood here. 529 * 530 * In designing this, we make the following observations which have helped us 531 * focus our design: 532 * 533 * o While the start of an xsave area is the traditional 512-byte fxsave XMM 534 * region, we already have that in the fpregs. Thus there is no reason to 535 * duplicate it. This not only saves 512 bytes of additional stack space, 536 * but it also means we don't have to ask which of the version of it to take 537 * if they were to differ. 538 * 539 * o Many applications out there aren't necessarily using the extended vectors 540 * and even when we do make libc and others take advantage of it, it will 541 * behoove us to ensure that they are put back into their initial state 542 * after use. This leads us to expect that in a number of cases, the actual 543 * extended register state will be in its initial state. 544 * 545 * o While the signal handler does allow contents to be modified, we are 546 * starting with making the interface private and thus allowing us to excise 547 * components that are in their initial state. 548 * 549 * o There are similarities to what we want to create with the compressed 550 * xsave format; however, because we don't always have support for the 551 * compressed format, we can't just arbitrarily say let's do a compressed 552 * save to the user stack. 553 * 554 * o Because we are not handing this state directly to and from hardware, we 555 * don't need to meet some of the constraints of the compressed xsave format 556 * around wanting alignment for the initial save or additional components. 557 * 558 * All of the above lead us to our own unique format for this data. When the 559 * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a 560 * uc_xsave_t structure which has a magic version number, a 32-bit length of the 561 * overall structure, and the 64-bit state bit-vector to represent which 562 * components are valid. Following this 8-byte header, each component that is 563 * present in the bit vector is immediately written out in roughly ascending bit 564 * order (the order is determined based on the order of the fpu_xsave_info 565 * array). 566 * 567 * This makes the rough logic that we have here when taking a signal and writing 568 * out this state as: 569 * 570 * 1. Ensure that the FPU is saved and that the contents of the pcb save area 571 * are valid. That is, call fp_save() if the state is not already flagged 572 * with FPU_VALID. 573 * 574 * 2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP 575 * and XFEATURE_SSE bits as these will be placed in the xsave area. 576 * 577 * 3. Initialize the uc_xsave_t by setting our version field, initializing the 578 * length to the length of the current structure, and then setting the 579 * modified bit vector above. 580 * 581 * 4. Walk each remaining bit of the bit-vector. For each set bit, copy out 582 * its extended state starting at the current length in the header and then 583 * increase the header size by that length. 584 * 585 * 5. Finally write out the final uc_xsave_t structure. 586 * 587 * The above process is also used when someone manually calls getcontext_extd(2) 588 * to get this state. The main difference between the two is which copyout 589 * function we use. This deserves some explanation. Our main starting point for 590 * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows 591 * the signal handling context to operate with a different copyout than we 592 * normally use in say getcontext_extd(2). 593 * 594 * When we've received a signal, we're at the intersection of several different 595 * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is, 596 * the watchpoints effectively set a copyout override function (t_copyops) that 597 * we end up vectoring to rather than a normal copyout. This allows the data to 598 * be modified and for the watchpoint to fire. While this is all well and good 599 * normally, it is problematic if we are trying to handle a signal. The signal 600 * deliver logic, sendsig(), goes through and disables the watchpoint for the 601 * region of the stack that we are copying out to. However, disabling 602 * watchpoints is not sufficient, we also need to use the copyout_noerr 603 * variants. 604 * 605 * These variants also require the use of on_fault() and no_fault() for error 606 * handling. While it is tempting to try and on_fault() the entire 607 * fpu_signal_copyout() operation, that is actually fraught for a few reasons. 608 * The first is that we don't want to disable faults during the entire operation 609 * as if the kernel messes up we will treat that as a user error. That isn't 610 * theoretical and happened during development. The second and perhaps more 611 * important issue is that correctly bounding the on_fault() / no_fault() means 612 * being careful about state. For example, kernel pre-emption is often disabled 613 * during parts of these operations, but it needs to be re-enabled when we're 614 * done. This would require tracking in some volatile variable that this had 615 * been enabled and disabled and tracking that. 616 * 617 * Instead, this is why fpu_signal_copyout() takes a copy out function as an 618 * argument. When we're in signal handling context, the function will use 619 * coypout_noerr() and wrap it in the appropriate on_fault() mechanisms. 620 * 621 * RESTORING STATE 622 * 623 * Copying out our current state is the easier half of this problem. When the 624 * kernel is done with a signal it calls setcontext(2) with the ucontext_t we 625 * assembled for it as described above. setcontext(2) isn't just used for 626 * returning from signals. 627 * 628 * The process for this goes in two steps. The first step is to copy in, 629 * validate, and transform the ucontext_t UC_XSAVE that we created above into an 630 * equivalent xsave format that we can use the appropriate xrstor function on. 631 * This first phase is implemented in fpu_signal_copyin(). Once that is done, we 632 * come back through a second phase that is driven out of restorecontext() and 633 * is implemented in fpu_set_xsave(). 634 * 635 * Let's start by discussing the second part of this, which is more 636 * straightforward. In particular, the second phase assumes that all of the 637 * validation and error handling has been done by the first phase. This means 638 * here, we have a buffer that is already the appropriate size 639 * (cpuid_get_xsave_size()) and all we need to do is make sure that we can 640 * replace the actual save state with the current one. 641 * 642 * The only piece of shenanigans we have to do is around the kernel provided 643 * notion of 'status' and 'xstatus', which are cached versions of the x87 and 644 * SSE exception vectors. These are part of the fpregset ABI and therefore we 645 * need to propagate them from the temporary storage that part 1 sets up in the 646 * ignored region of the fxsave data. We use that because it is not persisted by 647 * the CPU, so clobbering it is generally alright. 648 * 649 * Once that is done, we simply note that we need a PCB update to occur to 650 * refresh the FPU state before we return to userland. Given that someone has 651 * called setcontext(2), this was always going to happen because we have to 652 * update segment registers and related, so this isn't so bad. With that, let's 653 * move onto the more nuanced part (1). 654 * 655 * When we're handling a setcontext(2) we have, in userland, a data structure 656 * that should match one we serialized out, though we cannot assume that a user 657 * has not modified it either accidentally or maliciously. Our goal is to set up 658 * the appropriate xsave state that can be passed to the CPU's xrstor. The first 659 * problem we have to deal with is where do we actually put this state? 660 * 661 * While not many programs actually call setcontext(2) of their own volition, 662 * this is going to get hit every time we take a signal. The first thought was 663 * to re-use the existing thread's save area; however, that's a bit challenging 664 * for a few reasons. In particular, we would need to ensure that we don't go 665 * off-CPU for any reason, which we cannot assume with a copyin from a user 666 * address space. In particular, it is trivial for us to hit a case where the 667 * stack has been paged out for some reason, which eschews that path. 668 * 669 * Instead, whenever a thread first calls setcontext(2), generally from signal 670 * context, we will at that time allocate another entry from the 'fpsave_cachep' 671 * kmem cache, giving us a buffer of the appropriate space to handle this. Once 672 * this buffer has been allocated, we leave it assigned to the thread's pcb and 673 * only tear it down when the thread itself finally exits. We reason that a 674 * thread that takes a signal once is either going to have the process exit 675 * shortly thereafter or is much more likely to take a signal again in the 676 * future. Many daemons and other processes set things up so signals are 677 * dispatched via one location, masking signals in other thread, using 678 * sigsuspend(2), signalfd(3C), or something similar. 679 * 680 * With this buffer in hand, we begin our task of reassembling state. Note, all 681 * of this is conditional on UC_XSAVE being set in the uc_flags member of the 682 * ucontext_t. If it is not set, then we assume that there is no extended state 683 * and will use the traditional path of setting the fpregset_t into the system 684 * via setfpregs(). 685 * 686 * We first will copyin and validate the uc_xsave_t. In particular, we need to 687 * make sure the version makes sense, that the xsave component bit-vector 688 * doesn't have anything unexpected and more importantly unsupported in it, and 689 * that the addresses we've been given are within the user address space. At 690 * this point we can walk through our table of implemented bits and process 691 * them. 692 * 693 * For most components in here, the processing is straightforward. We continue 694 * walking our cursor and copy data into the kernel and place it in the 695 * appropriate place in our xsave state. If a xsave state component bit-vector 696 * isn't set, then we must ensure that we have the item in the initial state, 697 * which for everything other than the x87/SSE state is the memory being zeroed. 698 * 699 * The most unique case in the copyin state is that of the x87/SSE state. You 700 * might recall that we didn't copy it out explicitly as part of the uc_xsave_t, 701 * but instead have opted to use the single definition in the fpregset_t. Thus 702 * here, we copy it out of the fpregset_t, which the kernel has helpfully 703 * already unified into the 64-bit fxsave version prior to calling us, and 704 * install that into the save area we're building up. 705 * 706 * As part of this, there are two important pieces to be aware of. The first is 707 * that because the fpregset_t has both the status and xstatus members 708 * mentioned earlier, we temporarily copy them to the software-usable ignored 709 * areas of the fxsave state so we can corral this extra state into part (2) 710 * without needing to allocate additional space. The second piece is that when 711 * we're done processing this we explicitly remove the UC_FPU flag that would 712 * tell the kernel to proceed with updating that region. The problem is that 713 * that goes directly into the pcb's save area and not to the intermediate 714 * buffer as it uses the same entry point as /proc, mainly setfpregs(). 715 * 716 * We don't do much validation of the actual contents of the registers that are 717 * being set with the exception of ensuring that no reserved bits of the mxcsr 718 * are used. This is not as strict as /proc, but failure here means the process 719 * is likely going to die (returning from setcontext() in a signal handler is 720 * fatal). 721 * 722 * /proc xregs 723 * ----------- 724 * 725 * Observability of the state of the extended registers is important for 726 * understanding the system. While on the surface this is similar to signal 727 * handling, it is crucially different in a number of ways: 728 * 729 * o In signal handling, we're trying to conserve every byte of stack that we 730 * can. 731 * o The /proc xregs file will end up in core files, which means that we need 732 * a way of knowing what components are present and not present in it, 733 * because this will vary from CPU to CPU due to the addition of 734 * architectural features. For example, some CPUs support AVX-512, but 735 * others do not. 736 * 737 * o The signal handling structure (uc_xsave_t) is private and we're not 738 * trying to have software modify it, on the other hand, the /proc 739 * interfaces that we support we do want software to be able to interrogate 740 * and manipulate. These need to be something that we can introduce 741 * additional components into and make other changes that still allow it to 742 * work. 743 * 744 * The x86 xregs format is documented in proc(5). The short form is that the 745 * prxregset_hdr_t has a number of information entries, which are of the type 746 * prxregset_info_t. Each of the information headers has a type, size, and 747 * offset which indicate where to find the additional data. 748 * 749 * Each entry is described as one of the entries in the fpu_xsave_info[]. These 750 * items either are a 1:1 correspondence with a xsave related feature (e.g. 751 * there is one entry for each of the three AVX-512 components) or it is 752 * something synthetic that we provide as additional information such as the 753 * PRX_INFO_XCR, which is a way of getting information about the system such as 754 * what is enabled in %xcr0 out there. 755 * 756 * Unlike signal handling, we are given the buffer to place everything that 757 * needs to be written out. This is partially the design of the /proc APIs. That 758 * is, we will always assemble everything into the entire buffer that /proc asks 759 * us to, and then it will use as much or as little of it as is required. 760 * Similarly, when setting things, we don't have to worry about copying in 761 * information in the same way as signal handling does, because /proc takes care 762 * of it and always hands us a full buffer. Sizing that is a little nuanced, but 763 * is all handled in prmachdep.c. 764 * 765 * When someone performs a read of the xregs and thus is asking us for the 766 * current state, there is a little bit of nuance that we need to deal with. 767 * The first, is whether or not the FPU is enabled and the second is if the FPU 768 * is enabled, whether a given component is noted as being in its initial state. 769 * This basically gives us three possible states for a given component: 770 * 771 * 1. FPU_EN is not set and FPU_VALID is not set. This means we need to take 772 * the illumos FPU default for an item. More on that in a moment. 773 * 2. The saved xsave state indicates that the bit for a given component is 774 * zero -- specifically the xsh_xstate_bv member of the struct xsave_state. 775 * In this case, we must take the CPU's default for an item. This is 776 * usually the same as illumos, but not always. 777 * 3. The saved xsave state indicates that a given component's state bit is 778 * valid. The simplest of our cases. We can just take what we have from the 779 * xsave state. 780 * 781 * The CPU's default state for most components other than the x87/SSE state is 782 * to have it be zeroed. This is what we treat as our default state as well. The 783 * primary difference is in the initialization of the x87/SSE state. The SYS V 784 * ABI requires that we enable a different floating point control word then the 785 * hardware default. This means that when we're dealing with case (1) for 786 * x87/SSE we have to be more careful than the other components. Thankfully for 787 * everything else this is just keeping it zeroed. 788 * 789 * A reasonable question would be why not just skip components that aren't 790 * marked as present. There are a few reasons we take a different approach and 791 * always include them. Both of these are to make lives simpler for consumers. 792 * In the first case, when someone is performing a read and wants to reassemble 793 * and answer the question of 'what is the value of %ymm0 or %zmm15', they have 794 * to combine multiple disparate parts. If one knows that the data we put into 795 * there is always valid and represents what is in hardware and doesn't have to 796 * keep track of what are the defaults in different circumstances, then that 797 * greatly simplifies consumers lives. It also helps us for core files and other 798 * observability cases because the answer to what is the operating system's 799 * default may change over time. 800 * 801 * Similarly, including all the possible structures means that we have 802 * simplified writes. Writes are always setting the full state of a thread, 803 * meaning that if someone wants to modify only a single register they must do a 804 * read, modify, and write. By including everything that they might need, it 805 * makes it easier for consumers to do this and not have to cons up the whole 806 * structure on their own. 807 * 808 * When we're setting state, things change around a little bit. We have a few 809 * constraints that are laid out in proc(5). In particular, we require that the 810 * PRX_INFO_XSAVE component always be present to tell us which other components 811 * we expect to be here and which ones we don't. We also are much stricter about 812 * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only 813 * and may not be modified by a calling process. In addition, when we have 814 * 32-bit applications which have reserved registers in the %ymm, %zmm, etc. 815 * components, if they are being written to and have modifications, then we will 816 * indicate an error there. 817 * 818 * Because we are given the entire buffer from userland and don't need to have 819 * an intermediate place to copy it in, we will validate the entire thing in 820 * advance. Once it has been validated and we consider it legal, then we will 821 * translate each entry into its corresponding entry in pcb's normal floating 822 * point state. This is different from signal handling mostly because of the 823 * fact that we are not using copyin, and once we get to this point, there is 824 * no more validation, so we don't have the same concerns around blocking while 825 * pre-emption is disabled. 826 * 827 * The Wrinkle with fpregs 828 * ----------------------- 829 * 830 * When we instead turn our attention to the fpregs, whether we're gathering 831 * them as part of the ucontext_t or as part of /proc, there are a few 832 * complications that we need to be aware of when we're operating on a kernel 833 * that is using xsave as the save mechanism. When we're using fxsave as the 834 * save mechanism, the CPU will always save the entire 512-byte fxsave region. 835 * The fpregs ABI that the kernel expects is basically this structure itself, 836 * which is transformed into a 32-bit compatible form in archdep.c. 837 * 838 * But xsave makes this much more complex and has historically been a source of 839 * bugs in the system. In particular, unlike fxsave, xsave has its component bit 840 * vector that is written out to indicate validity. This means that blindly 841 * copying the fxsave area without checking those bits will lead us to do the 842 * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers, 843 * while the x87 legacy fp flag covers the rest of the state. This is all good, 844 * aside from the MCXSR. 845 * 846 * One of the more complicated pieces of xsave state management is correctly 847 * answering the question of when the MXCSR is written out to xsave_state. In 848 * practice, this is rather convoluted and varies. If either the XMM or AVX 849 * feature bits are set then the CPU will write out the MXCSR and its mask 850 * register into the traditional fxsave state region. This behavior is dependent 851 * on the type of save function that we use. xsave and xsaveopt will look at the 852 * AVX feature bit; however, xsavec does not and only considers the SSE feature 853 * bit. This means that when we're retrieving things, we need to check both of 854 * those bits to determine if we should use the initial state or the value 855 * written out. 856 * 857 * When we come to someone trying to set the fpregs through /proc, the main 858 * question we have is what happens to the extended registers. We have opted to 859 * implement and document it such that a write to the fpregs only impacts the 860 * fpregs. Put differently, we will save the FPU state with fp_save() ahead of 861 * copying the data into the save area, set the state bits for x87 and XMM 862 * state, and then set the FPU to be restored. All in all, this basically means 863 * that writing to fpregs does not touch any of the %ymm, %zmm, or other state 864 * that we might have present. 865 * 866 * Forward Looking: Adding Intel AMX Support 867 * ----------------------------------------- 868 * 869 * Nothing can stop the march of features being added into the FPU. One of the 870 * larger chunks that we will need to wrangle with is Intel's Advanced Matrix 871 * Extensions (AMX), which add a large chunk of xsave state to each process. 872 * While things like AVX and AVX-512 have been enabled by default, the broader 873 * OS community has not been wanting to do this for AMX ,because of the size of 874 * the state which exceeds 8 KiB. While the signal handling state went out of 875 * its way to minimize the size it wrote to the stack, if this is used, it would 876 * need to be preserved. 877 * 878 * To deal with this reality and the fact that folks don't really want to 879 * enable it by default for all purposes when its use will be quite special 880 * purpose, Intel has also added a MSR around extended feature disable or xfd. 881 * This is what we represent in the PRX_INFO_XCR prx_xfd member. Our starting 882 * assumption, and the reason that so much of the /proc and signal logic ensures 883 * that we have the thread and process around, taking as an example the unused 884 * process argument in fpu_proc_xregs_info(), is that we will follow suit and 885 * default to having support disabled, but that a process will be able to opt 886 * into it, which will result in several different assumptions around signal 887 * stack sizing and cause us to reallocate and extend the pcb's FPU save state. 888 * 889 * The following is a list of items to pay attention to for future folks who 890 * work on this: 891 * 892 * o We will want to confirm whether other systems have opted to make this 893 * process-wide or thread-wide. Assuming process-wide, we will need to do a 894 * hold of all lwps while making a change. The interface for that probably 895 * doesn't want to be /proc, as a process probably doesn't want to write to 896 * its own control file. Changing it for another process could be done 897 * through the agent-lwp. 898 * o Opting into this should probably be a one-way street. 899 * o Opting into this will need to evaluate all threads and in particular 900 * stack sizes to confirm they adhere to the new minimum. 901 * o We will need to make sure that setting and clearing the xfd MSR is part 902 * of the FPU context ops and something we set by default on every CPU. 903 * o We will need to add a new interface to allow opting into this feature. 904 * o We will need to ensure that all subsequently created signal stacks adhere 905 * to a required minimum size that we communicate through libc. 906 * o We will need to make sure that both rtld and libc no longer rely on a 907 * static value of the AT_SUN_FPSIZE, but rather realize that this can be 908 * dynamic. At that time, we should evaluate if we can get away with not 909 * needing to save this for rtld, even though signal handlers should assume 910 * they will. 911 * o The various components (because there is more than one) will want to be 912 * added to the fpu_xsave_info[]. Consulting the processes's xfd will be 913 * required and probably require logic changes. 914 * 915 * The above is not exhaustive. We'll probably have some other issues and fun 916 * while doing this. 917 */ 918 919 /* 920 * The kind of FPU we advertise to rtld so it knows what to do when working 921 * through the PLT. 922 */ 923 int fp_elf = AT_386_FPINFO_FXSAVE; 924 925 /* 926 * Mechanism to save FPU state. 927 */ 928 int fp_save_mech = FP_FXSAVE; 929 930 kmem_cache_t *fpsave_cachep; 931 932 /* Legacy fxsave layout + xsave header + ymm */ 933 #define AVX_XSAVE_SIZE (512 + 64 + 256) 934 935 /* 936 * Various sanity checks. 937 */ 938 CTASSERT(sizeof (struct fxsave_state) == 512); 939 CTASSERT(sizeof (struct fnsave_state) == 108); 940 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0); 941 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE); 942 943 /* 944 * Basic architectural alignment information. 945 */ 946 #define FPU_ALIGN_XMM 16 947 #define FPU_ALIGN_YMM 32 948 #define FPU_ALIGN_ZMM 64 949 950 /* 951 * This structure is the x86 implementation of the kernel FPU that is defined in 952 * uts/common/sys/kfpu.h. 953 */ 954 955 typedef enum kfpu_flags { 956 /* 957 * This indicates that the save state has initial FPU data. 958 */ 959 KFPU_F_INITIALIZED = 0x01 960 } kfpu_flags_t; 961 962 struct kfpu_state { 963 fpu_ctx_t kfpu_ctx; 964 kfpu_flags_t kfpu_flags; 965 kthread_t *kfpu_curthread; 966 }; 967 968 /* 969 * Initial kfpu state for SSE/SSE2 used by fpinit() 970 */ 971 const struct fxsave_state sse_initial = { 972 FPU_CW_INIT, /* fx_fcw */ 973 0, /* fx_fsw */ 974 0, /* fx_fctw */ 975 0, /* fx_fop */ 976 0, /* fx_rip */ 977 0, /* fx_rdp */ 978 SSE_MXCSR_INIT /* fx_mxcsr */ 979 /* rest of structure is zero */ 980 }; 981 982 /* 983 * Initial kfpu state for AVX used by fpinit() 984 */ 985 const struct xsave_state avx_initial = { 986 /* 987 * The definition below needs to be identical with sse_initial 988 * defined above. 989 */ 990 .xs_fxsave = { 991 .fx_fcw = FPU_CW_INIT, 992 .fx_mxcsr = SSE_MXCSR_INIT, 993 }, 994 .xs_header = { 995 /* 996 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are 997 * valid, and CPU should initialize XMM/YMM. 998 */ 999 .xsh_xstate_bv = 1, 1000 .xsh_xcomp_bv = 0, 1001 }, 1002 }; 1003 1004 /* 1005 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid 1006 * the #gp exception caused by setting unsupported bits in the 1007 * MXCSR register 1008 */ 1009 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT; 1010 1011 /* 1012 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we 1013 * have an XSAVE-capable chip in fpu_probe. 1014 */ 1015 void (*fpsave_ctxt)(void *) = fpxsave_ctxt; 1016 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt; 1017 1018 /* 1019 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable. 1020 */ 1021 void (*xsavep)(struct xsave_state *, uint64_t) = xsave; 1022 1023 static int fpe_sicode(uint_t); 1024 static int fpe_simd_sicode(uint_t); 1025 static void fp_new_lwp(void *, void *); 1026 static void fp_free_ctx(void *, int); 1027 1028 static struct ctxop * 1029 fp_ctxop_allocate(struct fpu_ctx *fp) 1030 { 1031 const struct ctxop_template tpl = { 1032 .ct_rev = CTXOP_TPL_REV, 1033 .ct_save = fpsave_ctxt, 1034 .ct_restore = fprestore_ctxt, 1035 .ct_fork = fp_new_lwp, 1036 .ct_lwp_create = fp_new_lwp, 1037 .ct_free = fp_free_ctx, 1038 }; 1039 return (ctxop_allocate(&tpl, fp)); 1040 } 1041 1042 /* 1043 * Copy the state of parent lwp's floating point context into the new lwp. 1044 * Invoked for both fork() and lwp_create(). 1045 * 1046 * Note that we inherit -only- the control state (e.g. exception masks, 1047 * rounding, precision control, etc.); the FPU registers are otherwise 1048 * reset to their initial state. 1049 */ 1050 static void 1051 fp_new_lwp(void *parent, void *child) 1052 { 1053 kthread_id_t t = parent, ct = child; 1054 struct fpu_ctx *fp; /* parent fpu context */ 1055 struct fpu_ctx *cfp; /* new fpu context */ 1056 struct fxsave_state *fx, *cfx; 1057 struct xsave_state *cxs; 1058 1059 ASSERT(fp_kind != FP_NO); 1060 1061 fp = &t->t_lwp->lwp_pcb.pcb_fpu; 1062 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu; 1063 1064 /* 1065 * If the parent FPU state is still in the FPU hw then save it; 1066 * conveniently, fp_save() already does this for us nicely. 1067 */ 1068 fp_save(fp); 1069 1070 cfp->fpu_flags = FPU_EN | FPU_VALID; 1071 cfp->fpu_regs.kfpu_status = 0; 1072 cfp->fpu_regs.kfpu_xstatus = 0; 1073 1074 /* 1075 * Make sure that the child's FPU is cleaned up and made ready for user 1076 * land. 1077 */ 1078 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb); 1079 1080 switch (fp_save_mech) { 1081 case FP_FXSAVE: 1082 fx = fp->fpu_regs.kfpu_u.kfpu_fx; 1083 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx; 1084 bcopy(&sse_initial, cfx, sizeof (*cfx)); 1085 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; 1086 cfx->fx_fcw = fx->fx_fcw; 1087 break; 1088 1089 case FP_XSAVE: 1090 cfp->fpu_xsave_mask = fp->fpu_xsave_mask; 1091 1092 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL); 1093 1094 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; 1095 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs; 1096 cfx = &cxs->xs_fxsave; 1097 1098 bcopy(&avx_initial, cxs, sizeof (*cxs)); 1099 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; 1100 cfx->fx_fcw = fx->fx_fcw; 1101 cxs->xs_header.xsh_xstate_bv |= 1102 (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL); 1103 break; 1104 default: 1105 panic("Invalid fp_save_mech"); 1106 /*NOTREACHED*/ 1107 } 1108 1109 /* 1110 * Mark that both the parent and child need to have the FPU cleaned up 1111 * before returning to userland. 1112 */ 1113 1114 ctxop_attach(ct, fp_ctxop_allocate(cfp)); 1115 } 1116 1117 /* 1118 * Free any state associated with floating point context. 1119 * Fp_free can be called in three cases: 1120 * 1) from reaper -> thread_free -> freectx-> fp_free 1121 * fp context belongs to a thread on deathrow 1122 * nothing to do, thread will never be resumed 1123 * thread calling ctxfree is reaper 1124 * 1125 * 2) from exec -> freectx -> fp_free 1126 * fp context belongs to the current thread 1127 * must disable fpu, thread calling ctxfree is curthread 1128 * 1129 * 3) from restorecontext -> setfpregs -> fp_free 1130 * we have a modified context in the memory (lwp->pcb_fpu) 1131 * disable fpu and release the fp context for the CPU 1132 * 1133 */ 1134 void 1135 fp_free(struct fpu_ctx *fp) 1136 { 1137 ASSERT(fp_kind != FP_NO); 1138 1139 if (fp->fpu_flags & FPU_VALID) 1140 return; 1141 1142 kpreempt_disable(); 1143 /* 1144 * We want to do fpsave rather than fpdisable so that we can 1145 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit 1146 */ 1147 fp->fpu_flags |= FPU_VALID; 1148 /* If for current thread disable FP to track FPU_VALID */ 1149 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) { 1150 /* Clear errors if any to prevent frstor from complaining */ 1151 (void) fperr_reset(); 1152 if (fp_kind & __FP_SSE) 1153 (void) fpxerr_reset(); 1154 fpdisable(); 1155 } 1156 kpreempt_enable(); 1157 } 1158 1159 /* 1160 * Wrapper for freectx to make the types line up for fp_free() 1161 */ 1162 static void 1163 fp_free_ctx(void *arg, int isexec __unused) 1164 { 1165 fp_free((struct fpu_ctx *)arg); 1166 } 1167 1168 /* 1169 * Store the floating point state and disable the floating point unit. 1170 */ 1171 void 1172 fp_save(struct fpu_ctx *fp) 1173 { 1174 ASSERT(fp_kind != FP_NO); 1175 1176 kpreempt_disable(); 1177 if (!fp || fp->fpu_flags & FPU_VALID || 1178 (fp->fpu_flags & FPU_EN) == 0) { 1179 kpreempt_enable(); 1180 return; 1181 } 1182 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu); 1183 1184 switch (fp_save_mech) { 1185 case FP_FXSAVE: 1186 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx); 1187 break; 1188 1189 case FP_XSAVE: 1190 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); 1191 break; 1192 default: 1193 panic("Invalid fp_save_mech"); 1194 /*NOTREACHED*/ 1195 } 1196 1197 fp->fpu_flags |= FPU_VALID; 1198 1199 /* 1200 * We save the FPU as part of forking, execing, modifications via /proc, 1201 * restorecontext, etc. As such, we need to make sure that we return to 1202 * userland with valid state in the FPU. If we're context switched out 1203 * before we hit sys_rtt_common() we'll end up having restored the FPU 1204 * as part of the context ops operations. The restore logic always makes 1205 * sure that FPU_VALID is set before doing a restore so we don't restore 1206 * it a second time. 1207 */ 1208 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb); 1209 1210 kpreempt_enable(); 1211 } 1212 1213 /* 1214 * Restore the FPU context for the thread: 1215 * The possibilities are: 1216 * 1. No active FPU context: Load the new context into the FPU hw 1217 * and enable the FPU. 1218 */ 1219 void 1220 fp_restore(struct fpu_ctx *fp) 1221 { 1222 switch (fp_save_mech) { 1223 case FP_FXSAVE: 1224 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx); 1225 break; 1226 1227 case FP_XSAVE: 1228 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); 1229 break; 1230 default: 1231 panic("Invalid fp_save_mech"); 1232 /*NOTREACHED*/ 1233 } 1234 1235 fp->fpu_flags &= ~FPU_VALID; 1236 } 1237 1238 /* 1239 * Reset the FPU such that it is in a valid state for a new thread that is 1240 * coming out of exec. The FPU will be in a usable state at this point. At this 1241 * point we know that the FPU state has already been allocated and if this 1242 * wasn't an init process, then it will have had fp_free() previously called. 1243 */ 1244 void 1245 fp_exec(void) 1246 { 1247 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1248 1249 if (fp_save_mech == FP_XSAVE) { 1250 fp->fpu_xsave_mask = XFEATURE_FP_ALL; 1251 } 1252 1253 struct ctxop *ctx = fp_ctxop_allocate(fp); 1254 /* 1255 * Make sure that we're not preempted in the middle of initializing the 1256 * FPU on CPU. 1257 */ 1258 kpreempt_disable(); 1259 ctxop_attach(curthread, ctx); 1260 fpinit(); 1261 fp->fpu_flags = FPU_EN; 1262 kpreempt_enable(); 1263 } 1264 1265 1266 /* 1267 * Seeds the initial state for the current thread. The possibilities are: 1268 * 1. Another process has modified the FPU state before we have done any 1269 * initialization: Load the FPU state from the LWP state. 1270 * 2. The FPU state has not been externally modified: Load a clean state. 1271 */ 1272 void 1273 fp_seed(void) 1274 { 1275 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1276 1277 ASSERT(curthread->t_preempt >= 1); 1278 ASSERT((fp->fpu_flags & FPU_EN) == 0); 1279 1280 /* 1281 * Always initialize a new context and initialize the hardware. 1282 */ 1283 if (fp_save_mech == FP_XSAVE) { 1284 fp->fpu_xsave_mask = XFEATURE_FP_ALL; 1285 } 1286 1287 ctxop_attach(curthread, fp_ctxop_allocate(fp)); 1288 fpinit(); 1289 1290 /* 1291 * If FPU_VALID is set, it means someone has modified registers via 1292 * /proc. In this case, restore the current lwp's state. 1293 */ 1294 if (fp->fpu_flags & FPU_VALID) 1295 fp_restore(fp); 1296 1297 ASSERT((fp->fpu_flags & FPU_VALID) == 0); 1298 fp->fpu_flags = FPU_EN; 1299 } 1300 1301 /* 1302 * When using xsave/xrstor, these three functions are used by the lwp code to 1303 * manage the memory for the xsave area. 1304 */ 1305 void 1306 fp_lwp_init(klwp_t *lwp) 1307 { 1308 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; 1309 1310 /* 1311 * We keep a copy of the pointer in lwp_fpu so that we can restore the 1312 * value in forklwp() after we duplicate the parent's LWP state. 1313 */ 1314 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = 1315 kmem_cache_alloc(fpsave_cachep, KM_SLEEP); 1316 fp->fpu_signal = NULL; 1317 1318 if (fp_save_mech == FP_XSAVE) { 1319 /* 1320 * 1321 * We bzero since the fpinit() code path will only 1322 * partially initialize the xsave area using avx_inital. 1323 */ 1324 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state)); 1325 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size()); 1326 } 1327 } 1328 1329 void 1330 fp_lwp_cleanup(klwp_t *lwp) 1331 { 1332 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; 1333 1334 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) { 1335 kmem_cache_free(fpsave_cachep, 1336 fp->fpu_regs.kfpu_u.kfpu_generic); 1337 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL; 1338 } 1339 1340 if (fp->fpu_signal != NULL) { 1341 kmem_cache_free(fpsave_cachep, fp->fpu_signal); 1342 fp->fpu_signal = NULL; 1343 } 1344 } 1345 1346 /* 1347 * Called during the process of forklwp(). The kfpu_u pointer will have been 1348 * overwritten while copying the parent's LWP structure. We have a valid copy 1349 * stashed in the child's lwp_fpu which we use to restore the correct value. 1350 */ 1351 void 1352 fp_lwp_dup(klwp_t *lwp) 1353 { 1354 void *xp = lwp->lwp_fpu; 1355 size_t sz; 1356 1357 switch (fp_save_mech) { 1358 case FP_FXSAVE: 1359 sz = sizeof (struct fxsave_state); 1360 break; 1361 case FP_XSAVE: 1362 sz = cpuid_get_xsave_size(); 1363 break; 1364 default: 1365 panic("Invalid fp_save_mech"); 1366 /*NOTREACHED*/ 1367 } 1368 1369 /* copy the parent's values into the new lwp's struct */ 1370 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz); 1371 /* now restore the pointer */ 1372 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp; 1373 /* Ensure that we don't inherit our parent's signal state */ 1374 lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL; 1375 } 1376 1377 /* 1378 * Handle a processor extension error fault 1379 * Returns non zero for error. 1380 */ 1381 1382 /*ARGSUSED*/ 1383 int 1384 fpexterrflt(struct regs *rp) 1385 { 1386 uint32_t fpcw, fpsw; 1387 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1388 1389 ASSERT(fp_kind != FP_NO); 1390 1391 /* 1392 * Now we can enable the interrupts. 1393 * (NOTE: x87 fp exceptions come thru interrupt gate) 1394 */ 1395 sti(); 1396 1397 if (!fpu_exists) 1398 return (FPE_FLTINV); 1399 1400 /* 1401 * Do an unconditional save of the FP state. If it's dirty (TS=0), 1402 * it'll be saved into the fpu context area passed in (that of the 1403 * current thread). If it's not dirty (it may not be, due to 1404 * an intervening save due to a context switch between the sti(), 1405 * above and here, then it's safe to just use the stored values in 1406 * the context save area to determine the cause of the fault. 1407 */ 1408 fp_save(fp); 1409 1410 /* clear exception flags in saved state, as if by fnclex */ 1411 switch (fp_save_mech) { 1412 case FP_FXSAVE: 1413 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; 1414 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw; 1415 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS; 1416 break; 1417 1418 case FP_XSAVE: 1419 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; 1420 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw; 1421 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS; 1422 /* 1423 * Always set LEGACY_FP as it may have been cleared by XSAVE 1424 * instruction 1425 */ 1426 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= 1427 XFEATURE_LEGACY_FP; 1428 break; 1429 default: 1430 panic("Invalid fp_save_mech"); 1431 /*NOTREACHED*/ 1432 } 1433 1434 fp->fpu_regs.kfpu_status = fpsw; 1435 1436 if ((fpsw & FPS_ES) == 0) 1437 return (0); /* No exception */ 1438 1439 /* 1440 * "and" the exception flags with the complement of the mask 1441 * bits to determine which exception occurred 1442 */ 1443 return (fpe_sicode(fpsw & ~fpcw & 0x3f)); 1444 } 1445 1446 /* 1447 * Handle an SSE/SSE2 precise exception. 1448 * Returns a non-zero sicode for error. 1449 */ 1450 /*ARGSUSED*/ 1451 int 1452 fpsimderrflt(struct regs *rp) 1453 { 1454 uint32_t mxcsr, xmask; 1455 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1456 1457 ASSERT(fp_kind & __FP_SSE); 1458 1459 /* 1460 * NOTE: Interrupts are disabled during execution of this 1461 * function. They are enabled by the caller in trap.c. 1462 */ 1463 1464 /* 1465 * The only way we could have gotten here if there is no FP unit 1466 * is via a user executing an INT $19 instruction, so there is 1467 * no fault in that case. 1468 */ 1469 if (!fpu_exists) 1470 return (0); 1471 1472 /* 1473 * Do an unconditional save of the FP state. If it's dirty (TS=0), 1474 * it'll be saved into the fpu context area passed in (that of the 1475 * current thread). If it's not dirty, then it's safe to just use 1476 * the stored values in the context save area to determine the 1477 * cause of the fault. 1478 */ 1479 fp_save(fp); /* save the FPU state */ 1480 1481 if (fp_save_mech == FP_XSAVE) { 1482 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr; 1483 fp->fpu_regs.kfpu_status = 1484 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; 1485 } else { 1486 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr; 1487 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; 1488 } 1489 fp->fpu_regs.kfpu_xstatus = mxcsr; 1490 1491 /* 1492 * compute the mask that determines which conditions can cause 1493 * a #xm exception, and use this to clean the status bits so that 1494 * we can identify the true cause of this one. 1495 */ 1496 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS; 1497 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask)); 1498 } 1499 1500 /* 1501 * In the unlikely event that someone is relying on this subcode being 1502 * FPE_FLTILL for denormalize exceptions, it can always be patched back 1503 * again to restore old behaviour. 1504 */ 1505 int fpe_fltden = FPE_FLTDEN; 1506 1507 /* 1508 * Map from the FPU status word to the FP exception si_code. 1509 */ 1510 static int 1511 fpe_sicode(uint_t sw) 1512 { 1513 if (sw & FPS_IE) 1514 return (FPE_FLTINV); 1515 if (sw & FPS_ZE) 1516 return (FPE_FLTDIV); 1517 if (sw & FPS_DE) 1518 return (fpe_fltden); 1519 if (sw & FPS_OE) 1520 return (FPE_FLTOVF); 1521 if (sw & FPS_UE) 1522 return (FPE_FLTUND); 1523 if (sw & FPS_PE) 1524 return (FPE_FLTRES); 1525 return (FPE_FLTINV); /* default si_code for other exceptions */ 1526 } 1527 1528 /* 1529 * Map from the SSE status word to the FP exception si_code. 1530 */ 1531 static int 1532 fpe_simd_sicode(uint_t sw) 1533 { 1534 if (sw & SSE_IE) 1535 return (FPE_FLTINV); 1536 if (sw & SSE_ZE) 1537 return (FPE_FLTDIV); 1538 if (sw & SSE_DE) 1539 return (FPE_FLTDEN); 1540 if (sw & SSE_OE) 1541 return (FPE_FLTOVF); 1542 if (sw & SSE_UE) 1543 return (FPE_FLTUND); 1544 if (sw & SSE_PE) 1545 return (FPE_FLTRES); 1546 return (FPE_FLTINV); /* default si_code for other exceptions */ 1547 } 1548 1549 /* 1550 * This routine is invoked as part of libc's __fpstart implementation 1551 * via sysi86(2). 1552 * 1553 * It may be called -before- any context has been assigned in which case 1554 * we try and avoid touching the hardware. Or it may be invoked well 1555 * after the context has been assigned and fiddled with, in which case 1556 * just tweak it directly. 1557 */ 1558 void 1559 fpsetcw(uint16_t fcw, uint32_t mxcsr) 1560 { 1561 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1562 struct fxsave_state *fx; 1563 1564 if (!fpu_exists || fp_kind == FP_NO) 1565 return; 1566 1567 if ((fp->fpu_flags & FPU_EN) == 0) { 1568 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) { 1569 /* 1570 * Common case. Floating point unit not yet 1571 * enabled, and kernel already intends to initialize 1572 * the hardware the way the caller wants. 1573 */ 1574 return; 1575 } 1576 /* 1577 * Hmm. Userland wants a different default. 1578 * Do a fake "first trap" to establish the context, then 1579 * handle as if we already had a context before we came in. 1580 */ 1581 kpreempt_disable(); 1582 fp_seed(); 1583 kpreempt_enable(); 1584 } 1585 1586 /* 1587 * Ensure that the current hardware state is flushed back to the 1588 * pcb, then modify that copy. Next use of the fp will 1589 * restore the context. 1590 */ 1591 fp_save(fp); 1592 1593 switch (fp_save_mech) { 1594 case FP_FXSAVE: 1595 fx = fp->fpu_regs.kfpu_u.kfpu_fx; 1596 fx->fx_fcw = fcw; 1597 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; 1598 break; 1599 1600 case FP_XSAVE: 1601 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; 1602 fx->fx_fcw = fcw; 1603 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; 1604 /* 1605 * Always set LEGACY_FP as it may have been cleared by XSAVE 1606 * instruction 1607 */ 1608 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= 1609 XFEATURE_LEGACY_FP; 1610 break; 1611 default: 1612 panic("Invalid fp_save_mech"); 1613 /*NOTREACHED*/ 1614 } 1615 } 1616 1617 static void 1618 kernel_fpu_fpstate_init(kfpu_state_t *kfpu) 1619 { 1620 struct xsave_state *xs; 1621 1622 switch (fp_save_mech) { 1623 case FP_FXSAVE: 1624 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx, 1625 sizeof (struct fxsave_state)); 1626 kfpu->kfpu_ctx.fpu_xsave_mask = 0; 1627 break; 1628 case FP_XSAVE: 1629 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs; 1630 bzero(xs, cpuid_get_xsave_size()); 1631 bcopy(&avx_initial, xs, sizeof (*xs)); 1632 xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; 1633 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL; 1634 break; 1635 default: 1636 panic("invalid fp_save_mech"); 1637 } 1638 1639 /* 1640 * Set the corresponding flags that the system expects on the FPU state 1641 * to indicate that this is our state. The FPU_EN flag is required to 1642 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly 1643 * not set below as it represents that this state is being suppressed 1644 * by the kernel. 1645 */ 1646 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID; 1647 kfpu->kfpu_flags |= KFPU_F_INITIALIZED; 1648 } 1649 1650 kfpu_state_t * 1651 kernel_fpu_alloc(int kmflags) 1652 { 1653 kfpu_state_t *kfpu; 1654 1655 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) { 1656 return (NULL); 1657 } 1658 1659 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic = 1660 kmem_cache_alloc(fpsave_cachep, kmflags); 1661 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) { 1662 kmem_free(kfpu, sizeof (kfpu_state_t)); 1663 return (NULL); 1664 } 1665 1666 kernel_fpu_fpstate_init(kfpu); 1667 1668 return (kfpu); 1669 } 1670 1671 void 1672 kernel_fpu_free(kfpu_state_t *kfpu) 1673 { 1674 kmem_cache_free(fpsave_cachep, 1675 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic); 1676 kmem_free(kfpu, sizeof (kfpu_state_t)); 1677 } 1678 1679 static void 1680 kernel_fpu_ctx_save(void *arg) 1681 { 1682 kfpu_state_t *kfpu = arg; 1683 fpu_ctx_t *pf; 1684 1685 if (kfpu == NULL) { 1686 /* 1687 * A NULL kfpu implies this is a kernel thread with an LWP and 1688 * no user-level FPU usage. Use the lwp fpu save area. 1689 */ 1690 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1691 1692 ASSERT(curthread->t_procp->p_flag & SSYS); 1693 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); 1694 1695 fp_save(pf); 1696 } else { 1697 pf = &kfpu->kfpu_ctx; 1698 1699 ASSERT3P(kfpu->kfpu_curthread, ==, curthread); 1700 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); 1701 1702 /* 1703 * Note, we can't use fp_save because it assumes that we're 1704 * saving to the thread's PCB and not somewhere else. Because 1705 * this is a different FPU context, we instead have to do this 1706 * ourselves. 1707 */ 1708 switch (fp_save_mech) { 1709 case FP_FXSAVE: 1710 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx); 1711 break; 1712 case FP_XSAVE: 1713 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask); 1714 break; 1715 default: 1716 panic("Invalid fp_save_mech"); 1717 } 1718 1719 /* 1720 * Because we have saved context here, our save state is no 1721 * longer valid and therefore needs to be reinitialized. 1722 */ 1723 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED; 1724 } 1725 1726 pf->fpu_flags |= FPU_VALID; 1727 1728 /* 1729 * Clear KFPU flag. This allows swtch to check for improper kernel 1730 * usage of the FPU (i.e. switching to a new thread while the old 1731 * thread was in the kernel and using the FPU, but did not perform a 1732 * context save). 1733 */ 1734 curthread->t_flag &= ~T_KFPU; 1735 } 1736 1737 static void 1738 kernel_fpu_ctx_restore(void *arg) 1739 { 1740 kfpu_state_t *kfpu = arg; 1741 fpu_ctx_t *pf; 1742 1743 if (kfpu == NULL) { 1744 /* 1745 * A NULL kfpu implies this is a kernel thread with an LWP and 1746 * no user-level FPU usage. Use the lwp fpu save area. 1747 */ 1748 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1749 1750 ASSERT(curthread->t_procp->p_flag & SSYS); 1751 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); 1752 } else { 1753 pf = &kfpu->kfpu_ctx; 1754 1755 ASSERT3P(kfpu->kfpu_curthread, ==, curthread); 1756 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); 1757 } 1758 1759 fp_restore(pf); 1760 curthread->t_flag |= T_KFPU; 1761 } 1762 1763 /* 1764 * Validate that the thread is not switching off-cpu while actively using the 1765 * FPU within the kernel. 1766 */ 1767 void 1768 kernel_fpu_no_swtch(void) 1769 { 1770 if ((curthread->t_flag & T_KFPU) != 0) { 1771 panic("curthread swtch-ing while the kernel is using the FPU"); 1772 } 1773 } 1774 1775 static const struct ctxop_template kfpu_ctxop_tpl = { 1776 .ct_rev = CTXOP_TPL_REV, 1777 .ct_save = kernel_fpu_ctx_save, 1778 .ct_restore = kernel_fpu_ctx_restore, 1779 }; 1780 1781 void 1782 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags) 1783 { 1784 klwp_t *pl = curthread->t_lwp; 1785 struct ctxop *ctx; 1786 1787 if ((curthread->t_flag & T_KFPU) != 0) { 1788 panic("curthread attempting to nest kernel FPU states"); 1789 } 1790 1791 /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */ 1792 ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) != 1793 (KFPU_USE_LWP | KFPU_NO_STATE)); 1794 1795 if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) { 1796 /* 1797 * Since we don't have a kfpu_state or usable lwp pcb_fpu to 1798 * hold our kernel FPU context, we depend on the caller doing 1799 * kpreempt_disable for the duration of our FPU usage. This 1800 * should only be done for very short periods of time. 1801 */ 1802 ASSERT(curthread->t_preempt > 0); 1803 ASSERT(kfpu == NULL); 1804 1805 if (pl != NULL) { 1806 /* 1807 * We might have already saved once so FPU_VALID could 1808 * be set. This is handled in fp_save. 1809 */ 1810 fp_save(&pl->lwp_pcb.pcb_fpu); 1811 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL; 1812 } 1813 1814 curthread->t_flag |= T_KFPU; 1815 1816 /* Always restore the fpu to the initial state. */ 1817 fpinit(); 1818 1819 return; 1820 } 1821 1822 /* 1823 * We either have a kfpu, or are using the LWP pcb_fpu for context ops. 1824 */ 1825 1826 if ((flags & KFPU_USE_LWP) == 0) { 1827 if (kfpu->kfpu_curthread != NULL) 1828 panic("attempting to reuse kernel FPU state at %p when " 1829 "another thread already is using", kfpu); 1830 1831 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0) 1832 kernel_fpu_fpstate_init(kfpu); 1833 1834 kfpu->kfpu_curthread = curthread; 1835 } 1836 1837 /* 1838 * Not all threads may have an active LWP. If they do and we're not 1839 * going to re-use the LWP, then we should go ahead and save the state. 1840 * We must also note that the fpu is now being used by the kernel and 1841 * therefore we do not want to manage the fpu state via the user-level 1842 * thread's context handlers. 1843 * 1844 * We might have already saved once (due to a prior use of the kernel 1845 * FPU or another code path) so FPU_VALID could be set. This is handled 1846 * by fp_save, as is the FPU_EN check. 1847 */ 1848 ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu); 1849 kpreempt_disable(); 1850 if (pl != NULL) { 1851 if ((flags & KFPU_USE_LWP) == 0) 1852 fp_save(&pl->lwp_pcb.pcb_fpu); 1853 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL; 1854 } 1855 1856 /* 1857 * Set the context operations for kernel FPU usage. Because kernel FPU 1858 * setup and ctxop attachment needs to happen under the protection of 1859 * kpreempt_disable(), we allocate the ctxop outside the guard so its 1860 * sleeping allocation will not cause a voluntary swtch(). This allows 1861 * the rest of the initialization to proceed, ensuring valid state for 1862 * the ctxop handlers. 1863 */ 1864 ctxop_attach(curthread, ctx); 1865 curthread->t_flag |= T_KFPU; 1866 1867 if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) { 1868 /* 1869 * For pure kernel threads with an LWP, we can use the LWP's 1870 * pcb_fpu to save/restore context. 1871 */ 1872 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu; 1873 1874 VERIFY(curthread->t_procp->p_flag & SSYS); 1875 VERIFY(kfpu == NULL); 1876 ASSERT((pf->fpu_flags & FPU_EN) == 0); 1877 1878 /* Always restore the fpu to the initial state. */ 1879 if (fp_save_mech == FP_XSAVE) 1880 pf->fpu_xsave_mask = XFEATURE_FP_ALL; 1881 fpinit(); 1882 pf->fpu_flags = FPU_EN | FPU_KERNEL; 1883 } else { 1884 /* initialize the kfpu state */ 1885 kernel_fpu_ctx_restore(kfpu); 1886 } 1887 kpreempt_enable(); 1888 } 1889 1890 void 1891 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags) 1892 { 1893 if ((curthread->t_flag & T_KFPU) == 0) { 1894 panic("curthread attempting to clear kernel FPU state " 1895 "without using it"); 1896 } 1897 1898 /* 1899 * General comments on why the rest of this function is structured the 1900 * way it is. Be aware that there is a lot of subtlety here. 1901 * 1902 * If a user-level thread ever uses the fpu while in the kernel, then 1903 * we cannot call fpdisable since that does STTS. That will set the 1904 * ts bit in %cr0 which will cause an exception if anything touches the 1905 * fpu. However, the user-level context switch handler (fpsave_ctxt) 1906 * needs to access the fpu to save the registers into the pcb. 1907 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in 1908 * fprestore_ctxt when the thread context switched onto the CPU. 1909 * 1910 * Calling fpdisable only effects the current CPU's %cr0 register. 1911 * 1912 * During ctxop_remove and kpreempt_enable, we can voluntarily context 1913 * switch, so the CPU we were on when we entered this function might 1914 * not be the same one we're on when we return from ctxop_remove or end 1915 * the function. Note there can be user-level context switch handlers 1916 * still installed if this is a user-level thread. 1917 * 1918 * We also must be careful in the unlikely chance we're running in an 1919 * interrupt thread, since we can't leave the CPU's %cr0 TS state set 1920 * incorrectly for the "real" thread to resume on this CPU. 1921 */ 1922 1923 if ((flags & KFPU_NO_STATE) == 0) { 1924 kpreempt_disable(); 1925 } else { 1926 ASSERT(curthread->t_preempt > 0); 1927 } 1928 1929 curthread->t_flag &= ~T_KFPU; 1930 1931 /* 1932 * When we are ending things, we explicitly don't save the current 1933 * kernel FPU state back to the temporary state. The kfpu API is not 1934 * intended to be a permanent save location. 1935 * 1936 * If this is a user-level thread and we were to context switch 1937 * before returning to user-land, fpsave_ctxt will be a no-op since we 1938 * already saved the user-level FPU state the first time we run 1939 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over 1940 * the user-level fpu state). The fpsave_ctxt functions only save if 1941 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so 1942 * fprestore_ctxt will be done in sys_rtt_common when the thread 1943 * finally returns to user-land. 1944 */ 1945 1946 if ((curthread->t_procp->p_flag & SSYS) != 0 && 1947 curthread->t_intr == NULL) { 1948 /* 1949 * A kernel thread which is not an interrupt thread, so we 1950 * STTS now. 1951 */ 1952 fpdisable(); 1953 } 1954 1955 if ((flags & KFPU_NO_STATE) == 0) { 1956 ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu); 1957 1958 if (kfpu != NULL) { 1959 if (kfpu->kfpu_curthread != curthread) { 1960 panic("attempting to end kernel FPU state " 1961 "for %p, but active thread is not " 1962 "curthread", kfpu); 1963 } else { 1964 kfpu->kfpu_curthread = NULL; 1965 } 1966 } 1967 1968 kpreempt_enable(); 1969 } 1970 1971 if (curthread->t_lwp != NULL) { 1972 uint_t f; 1973 1974 if (flags & KFPU_USE_LWP) { 1975 f = FPU_EN | FPU_KERNEL; 1976 } else { 1977 f = FPU_KERNEL; 1978 } 1979 curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f; 1980 } 1981 } 1982 1983 /* 1984 * Fill in FPU information that is required by exec. 1985 */ 1986 void 1987 fpu_auxv_info(int *typep, size_t *lenp) 1988 { 1989 *typep = fp_elf; 1990 switch (fp_save_mech) { 1991 case FP_FXSAVE: 1992 *lenp = sizeof (struct fxsave_state); 1993 break; 1994 case FP_XSAVE: 1995 *lenp = cpuid_get_xsave_size(); 1996 break; 1997 default: 1998 *lenp = 0; 1999 break; 2000 } 2001 } 2002 2003 /* 2004 * This function exists to transform an xsave_state into an fxsave_state. The 2005 * way that we have to do this is nuanced. We assume that callers have already 2006 * handled FPU_EN and thus we only need to consider the xsave_state and its 2007 * component vector itself. This results in the following cases that we need to 2008 * consider: 2009 * 2010 * o Neither the x87 / XMM state bits are set. We use the hardware default and 2011 * need to ensure to copy the xsave header. 2012 * o Both x87 / XMM state bits are set. We can copy everything. 2013 * o Only the x87 bit is set. We need to copy the x87 state but make the XMM 2014 * state be in the initial case. 2015 * o Only the XMM bit is set. The reverse of the above case. 2016 * 2017 * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are 2018 * generally the same; however, the default floating point control word is 2019 * different. 2020 * 2021 * Finally, we have the complication of the MXCSR and MCXSR_MASK registers. 2022 * Because we are using xsave and xsaveopt in the kernel right now and not 2023 * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the 2024 * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX 2025 * is set, we must also come back and copy out the MXCSR register. Sorry, we 2026 * don't make the rules. 2027 */ 2028 static void 2029 fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx) 2030 { 2031 const uint64_t comps = xsave->xs_header.xsh_xstate_bv; 2032 2033 switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) { 2034 case XFEATURE_LEGACY_FP | XFEATURE_SSE: 2035 bcopy(xsave, fx, sizeof (*fx)); 2036 return; 2037 case XFEATURE_LEGACY_FP: 2038 bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm)); 2039 fx->fx_mxcsr = SSE_MXCSR_INIT; 2040 fx->fx_mxcsr_mask = 0; 2041 break; 2042 case XFEATURE_SSE: 2043 bcopy(&sse_initial, fx, offsetof(struct fxsave_state, 2044 fx_mxcsr)); 2045 2046 fx->fx_fcw = FPU_CW_INIT_HW; 2047 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr; 2048 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask; 2049 bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm)); 2050 break; 2051 default: 2052 bcopy(&sse_initial, fx, sizeof (*fx)); 2053 fx->fx_fcw = FPU_CW_INIT_HW; 2054 break; 2055 } 2056 2057 /* 2058 * Account for the AVX causing MXCSR to be valid. 2059 */ 2060 if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 && 2061 (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) { 2062 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr; 2063 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask; 2064 } 2065 } 2066 2067 /* 2068 * This function is designed to answer the question of are we using any xsave 2069 * family of instructions in context switch and therefore we have this state. 2070 * This should still remain true if we are using xsavec or xsaves in the kernel 2071 * in the future. 2072 */ 2073 boolean_t 2074 fpu_xsave_enabled(void) 2075 { 2076 return (fp_save_mech == FP_XSAVE); 2077 } 2078 2079 /* 2080 * The following structure is used to track and manage the programmatic 2081 * construction of /proc and signal stack spilling of xsave information. All 2082 * known xsave types that the kernel supports must be included here. 2083 */ 2084 typedef struct xsave_proc_info { 2085 /* 2086 * This matches the /proc xregs type that this data represents. This s 2087 * used for /proc only. 2088 */ 2089 uint32_t xi_type; 2090 /* 2091 * This indicates the size of the /proc data that we're operating on. 2092 * This is only used for /proc. 2093 */ 2094 size_t xi_size; 2095 /* 2096 * This indicates the alignment that we want to have for the member when 2097 * we're writing out. This is not used when setting data. This is only 2098 * used for /proc. 2099 */ 2100 size_t xi_align; 2101 /* 2102 * This indicates whether this member must always be considered or not. 2103 * This is used in both /proc and context/signal handling. 2104 */ 2105 bool xi_always; 2106 /* 2107 * This contains the corresponding bits in the xsave bit vector that 2108 * corresponds to this entry. This is used for both /proc and 2109 * context/signal handling. 2110 */ 2111 uint64_t xi_bits; 2112 /* 2113 * The xi_fill function pointer is used to write out the /proc regset 2114 * data (e.g. when a user reads xregs). This is only used for the /proc 2115 * handling. The xi_valid function pointer is used instead to validate a 2116 * given set of data that we've read in, while the xi_set pointer is 2117 * used to actually transform the data in the underlying fpu save area. 2118 */ 2119 void (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *, 2120 void *); 2121 bool (*xi_valid)(model_t, const void *); 2122 void (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *, 2123 uint64_t, const void *); 2124 /* 2125 * The xi_signal_in and xi_signal_out function pointers are used for 2126 * extended context and signal handling information. They are used when 2127 * reading in data from a ucontext_t and writing it out respectively. 2128 * These are only used for context/signal handling. 2129 */ 2130 int (*xi_signal_in)(const struct xsave_proc_info *, 2131 const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *, 2132 const uintptr_t); 2133 int (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f, 2134 uc_xsave_t *, const void *fpup, uintptr_t); 2135 } xsave_proc_info_t; 2136 2137 static bool 2138 fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats) 2139 { 2140 const struct xsave_state *xs = fpu->fpu_regs.kfpu_u.kfpu_xs; 2141 2142 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) { 2143 return (true); 2144 } 2145 2146 return ((xs->xs_header.xsh_xstate_bv & feats) == 0); 2147 } 2148 2149 static void 2150 fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2151 void *datap) 2152 { 2153 prxregset_xcr_t *xcr = datap; 2154 2155 xcr->prx_xcr_xcr0 = xsave_bv_all; 2156 } 2157 2158 /* 2159 * Unlike other instruction portions, we treat the xsave header and the legacy 2160 * XMM section together as both are somewhat tied at the instruction hip. Unlike 2161 * the when dealing with other xsave regions like the ymm and zmm components, 2162 * the initial state here is much more nuanced as it has to match what we actual 2163 * do in the OS and depends on the components that are present. 2164 */ 2165 static void 2166 fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2167 void *datap) 2168 { 2169 prxregset_xsave_t *prxsave = datap; 2170 const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs; 2171 size_t hdr_off; 2172 2173 /* 2174 * In the x87/XMM case, the no device vs. initial state is different 2175 * because the initial state case still wants us to copy the real xsave 2176 * header. It's also worth calling out that the actual illumos default 2177 * fxsave state is not the same as what Intel documents. The main 2178 * difference is in what the x87 FPU control word is. This results in 2179 * the following different cases that we need to think about: 2180 * 2181 * o FPU_EN is not set. So we use the illumos default. 2182 */ 2183 if ((fpu->fpu_flags & FPU_EN) == 0) { 2184 bcopy(&avx_initial, prxsave, sizeof (*prxsave)); 2185 return; 2186 } 2187 2188 /* 2189 * Convert all the fxsave region while taking into account the validity 2190 * of the xsave bits. The prxregset_xsave_t structure is the same as the 2191 * xsave structure in our ABI and Intel designed the xsave header to 2192 * begin with the 512-bit fxsave structure. 2193 */ 2194 fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave); 2195 2196 /* 2197 * Now that we've dealt with the x87 and XMM state, take care of the 2198 * header. 2199 */ 2200 hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv); 2201 bcopy((const void *)((uintptr_t)xsave + hdr_off), 2202 (void *)((uintptr_t)prxsave + hdr_off), 2203 sizeof (struct xsave_header)); 2204 } 2205 2206 static void 2207 fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2208 void *datap) 2209 { 2210 if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) { 2211 size_t size, off; 2212 const void *xsave_off; 2213 2214 cpuid_get_xsave_info(info->xi_bits, &size, &off); 2215 ASSERT3U(size, ==, info->xi_size); 2216 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs + 2217 off); 2218 bcopy(xsave_off, datap, info->xi_size); 2219 } 2220 } 2221 2222 /* 2223 * Users are not allowed to actually set the xcr information this way. However, 2224 * to make it easier for someone to just do a read, modify, write, of the xregs 2225 * data, if it is identical, then we will accept it (and do nothing). 2226 */ 2227 static bool 2228 fpu_proc_xregs_xcr_valid(model_t model, const void *datap) 2229 { 2230 const prxregset_xcr_t *xcr = datap; 2231 2232 return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 && 2233 xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0); 2234 } 2235 2236 /* 2237 * To match traditional /proc semantics, we do not error if reserved bits of 2238 * MXCSR are set, they will be masked off when writing data. We do not allow 2239 * someone to indicate that they are asking for compressed xsave data, hence the 2240 * check that prx_xsh_comp_bv is zero. Separately, in fpu_proc_xregs_set() we 2241 * check that each component that was indicated in the xstate_bv is actually 2242 * present. 2243 */ 2244 static bool 2245 fpu_proc_xregs_xsave_valid(model_t model, const void *datap) 2246 { 2247 const prxregset_xsave_t *xsave = datap; 2248 uint64_t rsvd[6] = { 0 }; 2249 2250 if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 || 2251 xsave->prx_xsh_xcomp_bv != 0) { 2252 return (false); 2253 } 2254 2255 if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) { 2256 return (false); 2257 } 2258 2259 return (true); 2260 } 2261 2262 /* 2263 * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment 2264 * on x86; however, when operating in ILP32, subsets are reserved. We require 2265 * that all reserved portions are set to zero. 2266 */ 2267 static bool 2268 fpu_proc_xregs_ymm_valid(model_t model, const void *datap) 2269 { 2270 upad128_t ymm_zero[8]; 2271 const prxregset_ymm_t *ymm = datap; 2272 2273 if (model == DATAMODEL_LP64) { 2274 return (true); 2275 } 2276 2277 bzero(&ymm_zero, sizeof (ymm_zero)); 2278 return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0); 2279 } 2280 2281 static bool 2282 fpu_proc_xregs_zmm_valid(model_t model, const void *datap) 2283 { 2284 upad256_t zmm_zero[8]; 2285 const prxregset_zmm_t *zmm = datap; 2286 2287 if (model == DATAMODEL_LP64) { 2288 return (true); 2289 } 2290 2291 bzero(&zmm_zero, sizeof (zmm_zero)); 2292 return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0); 2293 } 2294 2295 static bool 2296 fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap) 2297 { 2298 prxregset_hi_zmm_t hi_zmm_zero; 2299 const prxregset_hi_zmm_t *hi_zmm = datap; 2300 2301 if (model == DATAMODEL_LP64) { 2302 return (true); 2303 } 2304 2305 bzero(&hi_zmm_zero, sizeof (hi_zmm_zero)); 2306 return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0); 2307 } 2308 2309 /* 2310 * The xsave state consists of the first 512 bytes of the XMM state and then the 2311 * xsave header itself. Because of the xsave header, this structure is marked 2312 * with xi_always, so we must always process and consider it. 2313 * 2314 * Semantically if either of the bits around SSE / x87 is set, then we will copy 2315 * the entire thing. This may mean that we end up copying a region that is not 2316 * valid into the save area; however, that should be OK as we still have the 2317 * specific bit flags that indicate what we should consider or not. 2318 * 2319 * There is one additional wrinkle we need to consider and honor here. The CPU 2320 * will load the MXCSR values if the AVX bit is set in an xrstor regardless of 2321 * anything else. So if this is set and we do not have a valid x87/XMM bits 2322 * set then we will set the MXCSR to its default state in case the processor 2323 * tries to load it. For reference see: 2324 * 2325 * o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR 2326 * o AMD64 Volume 2: Section 11.5.9 MXCSR State Management 2327 * 2328 * Note, the behavior around this changes depending on whether using the 2329 * compressed xrstor or not. We are not, but it's worth being aware of. We do 2330 * not worry about MXCSR_MASK because the instructions ignore it. 2331 */ 2332 static void 2333 fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2334 uint64_t xsave_bv, const void *datap) 2335 { 2336 const struct xsave_state *src_xs = datap; 2337 struct xsave_state *targ_xs = fpu->fpu_regs.kfpu_u.kfpu_xs; 2338 2339 if ((xsave_bv & info->xi_bits) != 0) { 2340 bcopy(&src_xs->xs_fxsave, &targ_xs->xs_fxsave, 2341 sizeof (struct fxsave_state)); 2342 } else if ((xsave_bv & XFEATURE_AVX) != 0) { 2343 targ_xs->xs_fxsave.fx_mxcsr = SSE_MXCSR_INIT; 2344 } 2345 2346 bcopy(&src_xs->xs_header, &targ_xs->xs_header, 2347 sizeof (struct xsave_header)); 2348 targ_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask; 2349 } 2350 2351 static void 2352 fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2353 uint64_t xsave_bv, const void *datap) 2354 { 2355 size_t size, off; 2356 void *xsave_off; 2357 2358 cpuid_get_xsave_info(info->xi_bits, &size, &off); 2359 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs + 2360 off); 2361 bcopy(datap, xsave_off, size); 2362 } 2363 2364 /* 2365 * Dealing with XMM data is a little more annoying in signal context. If UC_FPU 2366 * is set, the ucontext_t's fpregset_t contains a copy of the XMM region. That 2367 * must take priority over an XMM region that showed up in the uc_xsave_t data. 2368 * In the signal copyout code we do not save XMM region in the uc_xsave_t or set 2369 * it as a present component because of it being kept in the fpregset_t. Because 2370 * of this behavior, if we find the XMM (or x87) state bits present, we treat 2371 * that as an error. 2372 * 2373 * The system has always gone through and cleaned up the reserved bits in the 2374 * fxsave state when someone calls setcontext(). Therefore we need to do the 2375 * same thing which is why you see the masking of the mxcsr below. 2376 * 2377 * Finally, there is one last wrinkle here that we need to consider. The 2378 * fpregset_t has two private words which cache the status/exception 2379 * information. Therefore, we well... cheat. Intel has left bytes 464 (0x1d0) 2380 * through 511 (0x1ff) available for us to do what we want. So we will pass this 2381 * through that for the moment to help us pass this state around without too 2382 * much extra allocation. 2383 */ 2384 static int 2385 fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc, 2386 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap, 2387 const uintptr_t max_udata) 2388 { 2389 struct xsave_state *xsave = fpup; 2390 2391 if ((ucx->ucx_bv & info->xi_bits) != 0) { 2392 return (EINVAL); 2393 } 2394 2395 if ((kuc->uc_flags & UC_FPU) != 0) { 2396 bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave, 2397 sizeof (struct fxsave_state)); 2398 xsave->xs_fxsave.__fx_ign2[3]._l[0] = 2399 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status; 2400 xsave->xs_fxsave.__fx_ign2[3]._l[1] = 2401 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus; 2402 xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask; 2403 xsave->xs_header.xsh_xstate_bv |= info->xi_bits; 2404 } 2405 2406 return (0); 2407 } 2408 2409 static int 2410 fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc, 2411 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap, 2412 const uintptr_t max_udata) 2413 { 2414 size_t len, xsave_off; 2415 void *copy_to; 2416 struct xsave_state *xsave = fpup; 2417 2418 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off); 2419 if (*udatap + len > max_udata) { 2420 return (EOVERFLOW); 2421 } 2422 2423 copy_to = (void *)((uintptr_t)fpup + xsave_off); 2424 if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) { 2425 return (EFAULT); 2426 } 2427 2428 xsave->xs_header.xsh_xstate_bv |= info->xi_bits; 2429 *udatap = *udatap + len; 2430 2431 return (0); 2432 } 2433 2434 static int 2435 fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc, 2436 uc_xsave_t *ucx, const void *fpup, uintptr_t udatap) 2437 { 2438 size_t len, xsave_off; 2439 const void *copy_from; 2440 void *copy_to; 2441 int ret; 2442 2443 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off); 2444 copy_from = (void *)(uintptr_t)fpup + xsave_off; 2445 copy_to = (void *)(udatap + ucx->ucx_len); 2446 2447 ret = copyfunc(copy_from, copy_to, len); 2448 if (ret != 0) { 2449 return (ret); 2450 } 2451 2452 ucx->ucx_len += len; 2453 ucx->ucx_bv |= info->xi_bits; 2454 return (0); 2455 } 2456 2457 /* 2458 * This table contains information about the extended FPU states and synthetic 2459 * information we create for /proc, the ucontext_t, and signal handling. The 2460 * definition of the xsave_proc_info_t describes how each member is used. 2461 * 2462 * In general, this table is expected to be in the order of the xsave data 2463 * structure itself. Synthetic elements that we create can go anywhere and new 2464 * ones should be inserted at the end. This structure is walked in order to 2465 * produce the /proc and signal handling logic, so changing the order is 2466 * meaningful for those and should not be done lightly. 2467 */ 2468 static const xsave_proc_info_t fpu_xsave_info[] = { { 2469 .xi_type = PRX_INFO_XCR, 2470 .xi_size = sizeof (prxregset_xcr_t), 2471 .xi_align = alignof (prxregset_xcr_t), 2472 .xi_always = true, 2473 .xi_bits = 0, 2474 .xi_fill = fpu_proc_xregs_xcr_fill, 2475 .xi_valid = fpu_proc_xregs_xcr_valid 2476 }, { 2477 /* 2478 * The XSAVE entry covers both the xsave header and the %xmm registers. 2479 * Note, there is no signal copyout information for the %xmm registers 2480 * because it is expected that that data is already in the fpregset_t. 2481 */ 2482 .xi_type = PRX_INFO_XSAVE, 2483 .xi_size = sizeof (prxregset_xsave_t), 2484 .xi_align = FPU_ALIGN_XMM, 2485 .xi_always = true, 2486 .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE, 2487 .xi_fill = fpu_proc_xregs_xsave_fill, 2488 .xi_set = fpu_proc_xregs_xsave_set, 2489 .xi_valid = fpu_proc_xregs_xsave_valid, 2490 .xi_signal_in = fpu_signal_copyin_xmm 2491 }, { 2492 .xi_type = PRX_INFO_YMM, 2493 .xi_size = sizeof (prxregset_ymm_t), 2494 .xi_align = FPU_ALIGN_YMM, 2495 .xi_always = false, 2496 .xi_bits = XFEATURE_AVX, 2497 .xi_fill = fpu_proc_xregs_std_fill, 2498 .xi_set = fpu_proc_xregs_std_set, 2499 .xi_signal_in = fpu_signal_copyin_std, 2500 .xi_valid = fpu_proc_xregs_ymm_valid, 2501 .xi_signal_out = fpu_signal_copyout_std 2502 }, { 2503 /* 2504 * There is no /proc validation function for the mask registers because 2505 * they are the same in ILP32 / LP64 and there is nothing for us to 2506 * actually validate. 2507 */ 2508 .xi_type = PRX_INFO_OPMASK, 2509 .xi_size = sizeof (prxregset_opmask_t), 2510 .xi_align = alignof (prxregset_opmask_t), 2511 .xi_always = false, 2512 .xi_bits = XFEATURE_AVX512_OPMASK, 2513 .xi_fill = fpu_proc_xregs_std_fill, 2514 .xi_set = fpu_proc_xregs_std_set, 2515 .xi_signal_in = fpu_signal_copyin_std, 2516 .xi_signal_out = fpu_signal_copyout_std 2517 }, { 2518 .xi_type = PRX_INFO_ZMM, 2519 .xi_size = sizeof (prxregset_zmm_t), 2520 .xi_align = FPU_ALIGN_ZMM, 2521 .xi_always = false, 2522 .xi_bits = XFEATURE_AVX512_ZMM, 2523 .xi_fill = fpu_proc_xregs_std_fill, 2524 .xi_set = fpu_proc_xregs_std_set, 2525 .xi_valid = fpu_proc_xregs_zmm_valid, 2526 .xi_signal_in = fpu_signal_copyin_std, 2527 .xi_signal_out = fpu_signal_copyout_std 2528 }, { 2529 .xi_type = PRX_INFO_HI_ZMM, 2530 .xi_size = sizeof (prxregset_hi_zmm_t), 2531 .xi_align = FPU_ALIGN_ZMM, 2532 .xi_always = false, 2533 .xi_bits = XFEATURE_AVX512_HI_ZMM, 2534 .xi_fill = fpu_proc_xregs_std_fill, 2535 .xi_set = fpu_proc_xregs_std_set, 2536 .xi_valid = fpu_proc_xregs_hi_zmm_valid, 2537 .xi_signal_in = fpu_signal_copyin_std, 2538 .xi_signal_out = fpu_signal_copyout_std 2539 } }; 2540 2541 static bool 2542 fpu_proc_xregs_include(const xsave_proc_info_t *infop) 2543 { 2544 return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0); 2545 } 2546 2547 void 2548 fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep, 2549 uint32_t *dstart) 2550 { 2551 size_t ret = sizeof (prxregset_hdr_t); 2552 uint32_t ninfo = 0; 2553 2554 ASSERT(fpu_xsave_enabled()); 2555 2556 /* 2557 * Right now the set of flags that are enabled in the FPU is global. 2558 * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the 2559 * actual things that might show up and we care about are all about what 2560 * is set up in %xcr0 which is stored in the global xsave_bv_all. If we 2561 * move to per-process FPU enablement which is likely to come with AMX, 2562 * then this will need the proc_t to look at, hence why we've set things 2563 * up with the unused variable above. 2564 * 2565 * We take two passes through the array. The first is just to count up 2566 * how many informational entries we need. 2567 */ 2568 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2569 if (!fpu_proc_xregs_include(&fpu_xsave_info[i])) 2570 continue; 2571 ninfo++; 2572 } 2573 2574 ASSERT3U(ninfo, >, 0); 2575 ret += sizeof (prxregset_info_t) * ninfo; 2576 2577 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2578 size_t curphase; 2579 if (!fpu_proc_xregs_include(&fpu_xsave_info[i])) 2580 continue; 2581 2582 curphase = ret % fpu_xsave_info[i].xi_align; 2583 if (ret < fpu_xsave_info[i].xi_align) { 2584 ret = fpu_xsave_info[i].xi_align; 2585 } else if (curphase != 0) { 2586 ret += curphase; 2587 } 2588 2589 if (i == 0 && dstart != NULL) { 2590 *dstart = ret; 2591 } 2592 2593 ret += fpu_xsave_info[i].xi_size; 2594 } 2595 2596 VERIFY3U(ret, <=, UINT32_MAX); 2597 if (sizep != NULL) { 2598 *sizep = ret; 2599 } 2600 2601 if (ninfop != NULL) { 2602 *ninfop = ninfo; 2603 } 2604 } 2605 2606 /* 2607 * This function supports /proc. Because /proc does not have a process locked 2608 * while processing a PCSXREG, this tries to establish an upper bound that we 2609 * will validate later in fpu_proc_xregs_set(). We basically say that if you 2610 * take the maximum xsave size and add 1 KiB that is a good enough approximation 2611 * for the maximum size. The 1 KiB is us basically trying to rationalize the 2612 * overhead of our structures that we're adding right, while being cognisant of 2613 * differing alignments and the fact that the full xsave size is in some cases 2614 * (when supervisor states or features we don't support are present) going to be 2615 * larger than we would need for this. 2616 */ 2617 size_t 2618 fpu_proc_xregs_max_size(void) 2619 { 2620 VERIFY(fpu_xsave_enabled()); 2621 return (cpuid_get_xsave_size() + 0x1000); 2622 } 2623 2624 /* 2625 * This functions supports /proc. In particular, it's meant to perform the 2626 * following: 2627 * 2628 * o Potentially save the current thread's registers. 2629 * o Write out the x86 xsave /proc xregs format data from the xsave data we 2630 * actually have. Note, this can be a little weird for cases where the FPU is 2631 * not actually enabled, which happens for system processes. 2632 */ 2633 void 2634 fpu_proc_xregs_get(klwp_t *lwp, void *buf) 2635 { 2636 uint32_t size, ninfo, curinfo, dstart; 2637 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu; 2638 prxregset_hdr_t *hdr = buf; 2639 2640 ASSERT(fpu_xsave_enabled()); 2641 fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart); 2642 2643 /* 2644 * Before we get going, defensively zero out all the data buffer so that 2645 * the rest of the fill functions can assume a specific base. 2646 */ 2647 bzero(buf, size); 2648 2649 kpreempt_disable(); 2650 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 2651 /* 2652 * This case suggests that thread in question doesn't have a 2653 * valid FPU save state which should only happen when it is on 2654 * CPU. If this is the case, we must ensure that we save the 2655 * current FPU state before proceeding. We also sanity check 2656 * several things here before doing this as using /proc on 2657 * yourself is always exciting. fp_save() will ensure that the 2658 * thread is flagged to go back to being an eager FPU before 2659 * returning back to userland. 2660 */ 2661 VERIFY3P(curthread, ==, lwptot(lwp)); 2662 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 2663 fp_save(fpu); 2664 } 2665 kpreempt_enable(); 2666 2667 hdr->pr_type = PR_TYPE_XSAVE; 2668 hdr->pr_size = size; 2669 hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] = 2670 hdr->pr_pad[3] = 0; 2671 hdr->pr_ninfo = ninfo; 2672 2673 curinfo = 0; 2674 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2675 void *startp; 2676 uint32_t phase; 2677 2678 if (!fpu_proc_xregs_include(&fpu_xsave_info[i])) 2679 continue; 2680 2681 phase = dstart % fpu_xsave_info[i].xi_align; 2682 if (dstart < fpu_xsave_info[i].xi_align) { 2683 ASSERT3U(i, !=, 0); 2684 dstart = fpu_xsave_info[i].xi_align; 2685 } else if (phase != 0) { 2686 ASSERT3U(i, !=, 0); 2687 dstart += phase; 2688 } 2689 2690 hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type; 2691 hdr->pr_info[curinfo].pri_flags = 0; 2692 hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size; 2693 hdr->pr_info[curinfo].pri_offset = dstart; 2694 2695 startp = (void *)((uintptr_t)buf + dstart); 2696 fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp); 2697 dstart += fpu_xsave_info[i].xi_size; 2698 ASSERT3U(curinfo, <=, ninfo); 2699 curinfo++; 2700 } 2701 } 2702 2703 /* 2704 * We have been asked to set the data in the FPU for a given thread. Our 2705 * prmachdep code has already validated that the raw semantics of the data that 2706 * we have are valid (that is the appropriate sizes, offsets, and flags). We now 2707 * apply additional checking here: 2708 * 2709 * o The xsave structure is present and only valid bits are set. 2710 * o If the xsave component bit-vector is set, we have the corresponding proc 2711 * info item. 2712 * o Read-only items are ignored if and only if they actually match what we 2713 * gave the user mostly as a courtesy to simplify things here. 2714 * o ILP32 processes which can't support many of the regions are allowed to 2715 * have the items here (as we likely gave them to them), but they must be 2716 * zero if they are set. 2717 * 2718 * We take a first pass through all the data, validating it makes sense for the 2719 * FPU. Only after that point do we ensure that we have the FPU data in question 2720 * and then we clobber all the FPU data. Part of the semantics of setting this 2721 * is that we're setting the entire extended FPU. 2722 */ 2723 int 2724 fpu_proc_xregs_set(klwp_t *lwp, void *buf) 2725 { 2726 prxregset_hdr_t *prx = buf; 2727 model_t model = lwp_getdatamodel(lwp); 2728 uint64_t bv_found = 0; 2729 const prxregset_xsave_t *xsave = NULL; 2730 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu; 2731 2732 VERIFY(fpu_xsave_enabled()); 2733 2734 /* 2735 * First, walk each note info header that we have from the user and 2736 * proceed to validate it. The prmachdep code has already validated that 2737 * the size, type, and offset information is valid, but it has not 2738 * validated the semantic contents of this or if someone is trying to 2739 * write something they shouldn't. 2740 * 2741 * While we walk this, we keep track of where the xsave header is. We 2742 * also track all of the bits that we have found along the way so we can 2743 * match up and ensure that everything that was set has a corresponding 2744 * bit in the xsave bitmap. If we have something in the xsave bitmap, 2745 * but not its corresponding data, then that is an error. However, we 2746 * allow folks to write data regions without the bit set in the xsave 2747 * data to make the read, modify, write process simpler. 2748 */ 2749 for (uint32_t i = 0; i < prx->pr_ninfo; i++) { 2750 const prxregset_info_t *info = &prx->pr_info[i]; 2751 bool found = false; 2752 2753 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) { 2754 void *data; 2755 if (info->pri_type != fpu_xsave_info[pt].xi_type) 2756 continue; 2757 2758 found = true; 2759 data = (void *)((uintptr_t)buf + info->pri_offset); 2760 if (fpu_xsave_info[pt].xi_valid != NULL && 2761 !fpu_xsave_info[pt].xi_valid(model, data)) { 2762 return (EINVAL); 2763 } 2764 2765 if (info->pri_type == PRX_INFO_XSAVE) { 2766 xsave = data; 2767 } 2768 bv_found |= fpu_xsave_info[pt].xi_bits; 2769 break; 2770 } 2771 2772 if (!found) { 2773 return (EINVAL); 2774 } 2775 } 2776 2777 /* 2778 * No xsave data, no dice. 2779 */ 2780 if (xsave == NULL) { 2781 return (EINVAL); 2782 } 2783 2784 /* 2785 * If anything is set in the xsave header that was not found as we 2786 * walked structures, then that is an error. The opposite is not true as 2787 * discussed above. 2788 */ 2789 if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) { 2790 return (EINVAL); 2791 } 2792 2793 /* 2794 * At this point, we consider all the data actually valid. Now we must 2795 * set up this information in the save area. If this is our own lwp, we 2796 * must disable it first. Otherwise, we expect that it is already valid. 2797 * To try to sanitize this, we will defensively zero the entire region 2798 * as we are setting everything that will result in here. 2799 */ 2800 kpreempt_disable(); 2801 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 2802 /* 2803 * This case suggests that thread in question doesn't have a 2804 * valid FPU save state which should only happen when it is on 2805 * CPU. If this is the case, we explicitly disable the FPU, but 2806 * do not save it before proceeding. We also sanity check 2807 * several things here before doing this as using /proc on 2808 * yourself is always exciting. Unlike fp_save(), fp_free() does 2809 * not signal that an update is required, so we unconditionally 2810 * set that for all threads. 2811 */ 2812 VERIFY3P(curthread, ==, lwptot(lwp)); 2813 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 2814 fp_free(fpu); 2815 } 2816 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); 2817 bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, 2818 cpuid_get_xsave_size()); 2819 2820 for (uint32_t i = 0; i < prx->pr_ninfo; i++) { 2821 const prxregset_info_t *info = &prx->pr_info[i]; 2822 bool found = false; 2823 2824 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) { 2825 const void *data; 2826 if (info->pri_type != fpu_xsave_info[pt].xi_type) 2827 continue; 2828 2829 /* 2830 * Check if we have a set function and if we should 2831 * include this. We may not if this is something like 2832 * PRX_INFO_XCR which is read-only. 2833 * 2834 * We may not include a given entry as it may not have 2835 * been set in the actual xsave state that we have been 2836 * asked to restore, in which case to not break the 2837 * xsaveopt logic, we must leave it in its initial 2838 * state, e.g. zeroed (generally). XMM data initial 2839 * state is not zeroed, but is marked with xi_always to 2840 * help account for this. 2841 */ 2842 found = true; 2843 if (fpu_xsave_info[pt].xi_set == NULL) 2844 break; 2845 if (!fpu_xsave_info[pt].xi_always && 2846 (xsave->prx_xsh_xstate_bv & 2847 fpu_xsave_info[pt].xi_bits) != 2848 fpu_xsave_info[pt].xi_bits) { 2849 break; 2850 } 2851 2852 data = (void *)((uintptr_t)buf + info->pri_offset); 2853 fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt], 2854 xsave->prx_xsh_xstate_bv, data); 2855 } 2856 2857 VERIFY(found); 2858 } 2859 kpreempt_enable(); 2860 2861 return (0); 2862 } 2863 2864 /* 2865 * To be included in the signal copyout logic we must have a copy function and 2866 * the bit in question must be included. Note, we don't consult xi_always here 2867 * as that is really part of what is always present for xsave logic and 2868 * therefore isn't really pertinent here because of our custom format. See the 2869 * big theory statement for more info. 2870 */ 2871 static bool 2872 fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv) 2873 { 2874 return ((infop->xi_bits & xs_bv) == infop->xi_bits && 2875 infop->xi_signal_out != NULL); 2876 } 2877 2878 /* 2879 * We need to fill out the xsave related data into the ucontext_t that we've 2880 * been given. We should have a valid user pointer at this point in the uc_xsave 2881 * member. This is much simpler than the copyin that we have. Here are the 2882 * current assumptions: 2883 * 2884 * o This is being called for the current thread. This is not meant to operate 2885 * on an arbitrary thread's state. 2886 * o We cannot assume whether the FPU is valid in the pcb or not. While most 2887 * callers will have just called getfpregs() which saved the state, don't 2888 * assume that. 2889 * o We assume that the user address has the requisite required space for this 2890 * to be copied out. 2891 * o We assume that copyfunc() will ensure we are not copying into a kernel 2892 * address. 2893 * 2894 * For more information on the format of the data, see the 'Signal Handling and 2895 * the ucontext_t' portion of the big theory statement. We copy out all the 2896 * constituent parts and then come back and write out the actual final header 2897 * information. 2898 */ 2899 int 2900 fpu_signal_copyout(klwp_t *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc) 2901 { 2902 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 2903 uint64_t xs_bv; 2904 uc_xsave_t ucx; 2905 int ret; 2906 2907 VERIFY3P(curthread, ==, lwptot(lwp)); 2908 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 2909 VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN); 2910 2911 if (!fpu_xsave_enabled()) { 2912 return (ENOTSUP); 2913 } 2914 2915 /* 2916 * Unlike when we're dealing with /proc, we can unconditionally call 2917 * fp_save() because this is always called in the context where the lwp 2918 * we're operating on is always the one on CPU (which is what fp_save() 2919 * asserts). 2920 */ 2921 fp_save(fpu); 2922 2923 bzero(&ucx, sizeof (ucx)); 2924 ucx.ucx_vers = UC_XSAVE_VERS; 2925 ucx.ucx_len += sizeof (uc_xsave_t); 2926 2927 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv; 2928 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2929 const xsave_proc_info_t *info = &fpu_xsave_info[i]; 2930 2931 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv)) 2932 continue; 2933 ret = info->xi_signal_out(info, copyfunc, &ucx, 2934 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, 2935 uaddr); 2936 if (ret != 0) { 2937 kpreempt_enable(); 2938 return (ret); 2939 } 2940 } 2941 2942 /* 2943 * Now that everything has been copied out, we should have an accurate 2944 * value in the uc_xsave_t header and we can copy that out at the start 2945 * of the user data. 2946 */ 2947 ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx)); 2948 return (ret); 2949 } 2950 2951 /* 2952 * Here we've been given a ucontext_t which potentially has a user pointer to 2953 * xsave state that we've copied out previously. In this case we need to do the 2954 * following, assuming UC_XSAVE is present: 2955 * 2956 * o Copy in our header and validate it. 2957 * o Allocate an fpu context to use as a holding ground for all this data. 2958 * o If UC_FPU is set, override the xsave structure with the saved XMM state, 2959 * clear UC_FPU, and make sure that the correct xsave_bv bits are set. 2960 * 2961 * Currently we always allocate the additional state as a holding ground for the 2962 * FPU. What we're copying in may not be valid and we don't want to clobber the 2963 * existing FPU state or deal with merging it until we believe it's reasonable 2964 * enough. The proc_t is here to set us up for when we have per-process settings 2965 * in the extended feature disable MSRs. 2966 */ 2967 int 2968 fpu_signal_copyin(klwp_t *lwp, ucontext_t *kuc) 2969 { 2970 uc_xsave_t ucx; 2971 uint64_t bv; 2972 uintptr_t data, max_data; 2973 void *fpu; 2974 proc_t *p = lwp->lwp_procp; 2975 size_t ksize; 2976 2977 /* 2978 * Because this has been opaque filler and the kernel has never 2979 * historically looked at it, we don't really care about the uc_xsave 2980 * pointer being garbage in the case that the flag is not set. While 2981 * this isn't perhaps the most sporting choice in some cases, this is on 2982 * the other hand, pragmatic. 2983 */ 2984 if ((kuc->uc_flags & UC_XSAVE) != 0) { 2985 if (kuc->uc_xsave == 0) { 2986 return (EINVAL); 2987 } 2988 2989 if (!fpu_xsave_enabled()) { 2990 return (ENOTSUP); 2991 } 2992 } else { 2993 return (0); 2994 } 2995 2996 if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) != 2997 0) { 2998 return (EFAULT); 2999 } 3000 3001 ksize = cpuid_get_xsave_size(); 3002 if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) || 3003 ucx.ucx_len > ksize || 3004 (ucx.ucx_bv & ~xsave_bv_all) != 0 || 3005 (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len < 3006 (uintptr_t)kuc->uc_xsave) { 3007 return (EINVAL); 3008 } 3009 3010 /* 3011 * OK, our goal right now is to recreate a valid xsave_state structure 3012 * that we'll ultimately end up having to merge with our existing one in 3013 * the FPU save state. The reason we describe this as a merge is to help 3014 * future us when we want to retain supervisor state which will never be 3015 * part of userland signal state. The design of the userland signal 3016 * state is basically to compress it as much as we can. This is done for 3017 * two reasons: 3018 * 3019 * 1) We currently consider this a private interface. 3020 * 2) We really want to minimize the actual amount of stack space we 3021 * use as much as possible. Most applications aren't using AVX-512 3022 * right now, so doing our own compression style is worthwhile. If 3023 * libc adopts AVX-512 routines, we may want to change this. 3024 * 3025 * On the allocation below, our assumption is that if a thread has taken 3026 * a signal, then it is likely to take a signal again in the future (or 3027 * be shortly headed to its demise). As such, when that happens we will 3028 * leave the allocated signal stack around for the process. Most 3029 * applications don't allow all threads to take signals, so this should 3030 * hopefully help amortize the cost of the allocation. 3031 */ 3032 max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len; 3033 data = (uintptr_t)kuc->uc_xsave + sizeof (ucx); 3034 bv = ucx.ucx_bv; 3035 if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) { 3036 lwp->lwp_pcb.pcb_fpu.fpu_signal = 3037 kmem_cache_alloc(fpsave_cachep, KM_SLEEP); 3038 } 3039 fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal; 3040 3041 /* 3042 * Unconditionally initialize the memory we get in here to ensure that 3043 * it is in a reasonable state for ourselves. This ensures that unused 3044 * regions are mostly left in their initial state (the main exception 3045 * here is the x87/XMM state, but that should be OK). We don't fill in 3046 * the initial xsave state as we expect that to happen as part of our 3047 * processing. 3048 */ 3049 bzero(fpu, ksize); 3050 3051 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 3052 int ret; 3053 const xsave_proc_info_t *info = &fpu_xsave_info[i]; 3054 if (!info->xi_always && (info->xi_bits & bv) == 0) 3055 continue; 3056 bv &= ~info->xi_bits; 3057 3058 if (info->xi_signal_in == NULL) 3059 continue; 3060 ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data); 3061 if (ret != 0) { 3062 return (ret); 3063 } 3064 } 3065 ASSERT0(bv); 3066 3067 /* 3068 * As described in the big theory statement section 'Signal Handling and 3069 * the ucontext_t', we always remove UC_FPU from here as we've taken 3070 * care of reassembling it ourselves. 3071 */ 3072 kuc->uc_flags &= ~UC_FPU; 3073 kuc->uc_xsave = (uintptr_t)fpu; 3074 3075 return (0); 3076 } 3077 3078 /* 3079 * This determines the size of the signal stack that we need for our custom form 3080 * of the xsave state. 3081 */ 3082 size_t 3083 fpu_signal_size(klwp_t *lwp) 3084 { 3085 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3086 size_t len = sizeof (uc_xsave_t); 3087 uint64_t xs_bv; 3088 3089 VERIFY3P(curthread, ==, lwptot(lwp)); 3090 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3091 VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN); 3092 3093 if (!fpu_xsave_enabled()) { 3094 return (0); 3095 } 3096 3097 kpreempt_disable(); 3098 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3099 fp_save(fpu); 3100 } 3101 3102 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv; 3103 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 3104 size_t comp_size; 3105 3106 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv)) 3107 continue; 3108 3109 cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size, 3110 NULL); 3111 len += comp_size; 3112 } 3113 3114 kpreempt_enable(); 3115 return (len); 3116 } 3117 3118 /* 3119 * This function is used in service of restorecontext() to set the specified 3120 * thread's extended FPU state to the passed in data. Our assumptions at this 3121 * point from the system are: 3122 * 3123 * o Someone has already verified that the actual xsave header is correct. 3124 * o Any traditional XMM state that causes a #gp has been clamped. 3125 * o That data is basically the correct sized xsave state structure. Right now 3126 * that means it is not compressed and follows the CPUID-based rules for 3127 * constructing and laying out data. 3128 * o That the lwp argument refers to the current thread. 3129 * 3130 * Our primary purpose here is to merge the current FPU state with what exists 3131 * here. Right now, "merge", strictly speaking is just "replace". We can get 3132 * away with just replacing everything because all we currently save are user 3133 * states. If we start saving kernel states in here, this will get more nuanced 3134 * and we will need to be more careful about how we store data here. 3135 */ 3136 void 3137 fpu_set_xsave(klwp_t *lwp, const void *data) 3138 { 3139 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3140 uint32_t status, xstatus; 3141 struct xsave_state *dst_xsave; 3142 3143 VERIFY(fpu_xsave_enabled()); 3144 VERIFY3P(curthread, ==, lwptot(lwp)); 3145 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3146 ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN); 3147 3148 /* 3149 * We use fp_save() here rather than a stock fpdisable() so we can 3150 * attempt to honor our invariants that when the thread state has been 3151 * saved, the valid flag is set, even though we're going to be 3152 * overwriting it shortly. If we just called fpdisable() then we would 3153 * basically be asking for trouble. 3154 * 3155 * Because we are modifying the state here and we don't want the system 3156 * to end up in an odd state, we are being a little paranoid and 3157 * disabling preemption across this operation. In particular, once the 3158 * state is properly tagged with FPU_VALID, there should be no other way 3159 * that this thread can return to userland and get cleared out because 3160 * we're resetting its context; however, we let paranoia win out. 3161 */ 3162 kpreempt_disable(); 3163 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3164 fp_save(fpu); 3165 } 3166 3167 bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, 3168 cpuid_get_xsave_size()); 3169 dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic; 3170 status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0]; 3171 xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1]; 3172 dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0; 3173 dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0; 3174 3175 /* 3176 * These two status words are information that the kernel itself uses to 3177 * track additional information and is part of the traditional fpregset, 3178 * but is not part of our xregs information. Because we are setting this 3179 * state, we leave it up to the rest of the kernel to determine whether 3180 * this came from an fpregset_t or is being reset to the default of 0. 3181 */ 3182 fpu->fpu_regs.kfpu_status = status; 3183 fpu->fpu_regs.kfpu_xstatus = xstatus; 3184 3185 fpu->fpu_flags |= FPU_VALID; 3186 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); 3187 kpreempt_enable(); 3188 } 3189 3190 /* 3191 * Convert the current FPU state to the traditional fpregset_t. In the 64-bit 3192 * kernel, this is just an fxsave_state with additional values for the status 3193 * and xstatus members. 3194 * 3195 * This has the same nuance as the xregs cases discussed above, but is simpler 3196 * in that we only need to handle the fxsave state, but more complicated because 3197 * we need to check our save mechanism. 3198 */ 3199 void 3200 fpu_get_fpregset(klwp_t *lwp, fpregset_t *fp) 3201 { 3202 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3203 3204 kpreempt_disable(); 3205 fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status; 3206 fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus; 3207 3208 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3209 /* 3210 * If we're requesting the fpregs of a thread that isn't 3211 * currently valid and isn't the one that we're executing, then 3212 * we consider getting this information to be a best-effort and 3213 * we will not stop the thread in question to serialize it, 3214 * which means possibly getting stale data. This is the 3215 * traditional semantics that the system has used to service 3216 * this for /proc. 3217 */ 3218 if (curthread == lwptot(lwp)) { 3219 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3220 fp_save(fpu); 3221 } 3222 } 3223 3224 /* 3225 * If the FPU is not enabled and the state isn't valid (due to someone 3226 * else setting it), just copy the initial state. 3227 */ 3228 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) { 3229 bcopy(&sse_initial, fp, sizeof (sse_initial)); 3230 kpreempt_enable(); 3231 return; 3232 } 3233 3234 /* 3235 * Given that we have an enabled FPU, we must look at the type of FPU 3236 * save mechanism to clean this up. In particular, while we can just 3237 * copy the save area with FXSAVE, with XSAVE we must carefully copy 3238 * only the bits that are valid and reset the rest to their default 3239 * state. 3240 */ 3241 switch (fp_save_mech) { 3242 case FP_FXSAVE: 3243 bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp, 3244 sizeof (struct fxsave_state)); 3245 break; 3246 case FP_XSAVE: 3247 fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs, 3248 (struct fxsave_state *)fp); 3249 break; 3250 default: 3251 panic("Invalid fp_save_mech"); 3252 } 3253 3254 kpreempt_enable(); 3255 } 3256 3257 /* 3258 * This is a request to set the ABI fpregset_t into our actual hardware state. 3259 * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the 3260 * 512-byte fxsave area. 3261 */ 3262 void 3263 fpu_set_fpregset(klwp_t *lwp, const fpregset_t *fp) 3264 { 3265 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3266 3267 kpreempt_disable(); 3268 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3269 /* 3270 * We always save the entire FPU. This is required if we're 3271 * using xsave. If we're using fxsave, we could skip the 3272 * 512-byte write and instead just disable the FPU since we'd be 3273 * replacing it all. For now we don't bother with more 3274 * conditional logic. 3275 */ 3276 VERIFY3P(curthread, ==, lwptot(lwp)); 3277 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3278 fp_save(fpu); 3279 } 3280 3281 fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus; 3282 fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status; 3283 switch (fp_save_mech) { 3284 case FP_FXSAVE: 3285 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx, 3286 sizeof (struct fxsave_state)); 3287 break; 3288 case FP_XSAVE: 3289 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs, 3290 sizeof (struct fxsave_state)); 3291 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= 3292 XFEATURE_LEGACY_FP | XFEATURE_SSE; 3293 break; 3294 default: 3295 panic("Invalid fp_save_mech"); 3296 } 3297 3298 fpu->fpu_flags |= FPU_VALID; 3299 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); 3300 kpreempt_enable(); 3301 } 3302