1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2021 Joyent, Inc. 24 * Copyright 2021 RackTop Systems, Inc. 25 * Copyright 2023 Oxide Computer Company 26 * Copyright 2025 Edgecast Cloud LLC. 27 */ 28 29 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 30 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 31 /* All Rights Reserved */ 32 33 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 34 /* All Rights Reserved */ 35 36 /* 37 * Copyright (c) 2009, Intel Corporation. 38 * All rights reserved. 
39 */ 40 41 #include <sys/types.h> 42 #include <sys/param.h> 43 #include <sys/signal.h> 44 #include <sys/regset.h> 45 #include <sys/privregs.h> 46 #include <sys/psw.h> 47 #include <sys/trap.h> 48 #include <sys/fault.h> 49 #include <sys/systm.h> 50 #include <sys/user.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/pcb.h> 54 #include <sys/lwp.h> 55 #include <sys/cpuvar.h> 56 #include <sys/thread.h> 57 #include <sys/disp.h> 58 #include <sys/fp.h> 59 #include <sys/siginfo.h> 60 #include <sys/archsystm.h> 61 #include <sys/kmem.h> 62 #include <sys/debug.h> 63 #include <sys/x86_archext.h> 64 #include <sys/sysmacros.h> 65 #include <sys/cmn_err.h> 66 #include <sys/kfpu.h> 67 #include <sys/stdbool.h> 68 #include <sys/stdalign.h> 69 #include <sys/procfs_isa.h> 70 #include <sys/sunddi.h> 71 72 /* 73 * FPU Management Overview 74 * ----------------------- 75 * 76 * The x86 FPU has evolved substantially since its days as the x87 coprocessor; 77 * however, many aspects of its life as a coprocessor are still around in x86. 78 * 79 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU. 80 * While that state still exists, there is much more that is covered by the FPU. 81 * Today, this includes not just traditional FPU state, but also supervisor only 82 * state. The following state is currently managed and covered logically by the 83 * idea of the FPU registers and more generally is called the Extended Processor 84 * States: 85 * 86 * o Traditional x87 FPU 87 * o Vector Registers (%xmm, %ymm, %zmm) 88 * o Memory Protection Extensions (MPX) Bounds Registers 89 * o Protected Key Rights Registers (PKRU) 90 * o Processor Trace data 91 * o Control-Flow Enforcement state 92 * o Hardware Duty Cycle 93 * o Hardware P-states 94 * 95 * The rest of this covers how the FPU is managed and controlled, how state is 96 * saved and restored between threads, interactions with hypervisors, and other 97 * information exported to userland through aux vectors. 
A lot of background 98 * information is here to synthesize major parts of the Intel SDM, but 99 * unfortunately, it is not a replacement for reading it. 100 * 101 * FPU Control Registers 102 * --------------------- 103 * 104 * Because the x87 FPU began its life as a co-processor and the FPU was 105 * optional, there are several bits that show up in %cr0 that we have to 106 * manipulate when dealing with the FPU. These are: 107 * 108 * o CR0.ET The 'extension type' bit. This was used originally to indicate 109 * that the FPU co-processor was present. Now it is forced on for 110 * compatibility. This is often used to verify whether or not the 111 * FPU is present. 112 * 113 * o CR0.NE The 'Numeric Error' bit. Used to indicate that native error 114 * mode should be enabled. This indicates that we should take traps 115 * on FPU errors. The OS enables this early in boot. 116 * 117 * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not 118 * wait/fwait instructions generate a #NM if CR0.TS is set. 119 * 120 * o CR0.EM The 'Emulation' bit. This is used to cause floating point 121 * operations (x87 through SSE4) to trap with a #UD so they can be 122 * emulated. The system never sets this bit, but makes sure it is 123 * clear on processor start up. 124 * 125 * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating 126 * point operation will generate a #NM. An fwait will as well, 127 * depending on the value in CR0.MP. 128 * 129 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by 130 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more 131 * complicated role. Historically it has been used to allow running systems to 132 * restore the FPU registers lazily. This will be discussed in greater depth 133 * later on. 134 * 135 * %cr4 is also used as part of the FPU control. 
Specifically we need to worry 136 * about the following bits in the system: 137 * 138 * o CR4.OSFXSR This bit is used to indicate that the OS understands and 139 * supports the execution of the fxsave and fxrstor 140 * instructions. This bit is required to be set to enable 141 * the use of the SSE->SSE4 instructions. 142 * 143 * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand 144 * and take a SIMD floating point exception (#XM). This bit 145 * is always enabled by the system. 146 * 147 * o CR4.OSXSAVE This bit is used to indicate that the OS understands and 148 * supports the execution of the xsave and xrstor family of 149 * instructions. This bit is required to use any of the AVX 150 * and newer feature sets. 151 * 152 * Because all supported processors are 64-bit, they'll always support the XMM 153 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot. 154 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid. 155 * 156 * %xcr0 is used to manage the behavior of the xsave feature set and is only 157 * present on the system if xsave is supported. %xcr0 is read and written 158 * through the xgetbv and xsetbv instructions. This register is present 159 * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a 160 * different component of the xsave state and controls whether or not that 161 * information is saved and restored. For newer feature sets like AVX and MPX, 162 * it also controls whether or not the corresponding instructions can be 163 * executed (much like CR4.OSFXSR does for the SSE feature sets). 164 * 165 * Everything in %xcr0 is around features available to users. There is also the 166 * IA32_XSS MSR which is used to control supervisor-only features that are still 167 * part of the xsave state. Bits that can be set in %xcr0 are reserved in 168 * IA32_XSS and vice versa. 
This is an important property that is particularly 169 * relevant to how the xsave instructions operate. 170 * 171 * Save Mechanisms 172 * --------------- 173 * 174 * When switching between running threads the FPU state needs to be saved and 175 * restored by the OS. If this state was not saved, users would rightfully 176 * complain about corrupt state. There are three mechanisms that exist on the 177 * processor for saving and restoring these state images: 178 * 179 * o fsave 180 * o fxsave 181 * o xsave 182 * 183 * fsave saves and restores only the x87 FPU and is the oldest of these 184 * mechanisms. This mechanism is never used in the kernel today because we are 185 * always running on systems that support fxsave. 186 * 187 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register 188 * state to be saved and restored to and from a struct fxsave_state. This is the 189 * default mechanism that is used to save and restore the FPU on amd64. An 190 * important aspect of fxsave that was different from the original i386 fsave 191 * mechanism is that the restoring of FPU state with pending exceptions will not 192 * generate an exception, it will be deferred to the next use of the FPU. 193 * 194 * The final and by far the most complex mechanism is that of the xsave set. 195 * xsave allows for saving and restoring all of the traditional x86 pieces (x87 196 * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc. 197 * registers. 198 * 199 * Data is saved and restored into and out of a struct xsave_state. The first 200 * part of the struct xsave_state is equivalent to the struct fxsave_state. 201 * After that, there is a header which is used to describe the remaining 202 * portions of the state. The header is a 64-byte value of which the first two 203 * uint64_t values are defined and the rest are reserved and must be zero. The 204 * first uint64_t is the xstate_bv member. 
This describes which values in the 205 * xsave_state are actually valid and present. This is updated on a save and 206 * used on restore. The second member is the xcomp_bv member. Its last bit 207 * determines whether or not a compressed version of the structure is used. 208 * 209 * When the uncompressed structure is used (currently the only format we 210 * support), then each state component is at a fixed offset in the structure, 211 * even if it is not being used. For example, if you only saved the AVX related 212 * state, but did not save the MPX related state, the offset would not change 213 * for any component. With the compressed format, components that aren't used 214 * are all elided (though the x87 and SSE state are always there). 215 * 216 * Unlike fxsave which saves all state, the xsave family does not always save 217 * and restore all the state that could be covered by the xsave_state. The 218 * instructions all take an argument which is a mask of what to consider. This 219 * is the same mask that will be used in the xstate_bv vector and it is also the 220 * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only 221 * considered with the xsaves and xrstors instructions. 222 * 223 * When a save or restore is requested, a bitwise and is performed between the 224 * requested bits and those that have been enabled in %xcr0. Only the bits that 225 * match that are then saved or restored. Others will be silently ignored by 226 * the processor. This idea is used often in the OS. We will always request that 227 * we save and restore all of the state, but only those portions that are 228 * actually enabled in %xcr0 will be touched. 229 * 230 * If a feature has been asked to be restored that is not set in the xstate_bv 231 * feature vector of the save state, then it will be set to its initial state by 232 * the processor (usually zeros). 
Also, when asked to save state, the processor 233 * may not write out data that is in its initial state as an optimization. This 234 * optimization only applies to saving data and not to restoring data. 235 * 236 * There are a few different variants of the xsave and xrstor instruction. They 237 * are: 238 * 239 * o xsave This is the original save instruction. It will save all of the 240 * requested data in the xsave state structure. It only saves data 241 * in the uncompressed (xcomp_bv[63] is zero) format. It may be 242 * executed at all privilege levels. 243 * 244 * o xrstor This is the original restore instruction. It will restore all of 245 * the requested data. The xrstor function can handle both the 246 * compressed and uncompressed formats. It may be executed at all 247 * privilege levels. 248 * 249 * o xsaveopt This is a variant of the xsave instruction that employs 250 * optimizations to try and only write out state that has been 251 * modified since the last time an xrstor instruction was called. 252 * The processor tracks a tuple of information about the last 253 * xrstor and tries to ensure that the same buffer is being used 254 * when this optimization is being used. However, because of the 255 * way that it tracks the xrstor buffer based on the address of it, 256 * it is not suitable for use if that buffer can be easily reused. 257 * The most common case is trying to save data to the stack in 258 * rtld. It may be executed at all privilege levels. 259 * 260 * o xsavec This is a variant of the xsave instruction that writes out the 261 * compressed form of the xsave_state. Otherwise it behaves as 262 * xsave. It may be executed at all privilege levels. 263 * 264 * o xsaves This is a variant of the xsave instruction. It is similar to 265 * xsavec in that it always writes the compressed form of the 266 * buffer. 
Unlike all the other forms, this instruction looks at 267 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine 268 * what to save and restore. xsaves also implements the same 269 * optimization that xsaveopt does around modified pieces. User 270 * land may not execute the instruction. 271 * 272 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves 273 * it can save and restore both the user and privileged states. 274 * Unlike xrstor it can only operate on the compressed form. 275 * User land may not execute the instruction. 276 * 277 * Based on all of these, the kernel has a precedence for what it will use. 278 * Basically, xsaves (not supported) is preferred to xsaveopt, which is 279 * preferred to xsave. A similar scheme is used when informing rtld (more later) 280 * about what it should use. xsavec is preferred to xsave. xsaveopt is not 281 * recommended due to the modified optimization not being appropriate for this 282 * use. 283 * 284 * Finally, there is one last gotcha with the xsave state. Importantly some AMD 285 * processors did not always save and restore some of the FPU exception state in 286 * some cases like Intel did. In those cases the OS will make up for this fact 287 * itself. 288 * 289 * FPU Initialization 290 * ------------------ 291 * 292 * One difference with the FPU registers is that not all threads have FPU state, 293 * only those that have an lwp. Generally this means kernel threads, which all 294 * share p0 and its lwp, do not have FPU state. Though there are definitely 295 * exceptions such as kcfpoold. In the rest of this discussion we'll use thread 296 * and lwp interchangeably, just think of thread meaning a thread that has a 297 * lwp. 298 * 299 * Each lwp has its FPU state allocated in its pcb (process control block). The 300 * actual storage comes from the fpsave_cachep kmem cache. 
This cache is sized 301 * dynamically at start up based on the save mechanism that we're using and the 302 * amount of memory required for it. This is dynamic because the xsave_state 303 * size varies based on the supported feature set. 304 * 305 * The hardware side of the FPU is initialized early in boot before we mount the 306 * root file system. This is effectively done in fpu_probe(). This is where we 307 * make the final decision about what the save and restore mechanisms we should 308 * use are, create the fpsave_cachep kmem cache, and initialize a number of 309 * function pointers that use save and restoring logic. 310 * 311 * The thread/lwp side is a little more involved. There are two different 312 * things that we need to concern ourselves with. The first is how the FPU 313 * resources are allocated and the second is how the FPU state is initialized 314 * for a given lwp. 315 * 316 * We allocate the FPU save state from our kmem cache as part of lwp_fp_init(). 317 * This is always called unconditionally by the system as part of creating an 318 * LWP. 319 * 320 * There are three different initialization paths that we deal with. The first 321 * is when we are executing a new process. As part of exec all of the register 322 * state is reset. The exec case is particularly important because init is born 323 * like Athena, sprouting from the head of the kernel, without any true parent 324 * to fork from. The second is used whenever we fork or create a new lwp. The 325 * third is to deal with special lwps like the agent lwp. 326 * 327 * During exec, we will call fp_exec() which will initialize and set up the FPU 328 * state for the process. That will fill in the initial state for the FPU and 329 * also set that state in the FPU itself. As part of fp_exec() we also install a 330 * thread context operations vector that takes care of dealing with the saving 331 * and restoring of the FPU. 
These context handlers will also be called whenever 332 * an lwp is created or forked. In those cases, to initialize the FPU we will 333 * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context 334 * operations vector for the new thread. 335 * 336 * Next we'll end up in the context operation fp_new_lwp(). This saves the 337 * current thread's state, initializes the new thread's state, and copies over 338 * the relevant parts of the originating thread's state. It's at this point that 339 * we also install the FPU context operations into the new thread, which ensures 340 * that all future threads that are descendants of the current one get the 341 * thread context operations (unless they call exec). 342 * 343 * To deal with some things like the agent lwp, we double check the state of the 344 * FPU in sys_rtt_common() to make sure that it has been enabled before 345 * returning to userland. In general, this path should be rare, but it's useful 346 * for the odd lwp here and there. 347 * 348 * The FPU state will remain valid most of the time. There are times that 349 * the state will be rewritten. For example in restorecontext, due to /proc, or 350 * the lwp calls exec(). Whether the context is being freed or we are resetting 351 * the state, we will call fp_free() to disable the FPU and our context. 352 * 353 * Finally, when the lwp is destroyed, it will actually destroy and free the FPU 354 * state by calling fp_lwp_cleanup(). 355 * 356 * Kernel FPU Multiplexing 357 * ----------------------- 358 * 359 * Just as the kernel has to maintain all of the general purpose registers when 360 * switching between scheduled threads, the same is true of the FPU registers. 361 * 362 * When a thread has FPU state, it also has a set of context operations 363 * installed. These context operations take care of making sure that the FPU is 364 * properly saved and restored during a context switch (fpsave_ctxt and 365 * fprestore_ctxt respectively). 
This means that the current implementation of 366 * the FPU is 'eager', when a thread is running the CPU will have its FPU state 367 * loaded. While this is always true when executing in userland, there are a few 368 * cases where this is not true in the kernel. 369 * 370 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was 371 * employed. This meant that the FPU would be saved on a context switch and the 372 * CR0.TS bit would be set. When a thread next tried to use the FPU, it would 373 * then take a #NM trap, at which point we would restore the FPU from the save 374 * area and return to userland. Given the frequency of use of the FPU alone by 375 * libc, there's no point returning to userland just to trap again. 376 * 377 * There are a few cases though where the FPU state may need to be changed for a 378 * thread on its behalf. The most notable cases are in the case of processes 379 * using /proc, restorecontext, forking, etc. In all of these cases the kernel 380 * will force a thread's FPU state to be saved into the PCB through the fp_save() 381 * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the 382 * pcb. This indicates that the save state holds currently valid data. As a side 383 * effect of this, CR0.TS will be set. To make sure that all of the state is 384 * updated before returning to userland, in these cases, we set a flag on the 385 * PCB that says the FPU needs to be updated. This will make sure that we take 386 * the slow path out of a system call to fix things up for the thread. Due to 387 * the fact that this is a rather rare case, effectively setting the equivalent 388 * of t_postsys is acceptable. 389 * 390 * CR0.TS will be set after a save occurs and cleared when a restore occurs. 391 * Generally this means it will be cleared immediately by the new thread that is 392 * running in a context switch. However, this isn't the case for kernel threads. 
393 * They currently operate with CR0.TS set as no kernel state is restored for 394 * them. This means that using the FPU will cause a #NM and panic. 395 * 396 * The FPU_VALID flag on the currently executing thread's pcb is meant to track 397 * what the value of CR0.TS should be. If it is set, then CR0.TS will be set. 398 * However, because we eagerly restore, the only time that CR0.TS should be set 399 * for a non-kernel thread is during operations where it will be cleared before 400 * returning to userland and importantly, the only data that is in it is its 401 * own. 402 * 403 * Kernel FPU Usage 404 * ---------------- 405 * 406 * Traditionally the kernel never used the FPU since it had no need for 407 * floating point operations. However, modern FPU hardware supports a variety 408 * of SIMD extensions which can speed up code such as parity calculations or 409 * encryption. 410 * 411 * To allow the kernel to take advantage of these features, the 412 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped 413 * around any usage of the FPU by the kernel to ensure that user-level context 414 * is properly saved/restored, as well as to properly setup the FPU for use by 415 * the kernel. There are a variety of ways this wrapping can be used, as 416 * discussed in this section below. 417 * 418 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended 419 * operations, the kernel_fpu_alloc() function should be used to allocate a 420 * kfpu_state_t structure that is used to save/restore the thread's kernel FPU 421 * state. This structure is not tied to any thread. That is, different threads 422 * can reuse the same kfpu_state_t structure, although not concurrently. A 423 * kfpu_state_t structure is freed by the kernel_fpu_free() function. 424 * 425 * In some cases, the kernel may need to use the FPU for a short operation 426 * without the overhead to manage a kfpu_state_t structure and without 427 * allowing for a context switch off the FPU. 
In this case the KFPU_NO_STATE 428 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags 429 * parameter. This indicates that there is no kfpu_state_t. When used this way, 430 * kernel preemption should be disabled by the caller (kpreempt_disable) before 431 * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end(). 432 * For this usage, it is important to limit the kernel's FPU use to short 433 * operations. The tradeoff between using the FPU without a kfpu_state_t 434 * structure vs. the overhead of allowing a context switch while using the FPU 435 * should be carefully considered on a case by case basis. 436 * 437 * In other cases, kernel threads have an LWP, but never execute in user space. 438 * In this situation, the LWP's pcb_fpu area can be used to save/restore the 439 * kernel's FPU state if the thread is context switched, instead of having to 440 * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the 441 * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to 442 * enable this behavior. It is the caller's responsibility to ensure that this 443 * is only used for a kernel thread which never executes in user space. 444 * 445 * FPU Exceptions 446 * -------------- 447 * 448 * Certain operations can cause the kernel to take traps due to FPU activity. 449 * Generally these events will cause a user process to receive a SIGFPE and if 450 * the kernel receives it in kernel context, we will die. Traditionally the #NM 451 * (Device Not Available / No Math) exception generated by CR0.TS would have 452 * caused us to restore the FPU. Now it is a fatal event regardless of whether 453 * or not userland causes it. 454 * 455 * While there are some cases where the kernel uses the FPU, it is up to the 456 * kernel to use the FPU in a way such that it cannot receive a trap or to use 457 * the appropriate trap protection mechanisms. 
458 * 459 * Hypervisors 460 * ----------- 461 * 462 * When providing support for hypervisors things are a little bit more 463 * complicated because the FPU is not virtualized at all. This means that they 464 * need to save and restore the FPU and %xcr0 across entry and exit to the 465 * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These 466 * allow us to use the full native state to make sure that we are always saving 467 * and restoring the full FPU that the host sees, even when the guest is using a 468 * subset. 469 * 470 * One tricky aspect of this is that the guest may be using a subset of %xcr0 471 * and therefore changing our %xcr0 on the fly. It is vital that when we're 472 * saving and restoring the FPU that we always use the largest %xcr0 contents 473 * otherwise we will end up leaving behind data in it. 474 * 475 * ELF PLT Support 476 * --------------- 477 * 478 * rtld has to preserve a subset of the FPU when it is saving and restoring 479 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for 480 * more information. As a result, we set up an aux vector that contains 481 * information about what save and restore mechanisms it should be using and 482 * the sizing thereof based on what the kernel supports. This is passed down in 483 * a series of aux vectors AT_SUN_FPTYPE and AT_SUN_FPSIZE. This information is 484 * initialized in fpu_subr.c. 485 * 486 * Signal Handling and the ucontext_t 487 * ---------------------------------- 488 * 489 * One of the many gifts that signals give us is the twofold fact that when a 490 * signal occurs, the signal handler is allowed to change the CPU's state 491 * arbitrarily and when the signal handler is done executing, we must restore it 492 * back to the original state. However, the second part of this is that the 493 * signal handler is actually allowed to modify the state that the thread will 494 * return to! 
To create this facade, the kernel will create a full ucontext_t 495 * state, effectively calling getcontext(2) on the thread's behalf, and a 496 * pointer to that is given to the signal handler (the void * argument for the 497 * sa_sigaction function pointer in sigaction(2)). When libc is done with a 498 * signal, it will call setcontext(2) with that same ucontext_t. 499 * 500 * Now, the ucontext_t has a fixed ABI for both ILP32 and LP64 environments and 501 * it's often declared on the stack itself, with the signal handler spilling all 502 * this state to the stack. The ucontext_t machine portion was broken into the 503 * general purpose and floating point registers. In 64-bit code, the floating 504 * point registers were mostly the same as the results of the fxsave instruction 505 * (i.e. struct fxsave_state). While the 64-bit kernel still uses the equivalent 506 * starting point for information, it is transformed into a different shape to 507 * deal with the history of the 32-bit SYS V ABI. 508 * 509 * While this worked, if you're reading this, you're aware that the x86 FPU and 510 * extended register states didn't stop at the initial 16 128-bit %xmm 511 * registers. Since then we have added 256-bit %ymm, 512-bit %zmm, and the %k 512 * opmask registers. None of these fit inside the standard ucontext_t; however, 513 * they must all be preserved and restored across a signal. While the various 514 * x86 platform-specific ABIs all suggest that these registers are not preserved 515 * across a function call, receiving a signal is not a function call and must be 516 * thought of like a process receiving an interrupt. In other words, this 517 * extended state must be preserved. 518 * 519 * To facilitate this, we have extended the ucontext_t structure with an 520 * additional flag, UC_XSAVE, which indicates that the traditional padding 521 * member, uc_xsave, actually is a pointer to the extended state. 
While this is 522 * accessible outside of a signal handling context through the combination of 523 * ucontext_alloc(3C) and getcontext_extd(2), our design around saving this 524 * state is focused on signal handling. Signal handling spills all this state to 525 * the stack and if we cannot spill the entire state to the stack then our 526 * inability to deliver the signal results in the process being killed! While 527 * there are separate efforts to ensure that the signal stack sizing that is 528 * used for the minimum and maximum signal sizes is sufficient, we still need 529 * to do our part to minimize the likelihood here. 530 * 531 * In designing this, we make the following observations which have helped us 532 * focus our design: 533 * 534 * o While the start of an xsave area is the traditional 512-byte fxsave XMM 535 * region, we already have that in the fpregs. Thus there is no reason to 536 * duplicate it. This not only saves 512 bytes of additional stack space, 537 * but it also means we don't have to ask which version of it to take 538 * if they were to differ. 539 * 540 * o Many applications out there aren't necessarily using the extended vectors 541 * and even when we do make libc and others take advantage of it, it will 542 * behoove us to ensure that they are put back into their initial state 543 * after use. This leads us to expect that in a number of cases, the actual 544 * extended register state will be in its initial state. 545 * 546 * o While the signal handler does allow contents to be modified, we are 547 * starting with making the interface private and thus allowing us to excise 548 * components that are in their initial state. 549 * 550 * o There are similarities to what we want to create with the compressed 551 * xsave format; however, because we don't always have support for the 552 * compressed format, we can't just arbitrarily say let's do a compressed 553 * save to the user stack. 
554 * 555 * o Because we are not handing this state directly to and from hardware, we 556 * don't need to meet some of the constraints of the compressed xsave format 557 * around wanting alignment for the initial save or additional components. 558 * 559 * All of the above lead us to our own unique format for this data. When the 560 * UC_XSAVE flag is set in the ucontext_t, the uc_xsave member points to a 561 * uc_xsave_t structure which has a magic version number, a 32-bit length of the 562 * overall structure, and the 64-bit state bit-vector to represent which 563 * components are valid. Following this header, each component that is 564 * present in the bit vector is immediately written out in roughly ascending bit 565 * order (the order is determined based on the order of the fpu_xsave_info 566 * array). 567 * 568 * This makes the rough logic that we have here when taking a signal and writing 569 * out this state as: 570 * 571 * 1. Ensure that the FPU is saved and that the contents of the pcb save area 572 * are valid. That is, call fp_save() if the state is not already flagged 573 * with FPU_VALID. 574 * 575 * 2. Copy the bit-vector from the save area and remove the XFEATURE_LEGACY_FP 576 * and XFEATURE_SSE bits as these are already covered by the fpregs and will not be placed in the xsave area. 577 * 578 * 3. Initialize the uc_xsave_t by setting our version field, initializing the 579 * length to the length of the current structure, and then setting the 580 * modified bit vector above. 581 * 582 * 4. Walk each remaining bit of the bit-vector. For each set bit, copy out 583 * its extended state starting at the current length in the header and then 584 * increase the header size by that length. 585 * 586 * 5. Finally write out the final uc_xsave_t structure. 587 * 588 * The above process is also used when someone manually calls getcontext_extd(2) 589 * to get this state. The main difference between the two is which copyout 590 * function we use. This deserves some explanation. 
Our main starting point for 591 * all the logic here is fpu_signal_copyout(). It takes a copyfunc that allows 592 * the signal handling context to operate with a different copyout than we 593 * normally use in say getcontext_extd(2). 594 * 595 * When we've received a signal, we're at the intersection of several different 596 * gotchas. Normal copyout (or ddi_copyout()) will trigger watchpoints. That is, 597 * the watchpoints effectively set a copyout override function (t_copyops) that 598 * we end up vectoring to rather than a normal copyout. This allows the data to 599 * be modified and for the watchpoint to fire. While this is all well and good 600 * normally, it is problematic if we are trying to handle a signal. The signal 601 * delivery logic, sendsig(), goes through and disables the watchpoint for the 602 * region of the stack that we are copying out to. However, disabling 603 * watchpoints is not sufficient, we also need to use the copyout_noerr 604 * variants. 605 * 606 * These variants also require the use of on_fault() and no_fault() for error 607 * handling. While it is tempting to try and on_fault() the entire 608 * fpu_signal_copyout() operation, that is actually fraught for a few reasons. 609 * The first is that we don't want to disable faults during the entire operation 610 * as if the kernel messes up we will treat that as a user error. That isn't 611 * theoretical and happened during development. The second and perhaps more 612 * important issue is that correctly bounding the on_fault() / no_fault() means 613 * being careful about state. For example, kernel pre-emption is often disabled 614 * during parts of these operations, but it needs to be re-enabled when we're 615 * done. This would require tracking in some volatile variable that this had 616 * been enabled and disabled and tracking that. 617 * 618 * Instead, this is why fpu_signal_copyout() takes a copy out function as an 619 * argument.
When we're in signal handling context, the function will use 620 * copyout_noerr() and wrap it in the appropriate on_fault() mechanisms. 621 * 622 * RESTORING STATE 623 * 624 * Copying out our current state is the easier half of this problem. When the 625 * kernel is done with a signal it calls setcontext(2) with the ucontext_t we 626 * assembled for it as described above. setcontext(2) isn't just used for 627 * returning from signals. 628 * 629 * The process for this goes in two steps. The first step is to copy in, 630 * validate, and transform the ucontext_t UC_XSAVE that we created above into an 631 * equivalent xsave format that we can use the appropriate xrstor function on. 632 * This first phase is implemented in fpu_signal_copyin(). Once that is done, we 633 * come back through a second phase that is driven out of restorecontext() and 634 * is implemented in fpu_set_xsave(). 635 * 636 * Let's start by discussing the second part of this, which is more 637 * straightforward. In particular, the second phase assumes that all of the 638 * validation and error handling has been done by the first phase. This means 639 * here, we have a buffer that is already the appropriate size 640 * (cpuid_get_xsave_size()) and all we need to do is make sure that we can 641 * replace the actual save state with the current one. 642 * 643 * The only piece of shenanigans we have to do is around the kernel provided 644 * notion of 'status' and 'xstatus', which are cached versions of the x87 and 645 * SSE exception vectors. These are part of the fpregset ABI and therefore we 646 * need to propagate them from the temporary storage that part 1 sets up in the 647 * ignored region of the fxsave data. We use that because it is not persisted by 648 * the CPU, so clobbering it is generally alright. 649 * 650 * Once that is done, we simply note that we need a PCB update to occur to 651 * refresh the FPU state before we return to userland.
Given that someone has 652 * called setcontext(2), this was always going to happen because we have to 653 * update segment registers and related, so this isn't so bad. With that, let's 654 * move onto the more nuanced part (1). 655 * 656 * When we're handling a setcontext(2) we have, in userland, a data structure 657 * that should match one we serialized out, though we cannot assume that a user 658 * has not modified it either accidentally or maliciously. Our goal is to set up 659 * the appropriate xsave state that can be passed to the CPU's xrstor. The first 660 * problem we have to deal with is where do we actually put this state? 661 * 662 * While not many programs actually call setcontext(2) of their own volition, 663 * this is going to get hit every time we take a signal. The first thought was 664 * to re-use the existing thread's save area; however, that's a bit challenging 665 * for a few reasons. In particular, we would need to ensure that we don't go 666 * off-CPU for any reason, which we cannot assume with a copyin from a user 667 * address space. In particular, it is trivial for us to hit a case where the 668 * stack has been paged out for some reason, which eschews that path. 669 * 670 * Instead, whenever a thread first calls setcontext(2), generally from signal 671 * context, we will at that time allocate another entry from the 'fpsave_cachep' 672 * kmem cache, giving us a buffer of the appropriate space to handle this. Once 673 * this buffer has been allocated, we leave it assigned to the thread's pcb and 674 * only tear it down when the thread itself finally exits. We reason that a 675 * thread that takes a signal once is either going to have the process exit 676 * shortly thereafter or is much more likely to take a signal again in the 677 * future. Many daemons and other processes set things up so signals are 678 * dispatched via one location, masking signals in other thread, using 679 * sigsuspend(2), signalfd(3C), or something similar. 
680 * 681 * With this buffer in hand, we begin our task of reassembling state. Note, all 682 * of this is conditional on UC_XSAVE being set in the uc_flags member of the 683 * ucontext_t. If it is not set, then we assume that there is no extended state 684 * and will use the traditional path of setting the fpregset_t into the system 685 * via setfpregs(). 686 * 687 * We first will copyin and validate the uc_xsave_t. In particular, we need to 688 * make sure the version makes sense, that the xsave component bit-vector 689 * doesn't have anything unexpected and more importantly unsupported in it, and 690 * that the addresses we've been given are within the user address space. At 691 * this point we can walk through our table of implemented bits and process 692 * them. 693 * 694 * For most components in here, the processing is straightforward. We continue 695 * walking our cursor and copy data into the kernel and place it in the 696 * appropriate place in our xsave state. If a xsave state component bit-vector 697 * isn't set, then we must ensure that we have the item in the initial state, 698 * which for everything other than the x87/SSE state is the memory being zeroed. 699 * 700 * The most unique case in the copyin state is that of the x87/SSE state. You 701 * might recall that we didn't copy it out explicitly as part of the uc_xsave_t, 702 * but instead have opted to use the single definition in the fpregset_t. Thus 703 * here, we copy it out of the fpregset_t, which the kernel has helpfully 704 * already unified into the 64-bit fxsave version prior to calling us, and 705 * install that into the save area we're building up. 706 * 707 * As part of this, there are two important pieces to be aware of. 
The first is 708 * that because the fpregset_t has both the status and xstatus members 709 * mentioned earlier, we temporarily copy them to the software-usable ignored 710 * areas of the fxsave state so we can corral this extra state into part (2) 711 * without needing to allocate additional space. The second piece is that when 712 * we're done processing this we explicitly remove the UC_FPU flag that would 713 * tell the kernel to proceed with updating that region. The problem is that 714 * that goes directly into the pcb's save area and not to the intermediate 715 * buffer as it uses the same entry point as /proc, mainly setfpregs(). 716 * 717 * We don't do much validation of the actual contents of the registers that are 718 * being set with the exception of ensuring that no reserved bits of the mxcsr 719 * are used. This is not as strict as /proc, but failure here means the process 720 * is likely going to die (returning from setcontext() in a signal handler is 721 * fatal). 722 * 723 * /proc xregs 724 * ----------- 725 * 726 * Observability of the state of the extended registers is important for 727 * understanding the system. While on the surface this is similar to signal 728 * handling, it is crucially different in a number of ways: 729 * 730 * o In signal handling, we're trying to conserve every byte of stack that we 731 * can. 732 * o The /proc xregs file will end up in core files, which means that we need 733 * a way of knowing what components are present and not present in it, 734 * because this will vary from CPU to CPU due to the addition of 735 * architectural features. For example, some CPUs support AVX-512, but 736 * others do not. 737 * 738 * o The signal handling structure (uc_xsave_t) is private and we're not 739 * trying to have software modify it, on the other hand, the /proc 740 * interfaces that we support we do want software to be able to interrogate 741 * and manipulate. 
These need to be something that we can introduce 742 * additional components into and make other changes that still allow it to 743 * work. 744 * 745 * The x86 xregs format is documented in proc(5). The short form is that the 746 * prxregset_hdr_t has a number of information entries, which are of the type 747 * prxregset_info_t. Each of the information headers has a type, size, and 748 * offset which indicate where to find the additional data. 749 * 750 * Each entry is described as one of the entries in the fpu_xsave_info[]. These 751 * items either are a 1:1 correspondence with a xsave related feature (e.g. 752 * there is one entry for each of the three AVX-512 components) or it is 753 * something synthetic that we provide as additional information such as the 754 * PRX_INFO_XCR, which is a way of getting information about the system such as 755 * what is enabled in %xcr0 out there. 756 * 757 * Unlike signal handling, we are given the buffer to place everything that 758 * needs to be written out. This is partially the design of the /proc APIs. That 759 * is, we will always assemble everything into the entire buffer that /proc asks 760 * us to, and then it will use as much or as little of it as is required. 761 * Similarly, when setting things, we don't have to worry about copying in 762 * information in the same way as signal handling does, because /proc takes care 763 * of it and always hands us a full buffer. Sizing that is a little nuanced, but 764 * is all handled in prmachdep.c. 765 * 766 * When someone performs a read of the xregs and thus is asking us for the 767 * current state, there is a little bit of nuance that we need to deal with. 768 * The first, is whether or not the FPU is enabled and the second is if the FPU 769 * is enabled, whether a given component is noted as being in its initial state. 770 * This basically gives us three possible states for a given component: 771 * 772 * 1. FPU_EN is not set and FPU_VALID is not set. 
This means we need to take 773 * the illumos FPU default for an item. More on that in a moment. 774 * 2. The saved xsave state indicates that the bit for a given component is 775 * zero -- specifically the xsh_xstate_bv member of the struct xsave_state. 776 * In this case, we must take the CPU's default for an item. This is 777 * usually the same as illumos, but not always. 778 * 3. The saved xsave state indicates that a given component's state bit is 779 * valid. The simplest of our cases. We can just take what we have from the 780 * xsave state. 781 * 782 * The CPU's default state for most components other than the x87/SSE state is 783 * to have it be zeroed. This is what we treat as our default state as well. The 784 * primary difference is in the initialization of the x87/SSE state. The SYS V 785 * ABI requires that we enable a different floating point control word than the 786 * hardware default. This means that when we're dealing with case (1) for 787 * x87/SSE we have to be more careful than the other components. Thankfully for 788 * everything else this is just keeping it zeroed. 789 * 790 * A reasonable question would be why not just skip components that aren't 791 * marked as present. There are a few reasons we take a different approach and 792 * always include them. Both of these are to make lives simpler for consumers. 793 * In the first case, when someone is performing a read and wants to reassemble 794 * and answer the question of 'what is the value of %ymm0 or %zmm15', they have 795 * to combine multiple disparate parts. If one knows that the data we put into 796 * there is always valid and represents what is in hardware and doesn't have to 797 * keep track of what are the defaults in different circumstances, then that 798 * greatly simplifies consumers' lives. It also helps us for core files and other 799 * observability cases because the answer to what is the operating system's 800 * default may change over time.
801 * 802 * Similarly, including all the possible structures means that we have 803 * simplified writes. Writes are always setting the full state of a thread, 804 * meaning that if someone wants to modify only a single register they must do a 805 * read, modify, and write. By including everything that they might need, it 806 * makes it easier for consumers to do this and not have to cons up the whole 807 * structure on their own. 808 * 809 * When we're setting state, things change around a little bit. We have a few 810 * constraints that are laid out in proc(5). In particular, we require that the 811 * PRX_INFO_XSAVE component always be present to tell us which other components 812 * we expect to be here and which ones we don't. We also are much stricter about 813 * writes in several ways. Of all the components, the PRX_INFO_XCR is read-only 814 * and may not be modified by a calling process. In addition, when we have 815 * 32-bit applications which have reserved registers in the %ymm, %zmm, etc. 816 * components, if they are being written to and have modifications, then we will 817 * indicate an error there. 818 * 819 * Because we are given the entire buffer from userland and don't need to have 820 * an intermediate place to copy it in, we will validate the entire thing in 821 * advance. Once it has been validated and we consider it legal, then we will 822 * translate each entry into its corresponding entry in pcb's normal floating 823 * point state. This is different from signal handling mostly because of the 824 * fact that we are not using copyin, and once we get to this point, there is 825 * no more validation, so we don't have the same concerns around blocking while 826 * pre-emption is disabled. 
827 * 828 * The Wrinkle with fpregs 829 * ----------------------- 830 * 831 * When we instead turn our attention to the fpregs, whether we're gathering 832 * them as part of the ucontext_t or as part of /proc, there are a few 833 * complications that we need to be aware of when we're operating on a kernel 834 * that is using xsave as the save mechanism. When we're using fxsave as the 835 * save mechanism, the CPU will always save the entire 512-byte fxsave region. 836 * The fpregs ABI that the kernel expects is basically this structure itself, 837 * which is transformed into a 32-bit compatible form in archdep.c. 838 * 839 * But xsave makes this much more complex and has historically been a source of 840 * bugs in the system. In particular, unlike fxsave, xsave has its component bit 841 * vector that is written out to indicate validity. This means that blindly 842 * copying the fxsave area without checking those bits will lead us to do the 843 * wrong thing. The XMM state flag mostly covers the 16 128-bit %xmm registers, 844 * while the x87 legacy fp flag covers the rest of the state. This is all good, 845 * aside from the MXCSR. 846 * 847 * One of the more complicated pieces of xsave state management is correctly 848 * answering the question of when the MXCSR is written out to xsave_state. In 849 * practice, this is rather convoluted and varies. If either the XMM or AVX 850 * feature bits are set then the CPU will write out the MXCSR and its mask 851 * register into the traditional fxsave state region. This behavior is dependent 852 * on the type of save function that we use. xsave and xsaveopt will look at the 853 * AVX feature bit; however, xsavec does not and only considers the SSE feature 854 * bit. This means that when we're retrieving things, we need to check both of 855 * those bits to determine if we should use the initial state or the value 856 * written out.
857 * 858 * When we come to someone trying to set the fpregs through /proc, the main 859 * question we have is what happens to the extended registers. We have opted to 860 * implement and document it such that a write to the fpregs only impacts the 861 * fpregs. Put differently, we will save the FPU state with fp_save() ahead of 862 * copying the data into the save area, set the state bits for x87 and XMM 863 * state, and then set the FPU to be restored. All in all, this basically means 864 * that writing to fpregs does not touch any of the %ymm, %zmm, or other state 865 * that we might have present. 866 * 867 * Forward Looking: Adding Intel AMX Support 868 * ----------------------------------------- 869 * 870 * Nothing can stop the march of features being added into the FPU. One of the 871 * larger chunks that we will need to wrangle with is Intel's Advanced Matrix 872 * Extensions (AMX), which add a large chunk of xsave state to each process. 873 * While things like AVX and AVX-512 have been enabled by default, the broader 874 * OS community has not been wanting to do this for AMX, because of the size of 875 * the state which exceeds 8 KiB. While the signal handling state went out of 876 * its way to minimize the size it wrote to the stack, if this is used, it would 877 * need to be preserved. 878 * 879 * To deal with this reality and the fact that folks don't really want to 880 * enable it by default for all purposes when its use will be quite special 881 * purpose, Intel has also added a MSR around extended feature disable or xfd. 882 * This is what we represent in the PRX_INFO_XCR prx_xfd member.
Our starting 883 * assumption, and the reason that so much of the /proc and signal logic ensures 884 * that we have the thread and process around, taking as an example the unused 885 * process argument in fpu_proc_xregs_info(), is that we will follow suit and 886 * default to having support disabled, but that a process will be able to opt 887 * into it, which will result in several different assumptions around signal 888 * stack sizing and cause us to reallocate and extend the pcb's FPU save state. 889 * 890 * The following is a list of items to pay attention to for future folks who 891 * work on this: 892 * 893 * o We will want to confirm whether other systems have opted to make this 894 * process-wide or thread-wide. Assuming process-wide, we will need to do a 895 * hold of all lwps while making a change. The interface for that probably 896 * doesn't want to be /proc, as a process probably doesn't want to write to 897 * its own control file. Changing it for another process could be done 898 * through the agent-lwp. 899 * o Opting into this should probably be a one-way street. 900 * o Opting into this will need to evaluate all threads and in particular 901 * stack sizes to confirm they adhere to the new minimum. 902 * o We will need to make sure that setting and clearing the xfd MSR is part 903 * of the FPU context ops and something we set by default on every CPU. 904 * o We will need to add a new interface to allow opting into this feature. 905 * o We will need to ensure that all subsequently created signal stacks adhere 906 * to a required minimum size that we communicate through libc. 907 * o We will need to make sure that both rtld and libc no longer rely on a 908 * static value of the AT_SUN_FPSIZE, but rather realize that this can be 909 * dynamic. At that time, we should evaluate if we can get away with not 910 * needing to save this for rtld, even though signal handlers should assume 911 * they will. 
* o The various components (because there is more than one) will want to be
 *   added to the fpu_xsave_info[]. Consulting the process's xfd will be
 *   required and probably require logic changes.
 *
 * The above is not exhaustive. We'll probably have some other issues and fun
 * while doing this.
 */

/*
 * The kind of FPU we advertise to rtld so it knows what to do when working
 * through the PLT. The compiled-in default is the fxsave flavour.
 * NOTE(review): presumably adjusted at boot when xsave is selected -- confirm
 * against fpu_probe.
 */
int fp_elf = AT_386_FPINFO_FXSAVE;

/*
 * Mechanism to save FPU state. The save, restore, fork and dup paths below
 * switch on this value (FP_FXSAVE or FP_XSAVE) and panic on anything else.
 */
int fp_save_mech = FP_FXSAVE;

/*
 * Alignment, in bytes, associated with an fxsave area.
 *
 * See section 10.5.1 in the Intel 64 and IA-32 Architectures Software
 * Developer's Manual, Volume 1.
 */
#define	FXSAVE_ALIGN	16

/*
 * Alignment, in bytes, associated with an xsave area.
 *
 * See section 13.4 in the Intel 64 and IA-32 Architectures Software
 * Developer's Manual, Volume 1.
 */
#define	XSAVE_ALIGN	64

/*
 * kmem cache that backs both the per-lwp FPU save area and the per-lwp signal
 * scratch buffer (fpu_signal); see fp_lwp_init() and fp_lwp_cleanup().
 */
kmem_cache_t *fpsave_cachep;

/* Legacy fxsave layout + xsave header + ymm */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)

/*
 * Various sanity checks. These are compile-time assertions on the layout of
 * the save structures: the fxsave image must be exactly 512 bytes, the fnsave
 * image exactly 108 bytes, the first %xmm slot must sit on a 16-byte boundary
 * within the fxsave image, and struct xsave_state must be at least large
 * enough to hold the AVX layout described by AVX_XSAVE_SIZE.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);

/*
 * Basic architectural alignment information (in bytes) for the %xmm, %ymm and
 * %zmm register classes.
 */
#define	FPU_ALIGN_XMM	16
#define	FPU_ALIGN_YMM	32
#define	FPU_ALIGN_ZMM	64

/*
 * This structure is the x86 implementation of the kernel FPU that is defined in
 * uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
	/*
	 * This indicates that the save state has initial FPU data.
	 */
	KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
	/* FPU context; the same type that backs an lwp's pcb_fpu. */
	fpu_ctx_t	kfpu_ctx;
	/* Bitwise OR of kfpu_flags_t values. */
	kfpu_flags_t	kfpu_flags;
	/* NOTE(review): presumably the thread currently using this state. */
	kthread_t	*kfpu_curthread;
};

/*
 * Initial kfpu state for SSE/SSE2 used by fpinit()
 *
 * The initializers are positional, so they must stay in the declaration order
 * of struct fxsave_state; the trailing comments name the field being set.
 */
const struct fxsave_state sse_initial = {
	FPU_CW_INIT,	/* fx_fcw */
	0,		/* fx_fsw */
	0,		/* fx_fctw */
	0,		/* fx_fop */
	0,		/* fx_rip */
	0,		/* fx_rdp */
	SSE_MXCSR_INIT	/* fx_mxcsr */
	/* rest of structure is zero */
};

/*
 * Initial kfpu state for AVX used by fpinit()
 */
const struct xsave_state avx_initial = {
	/*
	 * The definition below needs to be identical with sse_initial
	 * defined above.
	 */
	.xs_fxsave = {
		.fx_fcw = FPU_CW_INIT,
		.fx_mxcsr = SSE_MXCSR_INIT,
	},
	.xs_header = {
		/*
		 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are
		 * valid, and CPU should initialize XMM/YMM.
		 */
		.xsh_xstate_bv = 1,
		.xsh_xcomp_bv = 0,
	},
};

/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #gp exception caused by setting unsupported bits in the
 * MXCSR register
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
 * have an XSAVE-capable chip in fpu_probe.
 *
 * Both pointers are read by fp_ctxop_allocate() each time it builds a ctxop
 * template, where they become the ct_save and ct_restore callbacks.
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
1033 */ 1034 void (*xsavep)(struct xsave_state *, uint64_t) = xsave; 1035 1036 static int fpe_sicode(uint_t); 1037 static int fpe_simd_sicode(uint_t); 1038 static void fp_new_lwp(void *, void *); 1039 static void fp_free_ctx(void *, int); 1040 1041 static struct ctxop * 1042 fp_ctxop_allocate(struct fpu_ctx *fp) 1043 { 1044 const struct ctxop_template tpl = { 1045 .ct_rev = CTXOP_TPL_REV, 1046 .ct_save = fpsave_ctxt, 1047 .ct_restore = fprestore_ctxt, 1048 .ct_fork = fp_new_lwp, 1049 .ct_lwp_create = fp_new_lwp, 1050 .ct_free = fp_free_ctx, 1051 }; 1052 return (ctxop_allocate(&tpl, fp)); 1053 } 1054 1055 /* 1056 * Copy the state of parent lwp's floating point context into the new lwp. 1057 * Invoked for both fork() and lwp_create(). 1058 * 1059 * Note that we inherit -only- the control state (e.g. exception masks, 1060 * rounding, precision control, etc.); the FPU registers are otherwise 1061 * reset to their initial state. 1062 */ 1063 static void 1064 fp_new_lwp(void *parent, void *child) 1065 { 1066 kthread_id_t t = parent, ct = child; 1067 struct fpu_ctx *fp; /* parent fpu context */ 1068 struct fpu_ctx *cfp; /* new fpu context */ 1069 struct fxsave_state *fx, *cfx; 1070 struct xsave_state *cxs; 1071 1072 ASSERT(fp_kind != FP_NO); 1073 1074 fp = &t->t_lwp->lwp_pcb.pcb_fpu; 1075 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu; 1076 1077 /* 1078 * If the parent FPU state is still in the FPU hw then save it; 1079 * conveniently, fp_save() already does this for us nicely. 1080 */ 1081 fp_save(fp); 1082 1083 cfp->fpu_flags = FPU_EN | FPU_VALID; 1084 cfp->fpu_regs.kfpu_status = 0; 1085 cfp->fpu_regs.kfpu_xstatus = 0; 1086 1087 /* 1088 * Make sure that the child's FPU is cleaned up and made ready for user 1089 * land. 
1090 */ 1091 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb); 1092 1093 switch (fp_save_mech) { 1094 case FP_FXSAVE: 1095 fx = fp->fpu_regs.kfpu_u.kfpu_fx; 1096 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx; 1097 bcopy(&sse_initial, cfx, sizeof (*cfx)); 1098 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; 1099 cfx->fx_fcw = fx->fx_fcw; 1100 break; 1101 1102 case FP_XSAVE: 1103 cfp->fpu_xsave_mask = fp->fpu_xsave_mask; 1104 1105 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL); 1106 1107 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; 1108 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs; 1109 cfx = &cxs->xs_fxsave; 1110 1111 bcopy(&avx_initial, cxs, sizeof (*cxs)); 1112 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; 1113 cfx->fx_fcw = fx->fx_fcw; 1114 cxs->xs_header.xsh_xstate_bv |= 1115 (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL); 1116 break; 1117 default: 1118 panic("Invalid fp_save_mech"); 1119 /*NOTREACHED*/ 1120 } 1121 1122 /* 1123 * Mark that both the parent and child need to have the FPU cleaned up 1124 * before returning to userland. 1125 */ 1126 1127 ctxop_attach(ct, fp_ctxop_allocate(cfp)); 1128 } 1129 1130 /* 1131 * Free any state associated with floating point context. 
1132 * Fp_free can be called in three cases: 1133 * 1) from reaper -> thread_free -> freectx-> fp_free 1134 * fp context belongs to a thread on deathrow 1135 * nothing to do, thread will never be resumed 1136 * thread calling ctxfree is reaper 1137 * 1138 * 2) from exec -> freectx -> fp_free 1139 * fp context belongs to the current thread 1140 * must disable fpu, thread calling ctxfree is curthread 1141 * 1142 * 3) from restorecontext -> setfpregs -> fp_free 1143 * we have a modified context in the memory (lwp->pcb_fpu) 1144 * disable fpu and release the fp context for the CPU 1145 * 1146 */ 1147 void 1148 fp_free(struct fpu_ctx *fp) 1149 { 1150 ASSERT(fp_kind != FP_NO); 1151 1152 if (fp->fpu_flags & FPU_VALID) 1153 return; 1154 1155 kpreempt_disable(); 1156 /* 1157 * We want to do fpsave rather than fpdisable so that we can 1158 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit 1159 */ 1160 fp->fpu_flags |= FPU_VALID; 1161 /* If for current thread disable FP to track FPU_VALID */ 1162 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) { 1163 /* Clear errors if any to prevent frstor from complaining */ 1164 (void) fperr_reset(); 1165 if (fp_kind & __FP_SSE) 1166 (void) fpxerr_reset(); 1167 fpdisable(); 1168 } 1169 kpreempt_enable(); 1170 } 1171 1172 /* 1173 * Wrapper for freectx to make the types line up for fp_free() 1174 */ 1175 static void 1176 fp_free_ctx(void *arg, int isexec __unused) 1177 { 1178 fp_free((struct fpu_ctx *)arg); 1179 } 1180 1181 /* 1182 * Store the floating point state and disable the floating point unit. 
1183 */ 1184 void 1185 fp_save(struct fpu_ctx *fp) 1186 { 1187 ASSERT(fp_kind != FP_NO); 1188 1189 kpreempt_disable(); 1190 if (!fp || fp->fpu_flags & FPU_VALID || 1191 (fp->fpu_flags & FPU_EN) == 0) { 1192 kpreempt_enable(); 1193 return; 1194 } 1195 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu); 1196 1197 switch (fp_save_mech) { 1198 case FP_FXSAVE: 1199 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx); 1200 break; 1201 1202 case FP_XSAVE: 1203 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); 1204 break; 1205 default: 1206 panic("Invalid fp_save_mech"); 1207 /*NOTREACHED*/ 1208 } 1209 1210 fp->fpu_flags |= FPU_VALID; 1211 1212 /* 1213 * We save the FPU as part of forking, execing, modifications via /proc, 1214 * restorecontext, etc. As such, we need to make sure that we return to 1215 * userland with valid state in the FPU. If we're context switched out 1216 * before we hit sys_rtt_common() we'll end up having restored the FPU 1217 * as part of the context ops operations. The restore logic always makes 1218 * sure that FPU_VALID is set before doing a restore so we don't restore 1219 * it a second time. 1220 */ 1221 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb); 1222 1223 kpreempt_enable(); 1224 } 1225 1226 /* 1227 * Restore the FPU context for the thread: 1228 * The possibilities are: 1229 * 1. No active FPU context: Load the new context into the FPU hw 1230 * and enable the FPU. 1231 */ 1232 void 1233 fp_restore(struct fpu_ctx *fp) 1234 { 1235 switch (fp_save_mech) { 1236 case FP_FXSAVE: 1237 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx); 1238 break; 1239 1240 case FP_XSAVE: 1241 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); 1242 break; 1243 default: 1244 panic("Invalid fp_save_mech"); 1245 /*NOTREACHED*/ 1246 } 1247 1248 fp->fpu_flags &= ~FPU_VALID; 1249 } 1250 1251 /* 1252 * Reset the FPU such that it is in a valid state for a new thread that is 1253 * coming out of exec. The FPU will be in a usable state at this point. 
* At this point we know that the FPU state has already been allocated and if
 * this wasn't an init process, then it will have had fp_free() previously
 * called.
 *
 * The sequence is: pick the xsave component mask (xsave only), allocate the
 * FPU context ops, and then -- with preemption disabled -- attach them to
 * curthread, initialize the hardware with fpinit(), and set fpu_flags to
 * exactly FPU_EN, which discards every other flag bit.
 */
void
fp_exec(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	/* Under xsave, start out saving every component in XFEATURE_FP_ALL. */
	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	/*
	 * The context ops are allocated before preemption is disabled.
	 * NOTE(review): presumably because the allocation may block -- confirm.
	 */
	struct ctxop *ctx = fp_ctxop_allocate(fp);
	/*
	 * Make sure that we're not preempted in the middle of initializing the
	 * FPU on CPU.
	 */
	kpreempt_disable();
	ctxop_attach(curthread, ctx);
	fpinit();
	fp->fpu_flags = FPU_EN;
	kpreempt_enable();
}


/*
 * Seeds the initial state for the current thread. The possibilities are:
 * 1. Another process has modified the FPU state before we have done any
 *    initialization: Load the FPU state from the LWP state.
 * 2. The FPU state has not been externally modified: Load a clean state.
 *
 * The caller must already have preemption disabled and the lwp's FPU must not
 * have been enabled yet; both conditions are asserted on entry.
 */
void
fp_seed(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(curthread->t_preempt >= 1);
	ASSERT((fp->fpu_flags & FPU_EN) == 0);

	/*
	 * Always initialize a new context and initialize the hardware.
	 */
	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	ctxop_attach(curthread, fp_ctxop_allocate(fp));
	fpinit();

	/*
	 * If FPU_VALID is set, it means someone has modified registers via
	 * /proc. In this case, restore the current lwp's state.
	 */
	if (fp->fpu_flags & FPU_VALID)
		fp_restore(fp);

	/* fp_restore() clears FPU_VALID, so it is clear on either path. */
	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
	fp->fpu_flags = FPU_EN;
}

/*
 * When using xsave/xrstor, these three functions are used by the lwp code to
 * manage the memory for the xsave area.
1317 */ 1318 void 1319 fp_lwp_init(klwp_t *lwp) 1320 { 1321 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; 1322 1323 /* 1324 * We keep a copy of the pointer in lwp_fpu so that we can restore the 1325 * value in forklwp() after we duplicate the parent's LWP state. 1326 */ 1327 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = 1328 kmem_cache_alloc(fpsave_cachep, KM_SLEEP); 1329 fp->fpu_signal = NULL; 1330 1331 if (fp_save_mech == FP_XSAVE) { 1332 /* 1333 * 1334 * We bzero since the fpinit() code path will only 1335 * partially initialize the xsave area using avx_inital. 1336 */ 1337 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state)); 1338 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size()); 1339 } 1340 } 1341 1342 void 1343 fp_lwp_cleanup(klwp_t *lwp) 1344 { 1345 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; 1346 1347 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) { 1348 kmem_cache_free(fpsave_cachep, 1349 fp->fpu_regs.kfpu_u.kfpu_generic); 1350 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL; 1351 } 1352 1353 if (fp->fpu_signal != NULL) { 1354 kmem_cache_free(fpsave_cachep, fp->fpu_signal); 1355 fp->fpu_signal = NULL; 1356 } 1357 } 1358 1359 /* 1360 * Called during the process of forklwp(). The kfpu_u pointer will have been 1361 * overwritten while copying the parent's LWP structure. We have a valid copy 1362 * stashed in the child's lwp_fpu which we use to restore the correct value. 
1363 */ 1364 void 1365 fp_lwp_dup(klwp_t *lwp) 1366 { 1367 void *xp = lwp->lwp_fpu; 1368 size_t sz; 1369 1370 switch (fp_save_mech) { 1371 case FP_FXSAVE: 1372 sz = sizeof (struct fxsave_state); 1373 break; 1374 case FP_XSAVE: 1375 sz = cpuid_get_xsave_size(); 1376 break; 1377 default: 1378 panic("Invalid fp_save_mech"); 1379 /*NOTREACHED*/ 1380 } 1381 1382 /* copy the parent's values into the new lwp's struct */ 1383 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz); 1384 /* now restore the pointer */ 1385 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp; 1386 /* Ensure that we don't inherit our parent's signal state */ 1387 lwp->lwp_pcb.pcb_fpu.fpu_signal = NULL; 1388 } 1389 1390 /* 1391 * Handle a processor extension error fault 1392 * Returns non zero for error. 1393 */ 1394 1395 /*ARGSUSED*/ 1396 int 1397 fpexterrflt(struct regs *rp) 1398 { 1399 uint32_t fpcw, fpsw; 1400 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1401 1402 ASSERT(fp_kind != FP_NO); 1403 1404 /* 1405 * Now we can enable the interrupts. 1406 * (NOTE: x87 fp exceptions come thru interrupt gate) 1407 */ 1408 sti(); 1409 1410 if (!fpu_exists) 1411 return (FPE_FLTINV); 1412 1413 /* 1414 * Do an unconditional save of the FP state. If it's dirty (TS=0), 1415 * it'll be saved into the fpu context area passed in (that of the 1416 * current thread). If it's not dirty (it may not be, due to 1417 * an intervening save due to a context switch between the sti(), 1418 * above and here, then it's safe to just use the stored values in 1419 * the context save area to determine the cause of the fault. 
1420 */ 1421 fp_save(fp); 1422 1423 /* clear exception flags in saved state, as if by fnclex */ 1424 switch (fp_save_mech) { 1425 case FP_FXSAVE: 1426 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; 1427 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw; 1428 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS; 1429 break; 1430 1431 case FP_XSAVE: 1432 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; 1433 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw; 1434 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS; 1435 /* 1436 * Always set LEGACY_FP as it may have been cleared by XSAVE 1437 * instruction 1438 */ 1439 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= 1440 XFEATURE_LEGACY_FP; 1441 break; 1442 default: 1443 panic("Invalid fp_save_mech"); 1444 /*NOTREACHED*/ 1445 } 1446 1447 fp->fpu_regs.kfpu_status = fpsw; 1448 1449 if ((fpsw & FPS_ES) == 0) 1450 return (0); /* No exception */ 1451 1452 /* 1453 * "and" the exception flags with the complement of the mask 1454 * bits to determine which exception occurred 1455 */ 1456 return (fpe_sicode(fpsw & ~fpcw & 0x3f)); 1457 } 1458 1459 /* 1460 * Handle an SSE/SSE2 precise exception. 1461 * Returns a non-zero sicode for error. 1462 */ 1463 /*ARGSUSED*/ 1464 int 1465 fpsimderrflt(struct regs *rp) 1466 { 1467 uint32_t mxcsr, xmask; 1468 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1469 1470 ASSERT(fp_kind & __FP_SSE); 1471 1472 /* 1473 * NOTE: Interrupts are disabled during execution of this 1474 * function. They are enabled by the caller in trap.c. 1475 */ 1476 1477 /* 1478 * The only way we could have gotten here if there is no FP unit 1479 * is via a user executing an INT $19 instruction, so there is 1480 * no fault in that case. 1481 */ 1482 if (!fpu_exists) 1483 return (0); 1484 1485 /* 1486 * Do an unconditional save of the FP state. If it's dirty (TS=0), 1487 * it'll be saved into the fpu context area passed in (that of the 1488 * current thread). 
If it's not dirty, then it's safe to just use 1489 * the stored values in the context save area to determine the 1490 * cause of the fault. 1491 */ 1492 fp_save(fp); /* save the FPU state */ 1493 1494 if (fp_save_mech == FP_XSAVE) { 1495 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr; 1496 fp->fpu_regs.kfpu_status = 1497 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; 1498 } else { 1499 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr; 1500 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; 1501 } 1502 fp->fpu_regs.kfpu_xstatus = mxcsr; 1503 1504 /* 1505 * compute the mask that determines which conditions can cause 1506 * a #xm exception, and use this to clean the status bits so that 1507 * we can identify the true cause of this one. 1508 */ 1509 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS; 1510 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask)); 1511 } 1512 1513 /* 1514 * In the unlikely event that someone is relying on this subcode being 1515 * FPE_FLTILL for denormalize exceptions, it can always be patched back 1516 * again to restore old behaviour. 1517 */ 1518 int fpe_fltden = FPE_FLTDEN; 1519 1520 /* 1521 * Map from the FPU status word to the FP exception si_code. 1522 */ 1523 static int 1524 fpe_sicode(uint_t sw) 1525 { 1526 if (sw & FPS_IE) 1527 return (FPE_FLTINV); 1528 if (sw & FPS_ZE) 1529 return (FPE_FLTDIV); 1530 if (sw & FPS_DE) 1531 return (fpe_fltden); 1532 if (sw & FPS_OE) 1533 return (FPE_FLTOVF); 1534 if (sw & FPS_UE) 1535 return (FPE_FLTUND); 1536 if (sw & FPS_PE) 1537 return (FPE_FLTRES); 1538 return (FPE_FLTINV); /* default si_code for other exceptions */ 1539 } 1540 1541 /* 1542 * Map from the SSE status word to the FP exception si_code. 
1543 */ 1544 static int 1545 fpe_simd_sicode(uint_t sw) 1546 { 1547 if (sw & SSE_IE) 1548 return (FPE_FLTINV); 1549 if (sw & SSE_ZE) 1550 return (FPE_FLTDIV); 1551 if (sw & SSE_DE) 1552 return (FPE_FLTDEN); 1553 if (sw & SSE_OE) 1554 return (FPE_FLTOVF); 1555 if (sw & SSE_UE) 1556 return (FPE_FLTUND); 1557 if (sw & SSE_PE) 1558 return (FPE_FLTRES); 1559 return (FPE_FLTINV); /* default si_code for other exceptions */ 1560 } 1561 1562 /* 1563 * This routine is invoked as part of libc's __fpstart implementation 1564 * via sysi86(2). 1565 * 1566 * It may be called -before- any context has been assigned in which case 1567 * we try and avoid touching the hardware. Or it may be invoked well 1568 * after the context has been assigned and fiddled with, in which case 1569 * just tweak it directly. 1570 */ 1571 void 1572 fpsetcw(uint16_t fcw, uint32_t mxcsr) 1573 { 1574 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1575 struct fxsave_state *fx; 1576 1577 if (!fpu_exists || fp_kind == FP_NO) 1578 return; 1579 1580 if ((fp->fpu_flags & FPU_EN) == 0) { 1581 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) { 1582 /* 1583 * Common case. Floating point unit not yet 1584 * enabled, and kernel already intends to initialize 1585 * the hardware the way the caller wants. 1586 */ 1587 return; 1588 } 1589 /* 1590 * Hmm. Userland wants a different default. 1591 * Do a fake "first trap" to establish the context, then 1592 * handle as if we already had a context before we came in. 1593 */ 1594 kpreempt_disable(); 1595 fp_seed(); 1596 kpreempt_enable(); 1597 } 1598 1599 /* 1600 * Ensure that the current hardware state is flushed back to the 1601 * pcb, then modify that copy. Next use of the fp will 1602 * restore the context. 
1603 */ 1604 fp_save(fp); 1605 1606 switch (fp_save_mech) { 1607 case FP_FXSAVE: 1608 fx = fp->fpu_regs.kfpu_u.kfpu_fx; 1609 fx->fx_fcw = fcw; 1610 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; 1611 break; 1612 1613 case FP_XSAVE: 1614 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; 1615 fx->fx_fcw = fcw; 1616 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; 1617 /* 1618 * Always set LEGACY_FP as it may have been cleared by XSAVE 1619 * instruction 1620 */ 1621 fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= 1622 XFEATURE_LEGACY_FP; 1623 break; 1624 default: 1625 panic("Invalid fp_save_mech"); 1626 /*NOTREACHED*/ 1627 } 1628 } 1629 1630 static void 1631 kernel_fpu_fpstate_init(kfpu_state_t *kfpu) 1632 { 1633 struct xsave_state *xs; 1634 1635 switch (fp_save_mech) { 1636 case FP_FXSAVE: 1637 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx, 1638 sizeof (struct fxsave_state)); 1639 kfpu->kfpu_ctx.fpu_xsave_mask = 0; 1640 break; 1641 case FP_XSAVE: 1642 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs; 1643 bzero(xs, cpuid_get_xsave_size()); 1644 bcopy(&avx_initial, xs, sizeof (*xs)); 1645 xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; 1646 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL; 1647 break; 1648 default: 1649 panic("invalid fp_save_mech"); 1650 } 1651 1652 /* 1653 * Set the corresponding flags that the system expects on the FPU state 1654 * to indicate that this is our state. The FPU_EN flag is required to 1655 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly 1656 * not set below as it represents that this state is being suppressed 1657 * by the kernel. 
1658 */ 1659 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID; 1660 kfpu->kfpu_flags |= KFPU_F_INITIALIZED; 1661 } 1662 1663 kfpu_state_t * 1664 kernel_fpu_alloc(int kmflags) 1665 { 1666 kfpu_state_t *kfpu; 1667 1668 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) { 1669 return (NULL); 1670 } 1671 1672 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic = 1673 kmem_cache_alloc(fpsave_cachep, kmflags); 1674 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) { 1675 kmem_free(kfpu, sizeof (kfpu_state_t)); 1676 return (NULL); 1677 } 1678 1679 kernel_fpu_fpstate_init(kfpu); 1680 1681 return (kfpu); 1682 } 1683 1684 void 1685 kernel_fpu_free(kfpu_state_t *kfpu) 1686 { 1687 kmem_cache_free(fpsave_cachep, 1688 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic); 1689 kmem_free(kfpu, sizeof (kfpu_state_t)); 1690 } 1691 1692 static void 1693 kernel_fpu_ctx_save(void *arg) 1694 { 1695 kfpu_state_t *kfpu = arg; 1696 fpu_ctx_t *pf; 1697 1698 if (kfpu == NULL) { 1699 /* 1700 * A NULL kfpu implies this is a kernel thread with an LWP and 1701 * no user-level FPU usage. Use the lwp fpu save area. 1702 */ 1703 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1704 1705 ASSERT(curthread->t_procp->p_flag & SSYS); 1706 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); 1707 1708 fp_save(pf); 1709 } else { 1710 pf = &kfpu->kfpu_ctx; 1711 1712 ASSERT3P(kfpu->kfpu_curthread, ==, curthread); 1713 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); 1714 1715 /* 1716 * Note, we can't use fp_save because it assumes that we're 1717 * saving to the thread's PCB and not somewhere else. Because 1718 * this is a different FPU context, we instead have to do this 1719 * ourselves. 
1720 */ 1721 switch (fp_save_mech) { 1722 case FP_FXSAVE: 1723 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx); 1724 break; 1725 case FP_XSAVE: 1726 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask); 1727 break; 1728 default: 1729 panic("Invalid fp_save_mech"); 1730 } 1731 1732 /* 1733 * Because we have saved context here, our save state is no 1734 * longer valid and therefore needs to be reinitialized. 1735 */ 1736 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED; 1737 } 1738 1739 pf->fpu_flags |= FPU_VALID; 1740 1741 /* 1742 * Clear KFPU flag. This allows swtch to check for improper kernel 1743 * usage of the FPU (i.e. switching to a new thread while the old 1744 * thread was in the kernel and using the FPU, but did not perform a 1745 * context save). 1746 */ 1747 curthread->t_flag &= ~T_KFPU; 1748 } 1749 1750 static void 1751 kernel_fpu_ctx_restore(void *arg) 1752 { 1753 kfpu_state_t *kfpu = arg; 1754 fpu_ctx_t *pf; 1755 1756 if (kfpu == NULL) { 1757 /* 1758 * A NULL kfpu implies this is a kernel thread with an LWP and 1759 * no user-level FPU usage. Use the lwp fpu save area. 1760 */ 1761 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1762 1763 ASSERT(curthread->t_procp->p_flag & SSYS); 1764 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); 1765 } else { 1766 pf = &kfpu->kfpu_ctx; 1767 1768 ASSERT3P(kfpu->kfpu_curthread, ==, curthread); 1769 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); 1770 } 1771 1772 fp_restore(pf); 1773 curthread->t_flag |= T_KFPU; 1774 } 1775 1776 /* 1777 * Validate that the thread is not switching off-cpu while actively using the 1778 * FPU within the kernel. 
 */
void
kernel_fpu_no_swtch(void)
{
	/* T_KFPU is cleared by the kernel FPU ctxop save handler. */
	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread swtch-ing while the kernel is using the FPU");
	}
}

/*
 * Context operations installed on a thread while it uses the FPU in the
 * kernel with saved state (see kernel_fpu_begin()).
 */
static const struct ctxop_template kfpu_ctxop_tpl = {
	.ct_rev		= CTXOP_TPL_REV,
	.ct_save	= kernel_fpu_ctx_save,
	.ct_restore	= kernel_fpu_ctx_restore,
};

/*
 * Begin a region of kernel FPU usage by the current thread. Nesting is not
 * permitted: the call panics if T_KFPU is already set on the thread.
 *
 * 'flags' selects where the kernel's FPU context lives across a context
 * switch:
 *
 *   o KFPU_NO_STATE: there is nowhere to save context. The caller must have
 *     preemption disabled already and 'kfpu' must be NULL.
 *   o KFPU_USE_LWP: the LWP's pcb_fpu is used as the save area. The thread
 *     must belong to an SSYS process and 'kfpu' must be NULL.
 *   o Neither: 'kfpu' is the save area; it must not be in use by a thread.
 *
 * KFPU_USE_LWP and KFPU_NO_STATE may not be combined.
 */
void
kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
{
	klwp_t *pl = curthread->t_lwp;
	struct ctxop *ctx;

	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread attempting to nest kernel FPU states");
	}

	/* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
	ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
	    (KFPU_USE_LWP | KFPU_NO_STATE));

	if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
		/*
		 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
		 * hold our kernel FPU context, we depend on the caller doing
		 * kpreempt_disable for the duration of our FPU usage. This
		 * should only be done for very short periods of time.
		 */
		ASSERT(curthread->t_preempt > 0);
		ASSERT(kfpu == NULL);

		if (pl != NULL) {
			/*
			 * We might have already saved once so FPU_VALID could
			 * be set. This is handled in fp_save.
			 */
			fp_save(&pl->lwp_pcb.pcb_fpu);
			pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
		}

		curthread->t_flag |= T_KFPU;

		/* Always restore the fpu to the initial state. */
		fpinit();

		return;
	}

	/*
	 * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
	 */

	if ((flags & KFPU_USE_LWP) == 0) {
		/* Claim the caller-supplied state for this thread. */
		if (kfpu->kfpu_curthread != NULL)
			panic("attempting to reuse kernel FPU state at %p when "
			    "another thread already is using", kfpu);

		if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
			kernel_fpu_fpstate_init(kfpu);

		kfpu->kfpu_curthread = curthread;
	}

	/*
	 * Not all threads may have an active LWP. If they do and we're not
	 * going to re-use the LWP, then we should go ahead and save the state.
	 * We must also note that the fpu is now being used by the kernel and
	 * therefore we do not want to manage the fpu state via the user-level
	 * thread's context handlers.
	 *
	 * We might have already saved once (due to a prior use of the kernel
	 * FPU or another code path) so FPU_VALID could be set. This is handled
	 * by fp_save, as is the FPU_EN check.
	 */
	ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu);
	kpreempt_disable();
	if (pl != NULL) {
		if ((flags & KFPU_USE_LWP) == 0)
			fp_save(&pl->lwp_pcb.pcb_fpu);
		pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
	}

	/*
	 * Set the context operations for kernel FPU usage. Because kernel FPU
	 * setup and ctxop attachment needs to happen under the protection of
	 * kpreempt_disable(), we allocate the ctxop outside the guard so its
	 * sleeping allocation will not cause a voluntary swtch(). This allows
	 * the rest of the initialization to proceed, ensuring valid state for
	 * the ctxop handlers.
	 */
	ctxop_attach(curthread, ctx);
	curthread->t_flag |= T_KFPU;

	if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
		/*
		 * For pure kernel threads with an LWP, we can use the LWP's
		 * pcb_fpu to save/restore context.
		 */
		fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;

		VERIFY(curthread->t_procp->p_flag & SSYS);
		VERIFY(kfpu == NULL);
		ASSERT((pf->fpu_flags & FPU_EN) == 0);

		/* Always restore the fpu to the initial state. */
		if (fp_save_mech == FP_XSAVE)
			pf->fpu_xsave_mask = XFEATURE_FP_ALL;
		fpinit();
		pf->fpu_flags = FPU_EN | FPU_KERNEL;
	} else {
		/*
		 * Load the kfpu state into the hardware. The restore handler
		 * also sets T_KFPU and clears FPU_VALID on the kfpu context.
		 */
		kernel_fpu_ctx_restore(kfpu);
	}
	kpreempt_enable();
}

/*
 * End a region of kernel FPU usage started with kernel_fpu_begin(). 'kfpu'
 * and 'flags' are interpreted the same way as in kernel_fpu_begin(). The call
 * panics if the thread does not have T_KFPU set, or if 'kfpu' is non-NULL but
 * is not marked as being used by the current thread.
 */
void
kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
{
	if ((curthread->t_flag & T_KFPU) == 0) {
		panic("curthread attempting to clear kernel FPU state "
		    "without using it");
	}

	/*
	 * General comments on why the rest of this function is structured the
	 * way it is. Be aware that there is a lot of subtlety here.
	 *
	 * If a user-level thread ever uses the fpu while in the kernel, then
	 * we cannot call fpdisable since that does STTS. That will set the
	 * ts bit in %cr0 which will cause an exception if anything touches the
	 * fpu. However, the user-level context switch handler (fpsave_ctxt)
	 * needs to access the fpu to save the registers into the pcb.
	 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in
	 * fprestore_ctxt when the thread context switched onto the CPU.
	 *
	 * Calling fpdisable only affects the current CPU's %cr0 register.
	 *
	 * During ctxop_remove and kpreempt_enable, we can voluntarily context
	 * switch, so the CPU we were on when we entered this function might
	 * not be the same one we're on when we return from ctxop_remove or end
	 * the function. Note there can be user-level context switch handlers
	 * still installed if this is a user-level thread.
	 *
	 * We also must be careful in the unlikely chance we're running in an
	 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
	 * incorrectly for the "real" thread to resume on this CPU.
	 */

	if ((flags & KFPU_NO_STATE) == 0) {
		kpreempt_disable();
	} else {
		/* The KFPU_NO_STATE caller keeps preemption disabled itself. */
		ASSERT(curthread->t_preempt > 0);
	}

	curthread->t_flag &= ~T_KFPU;

	/*
	 * When we are ending things, we explicitly don't save the current
	 * kernel FPU state back to the temporary state. The kfpu API is not
	 * intended to be a permanent save location.
	 *
	 * If this is a user-level thread and we were to context switch
	 * before returning to user-land, fpsave_ctxt will be a no-op since we
	 * already saved the user-level FPU state the first time we run
	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
	 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so
	 * fprestore_ctxt will be done in sys_rtt_common when the thread
	 * finally returns to user-land.
	 */

	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
	    curthread->t_intr == NULL) {
		/*
		 * A kernel thread which is not an interrupt thread, so we
		 * STTS now.
		 */
		fpdisable();
	}

	if ((flags & KFPU_NO_STATE) == 0) {
		ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);

		if (kfpu != NULL) {
			if (kfpu->kfpu_curthread != curthread) {
				panic("attempting to end kernel FPU state "
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				/* Release the state so it can be used again. */
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	if (curthread->t_lwp != NULL) {
		uint_t f;

		/*
		 * Clear the flags that kernel_fpu_begin() set on the LWP's
		 * FPU context: FPU_KERNEL always, and additionally FPU_EN when
		 * the LWP's pcb_fpu was used as the save area.
		 */
		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}

/*
 * Create the kmem cache from which FPU save areas are allocated. The object
 * size and alignment depend on the save mechanism in use; an unknown
 * mechanism is fatal.
 */
void
fpu_save_cache_init(void)
{
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpsave_cachep = kmem_cache_create("fxsave_cache",
		    sizeof (struct fxsave_state), FXSAVE_ALIGN,
		    NULL, NULL, NULL, NULL, NULL, 0);
		break;
	case FP_XSAVE:
		fpsave_cachep = kmem_cache_create("xsave_cache",
		    cpuid_get_xsave_size(), XSAVE_ALIGN,
		    NULL, NULL, NULL, NULL, NULL, 0);
		break;
	default:
		panic("Invalid fp_save_mech");
	}
}

/*
 * Fill in FPU information that is required by exec: '*typep' receives fp_elf
 * and '*lenp' the size of the save area for the mechanism in use (0 if the
 * mechanism is not recognized).
 */
void
fpu_auxv_info(int *typep, size_t *lenp)
{
	*typep = fp_elf;
	switch (fp_save_mech) {
	case FP_FXSAVE:
		*lenp = sizeof (struct fxsave_state);
		break;
	case FP_XSAVE:
		*lenp = cpuid_get_xsave_size();
		break;
	default:
		*lenp = 0;
		break;
	}
}

/*
 * This function exists to transform an xsave_state into an fxsave_state. The
 * way that we have to do this is nuanced. We assume that callers have already
 * handled FPU_EN and thus we only need to consider the xsave_state and its
 * component vector itself. This results in the following cases that we need to
 * consider:
 *
 *   o Neither the x87 / XMM state bits are set.
We use the hardware default and 2043 * need to ensure to copy the xsave header. 2044 * o Both x87 / XMM state bits are set. We can copy everything. 2045 * o Only the x87 bit is set. We need to copy the x87 state but make the XMM 2046 * state be in the initial case. 2047 * o Only the XMM bit is set. The reverse of the above case. 2048 * 2049 * The illumos and hardware defaults in 'sse_initial' and 'avx_initial' are 2050 * generally the same; however, the default floating point control word is 2051 * different. 2052 * 2053 * Finally, we have the complication of the MXCSR and MCXSR_MASK registers. 2054 * Because we are using xsave and xsaveopt in the kernel right now and not 2055 * xsavec, the hardware may write out the MXCSR and MXCSR_MASK registers if the 2056 * XFEATURE_AVX bit is set. Therefore if we don't have the XMM bit set but AVX 2057 * is set, we must also come back and copy out the MXCSR register. Sorry, we 2058 * don't make the rules. 2059 */ 2060 static void 2061 fpu_xsave_to_fxsave(const struct xsave_state *xsave, struct fxsave_state *fx) 2062 { 2063 const uint64_t comps = xsave->xs_header.xsh_xstate_bv; 2064 2065 switch (comps & (XFEATURE_LEGACY_FP | XFEATURE_SSE)) { 2066 case XFEATURE_LEGACY_FP | XFEATURE_SSE: 2067 bcopy(xsave, fx, sizeof (*fx)); 2068 return; 2069 case XFEATURE_LEGACY_FP: 2070 bcopy(xsave, fx, offsetof(struct fxsave_state, fx_xmm)); 2071 fx->fx_mxcsr = SSE_MXCSR_INIT; 2072 fx->fx_mxcsr_mask = 0; 2073 break; 2074 case XFEATURE_SSE: 2075 bcopy(&sse_initial, fx, offsetof(struct fxsave_state, 2076 fx_mxcsr)); 2077 2078 fx->fx_fcw = FPU_CW_INIT_HW; 2079 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr; 2080 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask; 2081 bcopy(xsave->xs_fxsave.fx_xmm, fx->fx_xmm, sizeof (fx->fx_xmm)); 2082 break; 2083 default: 2084 bcopy(&sse_initial, fx, sizeof (*fx)); 2085 fx->fx_fcw = FPU_CW_INIT_HW; 2086 break; 2087 } 2088 2089 /* 2090 * Account for the AVX causing MXCSR to be valid. 
2091 */ 2092 if ((xsave->xs_header.xsh_xstate_bv & XFEATURE_AVX) != 0 && 2093 (xsave->xs_header.xsh_xstate_bv & XFEATURE_SSE) == 0) { 2094 fx->fx_mxcsr = xsave->xs_fxsave.fx_mxcsr; 2095 fx->fx_mxcsr_mask = xsave->xs_fxsave.fx_mxcsr_mask; 2096 } 2097 } 2098 2099 /* 2100 * This function is designed to answer the question of are we using any xsave 2101 * family of instructions in context switch and therefore we have this state. 2102 * This should still remain true if we are using xsavec or xsaves in the kernel 2103 * in the future. 2104 */ 2105 boolean_t 2106 fpu_xsave_enabled(void) 2107 { 2108 return (fp_save_mech == FP_XSAVE); 2109 } 2110 2111 /* 2112 * The following structure is used to track and manage the programmatic 2113 * construction of /proc and signal stack spilling of xsave information. All 2114 * known xsave types that the kernel supports must be included here. 2115 */ 2116 typedef struct xsave_proc_info { 2117 /* 2118 * This matches the /proc xregs type that this data represents. This s 2119 * used for /proc only. 2120 */ 2121 uint32_t xi_type; 2122 /* 2123 * This indicates the size of the /proc data that we're operating on. 2124 * This is only used for /proc. 2125 */ 2126 size_t xi_size; 2127 /* 2128 * This indicates the alignment that we want to have for the member when 2129 * we're writing out. This is not used when setting data. This is only 2130 * used for /proc. 2131 */ 2132 size_t xi_align; 2133 /* 2134 * This indicates whether this member must always be considered or not. 2135 * This is used in both /proc and context/signal handling. 2136 */ 2137 bool xi_always; 2138 /* 2139 * This contains the corresponding bits in the xsave bit vector that 2140 * corresponds to this entry. This is used for both /proc and 2141 * context/signal handling. 2142 */ 2143 uint64_t xi_bits; 2144 /* 2145 * The xi_fill function pointer is used to write out the /proc regset 2146 * data (e.g. when a user reads xregs). This is only used for the /proc 2147 * handling. 
The xi_valid function pointer is used instead to validate a 2148 * given set of data that we've read in, while the xi_set pointer is 2149 * used to actually transform the data in the underlying fpu save area. 2150 */ 2151 void (*xi_fill)(const fpu_ctx_t *, const struct xsave_proc_info *, 2152 void *); 2153 bool (*xi_valid)(model_t, const void *); 2154 void (*xi_set)(fpu_ctx_t *, const struct xsave_proc_info *, 2155 uint64_t, const void *); 2156 /* 2157 * The xi_signal_in and xi_signal_out function pointers are used for 2158 * extended context and signal handling information. They are used when 2159 * reading in data from a ucontext_t and writing it out respectively. 2160 * These are only used for context/signal handling. 2161 */ 2162 int (*xi_signal_in)(const struct xsave_proc_info *, 2163 const ucontext_t *, const uc_xsave_t *, void *, uintptr_t *, 2164 const uintptr_t); 2165 int (*xi_signal_out)(const struct xsave_proc_info *, fpu_copyout_f, 2166 uc_xsave_t *, const void *fpup, uintptr_t); 2167 } xsave_proc_info_t; 2168 2169 static bool 2170 fpu_proc_xregs_initial_state(const fpu_ctx_t *fpu, uint64_t feats) 2171 { 2172 const struct xsave_state *xs = fpu->fpu_regs.kfpu_u.kfpu_xs; 2173 2174 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) { 2175 return (true); 2176 } 2177 2178 return ((xs->xs_header.xsh_xstate_bv & feats) == 0); 2179 } 2180 2181 static void 2182 fpu_proc_xregs_xcr_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2183 void *datap) 2184 { 2185 prxregset_xcr_t *xcr = datap; 2186 2187 xcr->prx_xcr_xcr0 = xsave_bv_all; 2188 } 2189 2190 /* 2191 * Unlike other instruction portions, we treat the xsave header and the legacy 2192 * XMM section together as both are somewhat tied at the instruction hip. Unlike 2193 * the when dealing with other xsave regions like the ymm and zmm components, 2194 * the initial state here is much more nuanced as it has to match what we actual 2195 * do in the OS and depends on the components that are present. 
2196 */ 2197 static void 2198 fpu_proc_xregs_xsave_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2199 void *datap) 2200 { 2201 prxregset_xsave_t *prxsave = datap; 2202 const struct xsave_state *xsave = fpu->fpu_regs.kfpu_u.kfpu_xs; 2203 size_t hdr_off; 2204 2205 /* 2206 * In the x87/XMM case, the no device vs. initial state is different 2207 * because the initial state case still wants us to copy the real xsave 2208 * header. It's also worth calling out that the actual illumos default 2209 * fxsave state is not the same as what Intel documents. The main 2210 * difference is in what the x87 FPU control word is. This results in 2211 * the following different cases that we need to think about: 2212 * 2213 * o FPU_EN is not set. So we use the illumos default. 2214 */ 2215 if ((fpu->fpu_flags & FPU_EN) == 0) { 2216 bcopy(&avx_initial, prxsave, sizeof (*prxsave)); 2217 return; 2218 } 2219 2220 /* 2221 * Convert all the fxsave region while taking into account the validity 2222 * of the xsave bits. The prxregset_xsave_t structure is the same as the 2223 * xsave structure in our ABI and Intel designed the xsave header to 2224 * begin with the 512-bit fxsave structure. 2225 */ 2226 fpu_xsave_to_fxsave(xsave, (struct fxsave_state *)prxsave); 2227 2228 /* 2229 * Now that we've dealt with the x87 and XMM state, take care of the 2230 * header. 
2231 */ 2232 hdr_off = offsetof(prxregset_xsave_t, prx_xsh_xstate_bv); 2233 bcopy((const void *)((uintptr_t)xsave + hdr_off), 2234 (void *)((uintptr_t)prxsave + hdr_off), 2235 sizeof (struct xsave_header)); 2236 } 2237 2238 static void 2239 fpu_proc_xregs_std_fill(const fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2240 void *datap) 2241 { 2242 if (!fpu_proc_xregs_initial_state(fpu, info->xi_bits)) { 2243 size_t size, off; 2244 const void *xsave_off; 2245 2246 cpuid_get_xsave_info(info->xi_bits, &size, &off); 2247 ASSERT3U(size, ==, info->xi_size); 2248 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs + 2249 off); 2250 bcopy(xsave_off, datap, info->xi_size); 2251 } 2252 } 2253 2254 /* 2255 * Users are not allowed to actually set the xcr information this way. However, 2256 * to make it easier for someone to just do a read, modify, write, of the xregs 2257 * data, if it is identical, then we will accept it (and do nothing). 2258 */ 2259 static bool 2260 fpu_proc_xregs_xcr_valid(model_t model, const void *datap) 2261 { 2262 const prxregset_xcr_t *xcr = datap; 2263 2264 return (xcr->prx_xcr_xcr0 == xsave_bv_all && xcr->prx_xcr_xfd == 0 && 2265 xcr->prx_xcr_pad[0] == 0 && xcr->prx_xcr_pad[1] == 0); 2266 } 2267 2268 /* 2269 * To match traditional /proc semantics, we do not error if reserved bits of 2270 * MXCSR are set, they will be masked off when writing data. We do not allow 2271 * someone to indicate that they are asking for compressed xsave data, hence the 2272 * check that prx_xsh_comp_bv is zero. Separately, in fpu_proc_xregs_set() we 2273 * check that each component that was indicated in the xstate_bv is actually 2274 * present. 
2275 */ 2276 static bool 2277 fpu_proc_xregs_xsave_valid(model_t model, const void *datap) 2278 { 2279 const prxregset_xsave_t *xsave = datap; 2280 uint64_t rsvd[6] = { 0 }; 2281 2282 if (bcmp(rsvd, xsave->prx_xsh_reserved, sizeof (rsvd)) != 0 || 2283 xsave->prx_xsh_xcomp_bv != 0) { 2284 return (false); 2285 } 2286 2287 if ((xsave->prx_xsh_xstate_bv & ~xsave_bv_all) != 0) { 2288 return (false); 2289 } 2290 2291 return (true); 2292 } 2293 2294 /* 2295 * The YMM, ZMM, and Hi-ZMM registers are all valid when in an LP64 environment 2296 * on x86; however, when operating in ILP32, subsets are reserved. We require 2297 * that all reserved portions are set to zero. 2298 */ 2299 static bool 2300 fpu_proc_xregs_ymm_valid(model_t model, const void *datap) 2301 { 2302 upad128_t ymm_zero[8]; 2303 const prxregset_ymm_t *ymm = datap; 2304 2305 if (model == DATAMODEL_LP64) { 2306 return (true); 2307 } 2308 2309 bzero(&ymm_zero, sizeof (ymm_zero)); 2310 return (bcmp(&ymm->prx_ymm[8], &ymm_zero, sizeof (ymm_zero)) == 0); 2311 } 2312 2313 static bool 2314 fpu_proc_xregs_zmm_valid(model_t model, const void *datap) 2315 { 2316 upad256_t zmm_zero[8]; 2317 const prxregset_zmm_t *zmm = datap; 2318 2319 if (model == DATAMODEL_LP64) { 2320 return (true); 2321 } 2322 2323 bzero(&zmm_zero, sizeof (zmm_zero)); 2324 return (bcmp(&zmm->prx_zmm[8], &zmm_zero, sizeof (zmm_zero)) == 0); 2325 } 2326 2327 static bool 2328 fpu_proc_xregs_hi_zmm_valid(model_t model, const void *datap) 2329 { 2330 prxregset_hi_zmm_t hi_zmm_zero; 2331 const prxregset_hi_zmm_t *hi_zmm = datap; 2332 2333 if (model == DATAMODEL_LP64) { 2334 return (true); 2335 } 2336 2337 bzero(&hi_zmm_zero, sizeof (hi_zmm_zero)); 2338 return (bcmp(hi_zmm, &hi_zmm_zero, sizeof (hi_zmm_zero)) == 0); 2339 } 2340 2341 /* 2342 * The xsave state consists of the first 512 bytes of the XMM state and then the 2343 * xsave header itself. 
 * Because of the xsave header, this structure is marked
 * with xi_always, so we must always process and consider it.
 *
 * Semantically if either of the bits around SSE / x87 is set, then we will copy
 * the entire thing. This may mean that we end up copying a region that is not
 * valid into the save area; however, that should be OK as we still have the
 * specific bit flags that indicate what we should consider or not.
 *
 * There is one additional wrinkle we need to consider and honor here. The CPU
 * will load the MXCSR values if the AVX bit is set in an xrstor regardless of
 * anything else. So if the AVX bit is set and neither the x87 nor the XMM bit
 * is set, then we will set the MXCSR to its default state in case the
 * processor tries to load it. For reference see:
 *
 *   o Intel SDM Volume 1: 13.8.1 Standard Form of XRSTOR
 *   o AMD64 Volume 2: Section 11.5.9 MXCSR State Management
 *
 * Note, the behavior around this changes depending on whether using the
 * compressed xrstor or not. We are not, but it's worth being aware of. We do
 * not worry about MXCSR_MASK because the instructions ignore it.
2363 */ 2364 static void 2365 fpu_proc_xregs_xsave_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2366 uint64_t xsave_bv, const void *datap) 2367 { 2368 const struct xsave_state *src_xs = datap; 2369 struct xsave_state *targ_xs = fpu->fpu_regs.kfpu_u.kfpu_xs; 2370 2371 if ((xsave_bv & info->xi_bits) != 0) { 2372 bcopy(&src_xs->xs_fxsave, &targ_xs->xs_fxsave, 2373 sizeof (struct fxsave_state)); 2374 } else if ((xsave_bv & XFEATURE_AVX) != 0) { 2375 targ_xs->xs_fxsave.fx_mxcsr = SSE_MXCSR_INIT; 2376 } 2377 2378 bcopy(&src_xs->xs_header, &targ_xs->xs_header, 2379 sizeof (struct xsave_header)); 2380 targ_xs->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask; 2381 } 2382 2383 static void 2384 fpu_proc_xregs_std_set(fpu_ctx_t *fpu, const xsave_proc_info_t *info, 2385 uint64_t xsave_bv, const void *datap) 2386 { 2387 size_t size, off; 2388 void *xsave_off; 2389 2390 cpuid_get_xsave_info(info->xi_bits, &size, &off); 2391 xsave_off = (void *)((uintptr_t)fpu->fpu_regs.kfpu_u.kfpu_xs + 2392 off); 2393 bcopy(datap, xsave_off, size); 2394 } 2395 2396 /* 2397 * Dealing with XMM data is a little more annoying in signal context. If UC_FPU 2398 * is set, the ucontext_t's fpregset_t contains a copy of the XMM region. That 2399 * must take priority over an XMM region that showed up in the uc_xsave_t data. 2400 * In the signal copyout code we do not save XMM region in the uc_xsave_t or set 2401 * it as a present component because of it being kept in the fpregset_t. Because 2402 * of this behavior, if we find the XMM (or x87) state bits present, we treat 2403 * that as an error. 2404 * 2405 * The system has always gone through and cleaned up the reserved bits in the 2406 * fxsave state when someone calls setcontext(). Therefore we need to do the 2407 * same thing which is why you see the masking of the mxcsr below. 2408 * 2409 * Finally, there is one last wrinkle here that we need to consider. The 2410 * fpregset_t has two private words which cache the status/exception 2411 * information. 
Therefore, we well... cheat. Intel has left bytes 464 (0x1d0) 2412 * through 511 (0x1ff) available for us to do what we want. So we will pass this 2413 * through that for the moment to help us pass this state around without too 2414 * much extra allocation. 2415 */ 2416 static int 2417 fpu_signal_copyin_xmm(const xsave_proc_info_t *info, const ucontext_t *kuc, 2418 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap, 2419 const uintptr_t max_udata) 2420 { 2421 struct xsave_state *xsave = fpup; 2422 2423 if ((ucx->ucx_bv & info->xi_bits) != 0) { 2424 return (EINVAL); 2425 } 2426 2427 if ((kuc->uc_flags & UC_FPU) != 0) { 2428 bcopy(&kuc->uc_mcontext.fpregs, &xsave->xs_fxsave, 2429 sizeof (struct fxsave_state)); 2430 xsave->xs_fxsave.__fx_ign2[3]._l[0] = 2431 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.status; 2432 xsave->xs_fxsave.__fx_ign2[3]._l[1] = 2433 kuc->uc_mcontext.fpregs.fp_reg_set.fpchip_state.xstatus; 2434 xsave->xs_fxsave.fx_mxcsr &= sse_mxcsr_mask; 2435 xsave->xs_header.xsh_xstate_bv |= info->xi_bits; 2436 } 2437 2438 return (0); 2439 } 2440 2441 static int 2442 fpu_signal_copyin_std(const xsave_proc_info_t *info, const ucontext_t *kuc, 2443 const uc_xsave_t *ucx, void *fpup, uintptr_t *udatap, 2444 const uintptr_t max_udata) 2445 { 2446 size_t len, xsave_off; 2447 void *copy_to; 2448 struct xsave_state *xsave = fpup; 2449 2450 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off); 2451 if (*udatap + len > max_udata) { 2452 return (EOVERFLOW); 2453 } 2454 2455 copy_to = (void *)((uintptr_t)fpup + xsave_off); 2456 if (ddi_copyin((void *)*udatap, copy_to, len, 0) != 0) { 2457 return (EFAULT); 2458 } 2459 2460 xsave->xs_header.xsh_xstate_bv |= info->xi_bits; 2461 *udatap = *udatap + len; 2462 2463 return (0); 2464 } 2465 2466 static int 2467 fpu_signal_copyout_std(const xsave_proc_info_t *info, fpu_copyout_f copyfunc, 2468 uc_xsave_t *ucx, const void *fpup, uintptr_t udatap) 2469 { 2470 size_t len, xsave_off; 2471 const void *copy_from; 2472 void 
*copy_to; 2473 int ret; 2474 2475 cpuid_get_xsave_info(info->xi_bits, &len, &xsave_off); 2476 copy_from = (void *)(uintptr_t)fpup + xsave_off; 2477 copy_to = (void *)(udatap + ucx->ucx_len); 2478 2479 ret = copyfunc(copy_from, copy_to, len); 2480 if (ret != 0) { 2481 return (ret); 2482 } 2483 2484 ucx->ucx_len += len; 2485 ucx->ucx_bv |= info->xi_bits; 2486 return (0); 2487 } 2488 2489 /* 2490 * This table contains information about the extended FPU states and synthetic 2491 * information we create for /proc, the ucontext_t, and signal handling. The 2492 * definition of the xsave_proc_info_t describes how each member is used. 2493 * 2494 * In general, this table is expected to be in the order of the xsave data 2495 * structure itself. Synthetic elements that we create can go anywhere and new 2496 * ones should be inserted at the end. This structure is walked in order to 2497 * produce the /proc and signal handling logic, so changing the order is 2498 * meaningful for those and should not be done lightly. 2499 */ 2500 static const xsave_proc_info_t fpu_xsave_info[] = { { 2501 .xi_type = PRX_INFO_XCR, 2502 .xi_size = sizeof (prxregset_xcr_t), 2503 .xi_align = alignof (prxregset_xcr_t), 2504 .xi_always = true, 2505 .xi_bits = 0, 2506 .xi_fill = fpu_proc_xregs_xcr_fill, 2507 .xi_valid = fpu_proc_xregs_xcr_valid 2508 }, { 2509 /* 2510 * The XSAVE entry covers both the xsave header and the %xmm registers. 2511 * Note, there is no signal copyout information for the %xmm registers 2512 * because it is expected that that data is already in the fpregset_t. 
2513 */ 2514 .xi_type = PRX_INFO_XSAVE, 2515 .xi_size = sizeof (prxregset_xsave_t), 2516 .xi_align = FPU_ALIGN_XMM, 2517 .xi_always = true, 2518 .xi_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE, 2519 .xi_fill = fpu_proc_xregs_xsave_fill, 2520 .xi_set = fpu_proc_xregs_xsave_set, 2521 .xi_valid = fpu_proc_xregs_xsave_valid, 2522 .xi_signal_in = fpu_signal_copyin_xmm 2523 }, { 2524 .xi_type = PRX_INFO_YMM, 2525 .xi_size = sizeof (prxregset_ymm_t), 2526 .xi_align = FPU_ALIGN_YMM, 2527 .xi_always = false, 2528 .xi_bits = XFEATURE_AVX, 2529 .xi_fill = fpu_proc_xregs_std_fill, 2530 .xi_set = fpu_proc_xregs_std_set, 2531 .xi_signal_in = fpu_signal_copyin_std, 2532 .xi_valid = fpu_proc_xregs_ymm_valid, 2533 .xi_signal_out = fpu_signal_copyout_std 2534 }, { 2535 /* 2536 * There is no /proc validation function for the mask registers because 2537 * they are the same in ILP32 / LP64 and there is nothing for us to 2538 * actually validate. 2539 */ 2540 .xi_type = PRX_INFO_OPMASK, 2541 .xi_size = sizeof (prxregset_opmask_t), 2542 .xi_align = alignof (prxregset_opmask_t), 2543 .xi_always = false, 2544 .xi_bits = XFEATURE_AVX512_OPMASK, 2545 .xi_fill = fpu_proc_xregs_std_fill, 2546 .xi_set = fpu_proc_xregs_std_set, 2547 .xi_signal_in = fpu_signal_copyin_std, 2548 .xi_signal_out = fpu_signal_copyout_std 2549 }, { 2550 .xi_type = PRX_INFO_ZMM, 2551 .xi_size = sizeof (prxregset_zmm_t), 2552 .xi_align = FPU_ALIGN_ZMM, 2553 .xi_always = false, 2554 .xi_bits = XFEATURE_AVX512_ZMM, 2555 .xi_fill = fpu_proc_xregs_std_fill, 2556 .xi_set = fpu_proc_xregs_std_set, 2557 .xi_valid = fpu_proc_xregs_zmm_valid, 2558 .xi_signal_in = fpu_signal_copyin_std, 2559 .xi_signal_out = fpu_signal_copyout_std 2560 }, { 2561 .xi_type = PRX_INFO_HI_ZMM, 2562 .xi_size = sizeof (prxregset_hi_zmm_t), 2563 .xi_align = FPU_ALIGN_ZMM, 2564 .xi_always = false, 2565 .xi_bits = XFEATURE_AVX512_HI_ZMM, 2566 .xi_fill = fpu_proc_xregs_std_fill, 2567 .xi_set = fpu_proc_xregs_std_set, 2568 .xi_valid = 
fpu_proc_xregs_hi_zmm_valid, 2569 .xi_signal_in = fpu_signal_copyin_std, 2570 .xi_signal_out = fpu_signal_copyout_std 2571 } }; 2572 2573 static bool 2574 fpu_proc_xregs_include(const xsave_proc_info_t *infop) 2575 { 2576 return (infop->xi_always || (xsave_bv_all & infop->xi_bits) != 0); 2577 } 2578 2579 void 2580 fpu_proc_xregs_info(struct proc *p __unused, uint32_t *ninfop, uint32_t *sizep, 2581 uint32_t *dstart) 2582 { 2583 size_t ret = sizeof (prxregset_hdr_t); 2584 uint32_t ninfo = 0; 2585 2586 ASSERT(fpu_xsave_enabled()); 2587 2588 /* 2589 * Right now the set of flags that are enabled in the FPU is global. 2590 * That is, while the pcb's fcpu_ctx_t has the fpu_xsave_mask, the 2591 * actual things that might show up and we care about are all about what 2592 * is set up in %xcr0 which is stored in the global xsave_bv_all. If we 2593 * move to per-process FPU enablement which is likely to come with AMX, 2594 * then this will need the proc_t to look at, hence why we've set things 2595 * up with the unused variable above. 2596 * 2597 * We take two passes through the array. The first is just to count up 2598 * how many informational entries we need. 
2599 */ 2600 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2601 if (!fpu_proc_xregs_include(&fpu_xsave_info[i])) 2602 continue; 2603 ninfo++; 2604 } 2605 2606 ASSERT3U(ninfo, >, 0); 2607 ret += sizeof (prxregset_info_t) * ninfo; 2608 2609 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2610 size_t curphase; 2611 if (!fpu_proc_xregs_include(&fpu_xsave_info[i])) 2612 continue; 2613 2614 curphase = ret % fpu_xsave_info[i].xi_align; 2615 if (ret < fpu_xsave_info[i].xi_align) { 2616 ret = fpu_xsave_info[i].xi_align; 2617 } else if (curphase != 0) { 2618 ret += curphase; 2619 } 2620 2621 if (i == 0 && dstart != NULL) { 2622 *dstart = ret; 2623 } 2624 2625 ret += fpu_xsave_info[i].xi_size; 2626 } 2627 2628 VERIFY3U(ret, <=, UINT32_MAX); 2629 if (sizep != NULL) { 2630 *sizep = ret; 2631 } 2632 2633 if (ninfop != NULL) { 2634 *ninfop = ninfo; 2635 } 2636 } 2637 2638 /* 2639 * This function supports /proc. Because /proc does not have a process locked 2640 * while processing a PCSXREG, this tries to establish an upper bound that we 2641 * will validate later in fpu_proc_xregs_set(). We basically say that if you 2642 * take the maximum xsave size and add 1 KiB that is a good enough approximation 2643 * for the maximum size. The 1 KiB is us basically trying to rationalize the 2644 * overhead of our structures that we're adding right, while being cognisant of 2645 * differing alignments and the fact that the full xsave size is in some cases 2646 * (when supervisor states or features we don't support are present) going to be 2647 * larger than we would need for this. 2648 */ 2649 size_t 2650 fpu_proc_xregs_max_size(void) 2651 { 2652 VERIFY(fpu_xsave_enabled()); 2653 return (cpuid_get_xsave_size() + 0x1000); 2654 } 2655 2656 /* 2657 * This functions supports /proc. In particular, it's meant to perform the 2658 * following: 2659 * 2660 * o Potentially save the current thread's registers. 
2661 * o Write out the x86 xsave /proc xregs format data from the xsave data we 2662 * actually have. Note, this can be a little weird for cases where the FPU is 2663 * not actually enabled, which happens for system processes. 2664 */ 2665 void 2666 fpu_proc_xregs_get(klwp_t *lwp, void *buf) 2667 { 2668 uint32_t size, ninfo, curinfo, dstart; 2669 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu; 2670 prxregset_hdr_t *hdr = buf; 2671 2672 ASSERT(fpu_xsave_enabled()); 2673 fpu_proc_xregs_info(lwp->lwp_procp, &ninfo, &size, &dstart); 2674 2675 /* 2676 * Before we get going, defensively zero out all the data buffer so that 2677 * the rest of the fill functions can assume a specific base. 2678 */ 2679 bzero(buf, size); 2680 2681 kpreempt_disable(); 2682 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 2683 /* 2684 * This case suggests that thread in question doesn't have a 2685 * valid FPU save state which should only happen when it is on 2686 * CPU. If this is the case, we must ensure that we save the 2687 * current FPU state before proceeding. We also sanity check 2688 * several things here before doing this as using /proc on 2689 * yourself is always exciting. fp_save() will ensure that the 2690 * thread is flagged to go back to being an eager FPU before 2691 * returning back to userland. 
2692 */ 2693 VERIFY3P(curthread, ==, lwptot(lwp)); 2694 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 2695 fp_save(fpu); 2696 } 2697 kpreempt_enable(); 2698 2699 hdr->pr_type = PR_TYPE_XSAVE; 2700 hdr->pr_size = size; 2701 hdr->pr_flags = hdr->pr_pad[0] = hdr->pr_pad[1] = hdr->pr_pad[2] = 2702 hdr->pr_pad[3] = 0; 2703 hdr->pr_ninfo = ninfo; 2704 2705 curinfo = 0; 2706 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2707 void *startp; 2708 uint32_t phase; 2709 2710 if (!fpu_proc_xregs_include(&fpu_xsave_info[i])) 2711 continue; 2712 2713 phase = dstart % fpu_xsave_info[i].xi_align; 2714 if (dstart < fpu_xsave_info[i].xi_align) { 2715 ASSERT3U(i, !=, 0); 2716 dstart = fpu_xsave_info[i].xi_align; 2717 } else if (phase != 0) { 2718 ASSERT3U(i, !=, 0); 2719 dstart += phase; 2720 } 2721 2722 hdr->pr_info[curinfo].pri_type = fpu_xsave_info[i].xi_type; 2723 hdr->pr_info[curinfo].pri_flags = 0; 2724 hdr->pr_info[curinfo].pri_size = fpu_xsave_info[i].xi_size; 2725 hdr->pr_info[curinfo].pri_offset = dstart; 2726 2727 startp = (void *)((uintptr_t)buf + dstart); 2728 fpu_xsave_info[i].xi_fill(fpu, &fpu_xsave_info[i], startp); 2729 dstart += fpu_xsave_info[i].xi_size; 2730 ASSERT3U(curinfo, <=, ninfo); 2731 curinfo++; 2732 } 2733 } 2734 2735 /* 2736 * We have been asked to set the data in the FPU for a given thread. Our 2737 * prmachdep code has already validated that the raw semantics of the data that 2738 * we have are valid (that is the appropriate sizes, offsets, and flags). We now 2739 * apply additional checking here: 2740 * 2741 * o The xsave structure is present and only valid bits are set. 2742 * o If the xsave component bit-vector is set, we have the corresponding proc 2743 * info item. 2744 * o Read-only items are ignored if and only if they actually match what we 2745 * gave the user mostly as a courtesy to simplify things here. 
2746 * o ILP32 processes which can't support many of the regions are allowed to 2747 * have the items here (as we likely gave them to them), but they must be 2748 * zero if they are set. 2749 * 2750 * We take a first pass through all the data, validating it makes sense for the 2751 * FPU. Only after that point do we ensure that we have the FPU data in question 2752 * and then we clobber all the FPU data. Part of the semantics of setting this 2753 * is that we're setting the entire extended FPU. 2754 */ 2755 int 2756 fpu_proc_xregs_set(klwp_t *lwp, void *buf) 2757 { 2758 prxregset_hdr_t *prx = buf; 2759 model_t model = lwp_getdatamodel(lwp); 2760 uint64_t bv_found = 0; 2761 const prxregset_xsave_t *xsave = NULL; 2762 fpu_ctx_t *fpu = &lwp->lwp_pcb.pcb_fpu; 2763 2764 VERIFY(fpu_xsave_enabled()); 2765 2766 /* 2767 * First, walk each note info header that we have from the user and 2768 * proceed to validate it. The prmachdep code has already validated that 2769 * the size, type, and offset information is valid, but it has not 2770 * validated the semantic contents of this or if someone is trying to 2771 * write something they shouldn't. 2772 * 2773 * While we walk this, we keep track of where the xsave header is. We 2774 * also track all of the bits that we have found along the way so we can 2775 * match up and ensure that everything that was set has a corresponding 2776 * bit in the xsave bitmap. If we have something in the xsave bitmap, 2777 * but not its corresponding data, then that is an error. However, we 2778 * allow folks to write data regions without the bit set in the xsave 2779 * data to make the read, modify, write process simpler. 
2780 */ 2781 for (uint32_t i = 0; i < prx->pr_ninfo; i++) { 2782 const prxregset_info_t *info = &prx->pr_info[i]; 2783 bool found = false; 2784 2785 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) { 2786 void *data; 2787 if (info->pri_type != fpu_xsave_info[pt].xi_type) 2788 continue; 2789 2790 found = true; 2791 data = (void *)((uintptr_t)buf + info->pri_offset); 2792 if (fpu_xsave_info[pt].xi_valid != NULL && 2793 !fpu_xsave_info[pt].xi_valid(model, data)) { 2794 return (EINVAL); 2795 } 2796 2797 if (info->pri_type == PRX_INFO_XSAVE) { 2798 xsave = data; 2799 } 2800 bv_found |= fpu_xsave_info[pt].xi_bits; 2801 break; 2802 } 2803 2804 if (!found) { 2805 return (EINVAL); 2806 } 2807 } 2808 2809 /* 2810 * No xsave data, no dice. 2811 */ 2812 if (xsave == NULL) { 2813 return (EINVAL); 2814 } 2815 2816 /* 2817 * If anything is set in the xsave header that was not found as we 2818 * walked structures, then that is an error. The opposite is not true as 2819 * discussed above. 2820 */ 2821 if ((xsave->prx_xsh_xstate_bv & ~bv_found) != 0) { 2822 return (EINVAL); 2823 } 2824 2825 /* 2826 * At this point, we consider all the data actually valid. Now we must 2827 * set up this information in the save area. If this is our own lwp, we 2828 * must disable it first. Otherwise, we expect that it is already valid. 2829 * To try to sanitize this, we will defensively zero the entire region 2830 * as we are setting everything that will result in here. 2831 */ 2832 kpreempt_disable(); 2833 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 2834 /* 2835 * This case suggests that thread in question doesn't have a 2836 * valid FPU save state which should only happen when it is on 2837 * CPU. If this is the case, we explicitly disable the FPU, but 2838 * do not save it before proceeding. We also sanity check 2839 * several things here before doing this as using /proc on 2840 * yourself is always exciting. 
Unlike fp_save(), fp_free() does 2841 * not signal that an update is required, so we unconditionally 2842 * set that for all threads. 2843 */ 2844 VERIFY3P(curthread, ==, lwptot(lwp)); 2845 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 2846 fp_free(fpu); 2847 } 2848 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); 2849 bzero(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, 2850 cpuid_get_xsave_size()); 2851 2852 for (uint32_t i = 0; i < prx->pr_ninfo; i++) { 2853 const prxregset_info_t *info = &prx->pr_info[i]; 2854 bool found = false; 2855 2856 for (size_t pt = 0; pt < ARRAY_SIZE(fpu_xsave_info); pt++) { 2857 const void *data; 2858 if (info->pri_type != fpu_xsave_info[pt].xi_type) 2859 continue; 2860 2861 /* 2862 * Check if we have a set function and if we should 2863 * include this. We may not if this is something like 2864 * PRX_INFO_XCR which is read-only. 2865 * 2866 * We may not include a given entry as it may not have 2867 * been set in the actual xsave state that we have been 2868 * asked to restore, in which case to not break the 2869 * xsaveopt logic, we must leave it in its initial 2870 * state, e.g. zeroed (generally). XMM data initial 2871 * state is not zeroed, but is marked with xi_always to 2872 * help account for this. 2873 */ 2874 found = true; 2875 if (fpu_xsave_info[pt].xi_set == NULL) 2876 break; 2877 if (!fpu_xsave_info[pt].xi_always && 2878 (xsave->prx_xsh_xstate_bv & 2879 fpu_xsave_info[pt].xi_bits) != 2880 fpu_xsave_info[pt].xi_bits) { 2881 break; 2882 } 2883 2884 data = (void *)((uintptr_t)buf + info->pri_offset); 2885 fpu_xsave_info[pt].xi_set(fpu, &fpu_xsave_info[pt], 2886 xsave->prx_xsh_xstate_bv, data); 2887 } 2888 2889 VERIFY(found); 2890 } 2891 kpreempt_enable(); 2892 2893 return (0); 2894 } 2895 2896 /* 2897 * To be included in the signal copyout logic we must have a copy function and 2898 * the bit in question must be included. 
Note, we don't consult xi_always here 2899 * as that is really part of what is always present for xsave logic and 2900 * therefore isn't really pertinent here because of our custom format. See the 2901 * big theory statement for more info. 2902 */ 2903 static bool 2904 fpu_signal_include(const xsave_proc_info_t *infop, uint64_t xs_bv) 2905 { 2906 return ((infop->xi_bits & xs_bv) == infop->xi_bits && 2907 infop->xi_signal_out != NULL); 2908 } 2909 2910 /* 2911 * We need to fill out the xsave related data into the ucontext_t that we've 2912 * been given. We should have a valid user pointer at this point in the uc_xsave 2913 * member. This is much simpler than the copyin that we have. Here are the 2914 * current assumptions: 2915 * 2916 * o This is being called for the current thread. This is not meant to operate 2917 * on an arbitrary thread's state. 2918 * o We cannot assume whether the FPU is valid in the pcb or not. While most 2919 * callers will have just called getfpregs() which saved the state, don't 2920 * assume that. 2921 * o We assume that the user address has the requisite required space for this 2922 * to be copied out. 2923 * o We assume that copyfunc() will ensure we are not copying into a kernel 2924 * address. 2925 * 2926 * For more information on the format of the data, see the 'Signal Handling and 2927 * the ucontext_t' portion of the big theory statement. We copy out all the 2928 * constituent parts and then come back and write out the actual final header 2929 * information. 
2930 */ 2931 int 2932 fpu_signal_copyout(klwp_t *lwp, uintptr_t uaddr, fpu_copyout_f copyfunc) 2933 { 2934 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 2935 uint64_t xs_bv; 2936 uc_xsave_t ucx; 2937 int ret; 2938 2939 VERIFY3P(curthread, ==, lwptot(lwp)); 2940 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 2941 VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN); 2942 2943 if (!fpu_xsave_enabled()) { 2944 return (ENOTSUP); 2945 } 2946 2947 /* 2948 * Unlike when we're dealing with /proc, we can unconditionally call 2949 * fp_save() because this is always called in the context where the lwp 2950 * we're operating on is always the one on CPU (which is what fp_save() 2951 * asserts). 2952 */ 2953 fp_save(fpu); 2954 2955 bzero(&ucx, sizeof (ucx)); 2956 ucx.ucx_vers = UC_XSAVE_VERS; 2957 ucx.ucx_len += sizeof (uc_xsave_t); 2958 2959 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv; 2960 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 2961 const xsave_proc_info_t *info = &fpu_xsave_info[i]; 2962 2963 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv)) 2964 continue; 2965 ret = info->xi_signal_out(info, copyfunc, &ucx, 2966 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, 2967 uaddr); 2968 if (ret != 0) { 2969 kpreempt_enable(); 2970 return (ret); 2971 } 2972 } 2973 2974 /* 2975 * Now that everything has been copied out, we should have an accurate 2976 * value in the uc_xsave_t header and we can copy that out at the start 2977 * of the user data. 2978 */ 2979 ret = copyfunc(&ucx, (void *)uaddr, sizeof (ucx)); 2980 return (ret); 2981 } 2982 2983 /* 2984 * Here we've been given a ucontext_t which potentially has a user pointer to 2985 * xsave state that we've copied out previously. In this case we need to do the 2986 * following, assuming UC_XSAVE is present: 2987 * 2988 * o Copy in our header and validate it. 2989 * o Allocate an fpu context to use as a holding ground for all this data. 
2990 * o If UC_FPU is set, override the xsave structure with the saved XMM state, 2991 * clear UC_FPU, and make sure that the correct xsave_bv bits are set. 2992 * 2993 * Currently we always allocate the additional state as a holding ground for the 2994 * FPU. What we're copying in may not be valid and we don't want to clobber the 2995 * existing FPU state or deal with merging it until we believe it's reasonable 2996 * enough. The proc_t is here to set us up for when we have per-process settings 2997 * in the extended feature disable MSRs. 2998 */ 2999 int 3000 fpu_signal_copyin(klwp_t *lwp, ucontext_t *kuc) 3001 { 3002 uc_xsave_t ucx; 3003 uint64_t bv; 3004 uintptr_t data, max_data; 3005 void *fpu; 3006 proc_t *p = lwp->lwp_procp; 3007 size_t ksize; 3008 3009 /* 3010 * Because this has been opaque filler and the kernel has never 3011 * historically looked at it, we don't really care about the uc_xsave 3012 * pointer being garbage in the case that the flag is not set. While 3013 * this isn't perhaps the most sporting choice in some cases, this is on 3014 * the other hand, pragmatic. 3015 */ 3016 if ((kuc->uc_flags & UC_XSAVE) != 0) { 3017 if (kuc->uc_xsave == 0) { 3018 return (EINVAL); 3019 } 3020 3021 if (!fpu_xsave_enabled()) { 3022 return (ENOTSUP); 3023 } 3024 } else { 3025 return (0); 3026 } 3027 3028 if (ddi_copyin((const void *)kuc->uc_xsave, &ucx, sizeof (ucx), 0) != 3029 0) { 3030 return (EFAULT); 3031 } 3032 3033 ksize = cpuid_get_xsave_size(); 3034 if (ucx.ucx_vers != UC_XSAVE_VERS || ucx.ucx_len < sizeof (ucx) || 3035 ucx.ucx_len > ksize || 3036 (ucx.ucx_bv & ~xsave_bv_all) != 0 || 3037 (uintptr_t)p->p_as->a_userlimit - ucx.ucx_len < 3038 (uintptr_t)kuc->uc_xsave) { 3039 return (EINVAL); 3040 } 3041 3042 /* 3043 * OK, our goal right now is to recreate a valid xsave_state structure 3044 * that we'll ultimately end up having to merge with our existing one in 3045 * the FPU save state. 
The reason we describe this as a merge is to help 3046 * future us when we want to retain supervisor state which will never be 3047 * part of userland signal state. The design of the userland signal 3048 * state is basically to compress it as much as we can. This is done for 3049 * two reasons: 3050 * 3051 * 1) We currently consider this a private interface. 3052 * 2) We really want to minimize the actual amount of stack space we 3053 * use as much as possible. Most applications aren't using AVX-512 3054 * right now, so doing our own compression style is worthwhile. If 3055 * libc adopts AVX-512 routines, we may want to change this. 3056 * 3057 * On the allocation below, our assumption is that if a thread has taken 3058 * a signal, then it is likely to take a signal again in the future (or 3059 * be shortly headed to its demise). As such, when that happens we will 3060 * leave the allocated signal stack around for the process. Most 3061 * applications don't allow all threads to take signals, so this should 3062 * hopefully help amortize the cost of the allocation. 3063 */ 3064 max_data = (uintptr_t)kuc->uc_xsave + ucx.ucx_len; 3065 data = (uintptr_t)kuc->uc_xsave + sizeof (ucx); 3066 bv = ucx.ucx_bv; 3067 if (lwp->lwp_pcb.pcb_fpu.fpu_signal == NULL) { 3068 lwp->lwp_pcb.pcb_fpu.fpu_signal = 3069 kmem_cache_alloc(fpsave_cachep, KM_SLEEP); 3070 } 3071 fpu = lwp->lwp_pcb.pcb_fpu.fpu_signal; 3072 3073 /* 3074 * Unconditionally initialize the memory we get in here to ensure that 3075 * it is in a reasonable state for ourselves. This ensures that unused 3076 * regions are mostly left in their initial state (the main exception 3077 * here is the x87/XMM state, but that should be OK). We don't fill in 3078 * the initial xsave state as we expect that to happen as part of our 3079 * processing. 
3080 */ 3081 bzero(fpu, ksize); 3082 3083 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 3084 int ret; 3085 const xsave_proc_info_t *info = &fpu_xsave_info[i]; 3086 if (!info->xi_always && (info->xi_bits & bv) == 0) 3087 continue; 3088 bv &= ~info->xi_bits; 3089 3090 if (info->xi_signal_in == NULL) 3091 continue; 3092 ret = info->xi_signal_in(info, kuc, &ucx, fpu, &data, max_data); 3093 if (ret != 0) { 3094 return (ret); 3095 } 3096 } 3097 ASSERT0(bv); 3098 3099 /* 3100 * As described in the big theory statement section 'Signal Handling and 3101 * the ucontext_t', we always remove UC_FPU from here as we've taken 3102 * care of reassembling it ourselves. 3103 */ 3104 kuc->uc_flags &= ~UC_FPU; 3105 kuc->uc_xsave = (uintptr_t)fpu; 3106 3107 return (0); 3108 } 3109 3110 /* 3111 * This determines the size of the signal stack that we need for our custom form 3112 * of the xsave state. 3113 */ 3114 size_t 3115 fpu_signal_size(klwp_t *lwp) 3116 { 3117 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3118 size_t len = sizeof (uc_xsave_t); 3119 uint64_t xs_bv; 3120 3121 VERIFY3P(curthread, ==, lwptot(lwp)); 3122 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3123 VERIFY3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN); 3124 3125 if (!fpu_xsave_enabled()) { 3126 return (0); 3127 } 3128 3129 kpreempt_disable(); 3130 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3131 fp_save(fpu); 3132 } 3133 3134 xs_bv = fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv; 3135 for (size_t i = 0; i < ARRAY_SIZE(fpu_xsave_info); i++) { 3136 size_t comp_size; 3137 3138 if (!fpu_signal_include(&fpu_xsave_info[i], xs_bv)) 3139 continue; 3140 3141 cpuid_get_xsave_info(fpu_xsave_info[i].xi_bits, &comp_size, 3142 NULL); 3143 len += comp_size; 3144 } 3145 3146 kpreempt_enable(); 3147 return (len); 3148 } 3149 3150 /* 3151 * This function is used in service of restorecontext() to set the specified 3152 * thread's extended FPU state to the passed in data. 
Our assumptions at this 3153 * point from the system are: 3154 * 3155 * o Someone has already verified that the actual xsave header is correct. 3156 * o Any traditional XMM state that causes a #gp has been clamped. 3157 * o That data is basically the correct sized xsave state structure. Right now 3158 * that means it is not compressed and follows the CPUID-based rules for 3159 * constructing and laying out data. 3160 * o That the lwp argument refers to the current thread. 3161 * 3162 * Our primary purpose here is to merge the current FPU state with what exists 3163 * here. Right now, "merge", strictly speaking is just "replace". We can get 3164 * away with just replacing everything because all we currently save are user 3165 * states. If we start saving kernel states in here, this will get more nuanced 3166 * and we will need to be more careful about how we store data here. 3167 */ 3168 void 3169 fpu_set_xsave(klwp_t *lwp, const void *data) 3170 { 3171 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3172 uint32_t status, xstatus; 3173 struct xsave_state *dst_xsave; 3174 3175 VERIFY(fpu_xsave_enabled()); 3176 VERIFY3P(curthread, ==, lwptot(lwp)); 3177 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3178 ASSERT3U(fpu->fpu_flags & FPU_EN, ==, FPU_EN); 3179 3180 /* 3181 * We use fp_save() here rather than a stock fpdisable() so we can 3182 * attempt to honor our invariants that when the thread state has been 3183 * saved, the valid flag is set, even though we're going to be 3184 * overwriting it shortly. If we just called fpdisable() then we would 3185 * basically be asking for trouble. 3186 * 3187 * Because we are modifying the state here and we don't want the system 3188 * to end up in an odd state, we are being a little paranoid and 3189 * disabling preemption across this operation. 
In particular, once the 3190 * state is properly tagged with FPU_VALID, there should be no other way 3191 * that this thread can return to userland and get cleared out because 3192 * we're resetting its context; however, we let paranoia win out. 3193 */ 3194 kpreempt_disable(); 3195 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3196 fp_save(fpu); 3197 } 3198 3199 bcopy(data, lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, 3200 cpuid_get_xsave_size()); 3201 dst_xsave = lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic; 3202 status = dst_xsave->xs_fxsave.__fx_ign2[3]._l[0]; 3203 xstatus = dst_xsave->xs_fxsave.__fx_ign2[3]._l[1]; 3204 dst_xsave->xs_fxsave.__fx_ign2[3]._l[0] = 0; 3205 dst_xsave->xs_fxsave.__fx_ign2[3]._l[1] = 0; 3206 3207 /* 3208 * These two status words are information that the kernel itself uses to 3209 * track additional information and is part of the traditional fpregset, 3210 * but is not part of our xregs information. Because we are setting this 3211 * state, we leave it up to the rest of the kernel to determine whether 3212 * this came from an fpregset_t or is being reset to the default of 0. 3213 */ 3214 fpu->fpu_regs.kfpu_status = status; 3215 fpu->fpu_regs.kfpu_xstatus = xstatus; 3216 3217 fpu->fpu_flags |= FPU_VALID; 3218 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); 3219 kpreempt_enable(); 3220 } 3221 3222 /* 3223 * Convert the current FPU state to the traditional fpregset_t. In the 64-bit 3224 * kernel, this is just an fxsave_state with additional values for the status 3225 * and xstatus members. 3226 * 3227 * This has the same nuance as the xregs cases discussed above, but is simpler 3228 * in that we only need to handle the fxsave state, but more complicated because 3229 * we need to check our save mechanism. 
3230 */ 3231 void 3232 fpu_get_fpregset(klwp_t *lwp, fpregset_t *fp) 3233 { 3234 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3235 3236 kpreempt_disable(); 3237 fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status; 3238 fp->fp_reg_set.fpchip_state.xstatus = fpu->fpu_regs.kfpu_xstatus; 3239 3240 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3241 /* 3242 * If we're requesting the fpregs of a thread that isn't 3243 * currently valid and isn't the one that we're executing, then 3244 * we consider getting this information to be a best-effort and 3245 * we will not stop the thread in question to serialize it, 3246 * which means possibly getting stale data. This is the 3247 * traditional semantics that the system has used to service 3248 * this for /proc. 3249 */ 3250 if (curthread == lwptot(lwp)) { 3251 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3252 fp_save(fpu); 3253 } 3254 } 3255 3256 /* 3257 * If the FPU is not enabled and the state isn't valid (due to someone 3258 * else setting it), just copy the initial state. 3259 */ 3260 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == 0) { 3261 bcopy(&sse_initial, fp, sizeof (sse_initial)); 3262 kpreempt_enable(); 3263 return; 3264 } 3265 3266 /* 3267 * Given that we have an enabled FPU, we must look at the type of FPU 3268 * save mechanism to clean this up. In particular, while we can just 3269 * copy the save area with FXSAVE, with XSAVE we must carefully copy 3270 * only the bits that are valid and reset the rest to their default 3271 * state. 3272 */ 3273 switch (fp_save_mech) { 3274 case FP_FXSAVE: 3275 bcopy(fpu->fpu_regs.kfpu_u.kfpu_fx, fp, 3276 sizeof (struct fxsave_state)); 3277 break; 3278 case FP_XSAVE: 3279 fpu_xsave_to_fxsave(fpu->fpu_regs.kfpu_u.kfpu_xs, 3280 (struct fxsave_state *)fp); 3281 break; 3282 default: 3283 panic("Invalid fp_save_mech"); 3284 } 3285 3286 kpreempt_enable(); 3287 } 3288 3289 /* 3290 * This is a request to set the ABI fpregset_t into our actual hardware state. 
3291 * In the 64-bit kernel the first 512 bytes of the fpregset_t is the same as the 3292 * 512-byte fxsave area. 3293 */ 3294 void 3295 fpu_set_fpregset(klwp_t *lwp, const fpregset_t *fp) 3296 { 3297 struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; 3298 3299 kpreempt_disable(); 3300 if ((fpu->fpu_flags & (FPU_EN | FPU_VALID)) == FPU_EN) { 3301 /* 3302 * We always save the entire FPU. This is required if we're 3303 * using xsave. If we're using fxsave, we could skip the 3304 * 512-byte write and instead just disable the FPU since we'd be 3305 * replacing it all. For now we don't bother with more 3306 * conditional logic. 3307 */ 3308 VERIFY3P(curthread, ==, lwptot(lwp)); 3309 VERIFY0(lwptot(lwp)->t_flag & T_KFPU); 3310 fp_save(fpu); 3311 } 3312 3313 fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus; 3314 fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status; 3315 switch (fp_save_mech) { 3316 case FP_FXSAVE: 3317 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_fx, 3318 sizeof (struct fxsave_state)); 3319 break; 3320 case FP_XSAVE: 3321 bcopy(fp, fpu->fpu_regs.kfpu_u.kfpu_xs, 3322 sizeof (struct fxsave_state)); 3323 fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= 3324 XFEATURE_LEGACY_FP | XFEATURE_SSE; 3325 break; 3326 default: 3327 panic("Invalid fp_save_mech"); 3328 } 3329 3330 fpu->fpu_flags |= FPU_VALID; 3331 PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); 3332 kpreempt_enable(); 3333 } 3334