1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2021 Joyent, Inc. 24 * Copyright 2021 RackTop Systems, Inc. 25 * Copyright 2021 Oxide Computer Company 26 */ 27 28 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ 30 /* All Rights Reserved */ 31 32 /* Copyright (c) 1987, 1988 Microsoft Corporation */ 33 /* All Rights Reserved */ 34 35 /* 36 * Copyright (c) 2009, Intel Corporation. 37 * All rights reserved. 38 */ 39 40 #include <sys/types.h> 41 #include <sys/param.h> 42 #include <sys/signal.h> 43 #include <sys/regset.h> 44 #include <sys/privregs.h> 45 #include <sys/psw.h> 46 #include <sys/trap.h> 47 #include <sys/fault.h> 48 #include <sys/systm.h> 49 #include <sys/user.h> 50 #include <sys/file.h> 51 #include <sys/proc.h> 52 #include <sys/pcb.h> 53 #include <sys/lwp.h> 54 #include <sys/cpuvar.h> 55 #include <sys/thread.h> 56 #include <sys/disp.h> 57 #include <sys/fp.h> 58 #include <sys/siginfo.h> 59 #include <sys/archsystm.h> 60 #include <sys/kmem.h> 61 #include <sys/debug.h> 62 #include <sys/x86_archext.h> 63 #include <sys/sysmacros.h> 64 #include <sys/cmn_err.h> 65 #include <sys/kfpu.h> 66 67 /* 68 * FPU Management Overview 69 * ----------------------- 70 * 71 * The x86 FPU has evolved substantially since its days as the x87 coprocessor; 72 * however, many aspects of its life as a coprocessor are still around in x86. 73 * 74 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU. 75 * While that state still exists, there is much more that is covered by the FPU. 76 * Today, this includes not just traditional FPU state, but also supervisor only 77 * state. The following state is currently managed and covered logically by the 78 * idea of the FPU registers: 79 * 80 * o Traditional x87 FPU 81 * o Vector Registers (%xmm, %ymm, %zmm) 82 * o Memory Protection Extensions (MPX) Bounds Registers 83 * o Protected Key Rights Registers (PKRU) 84 * o Processor Trace data 85 * 86 * The rest of this covers how the FPU is managed and controlled, how state is 87 * saved and restored between threads, interactions with hypervisors, and other 88 * information exported to user land through aux vectors. A lot of background 89 * information is here to synthesize major parts of the Intel SDM, but 90 * unfortunately, it is not a replacement for reading it. 91 * 92 * FPU Control Registers 93 * --------------------- 94 * 95 * Because the x87 FPU began its life as a co-processor and the FPU was 96 * optional there are several bits that show up in %cr0 that we have to 97 * manipulate when dealing with the FPU. 
 * These are:
 *
 * o CR0.ET	The 'extension type' bit. This was used originally to
 *		indicate that the FPU co-processor was present. Now it is
 *		forced on for compatibility. This is often used to verify
 *		whether or not the FPU is present.
 *
 * o CR0.NE	The 'native error' bit. Used to indicate that native error
 *		mode should be enabled. This indicates that we should take
 *		traps on FPU errors. The OS enables this early in boot.
 *
 * o CR0.MP	The 'Monitor Coprocessor' bit. Used to control whether or
 *		not wait/fwait instructions generate a #NM if CR0.TS is set.
 *
 * o CR0.EM	The 'Emulation' bit. This is used to cause floating point
 *		operations (x87 through SSE4) to trap with a #UD so they can
 *		be emulated. The system never sets this bit, but makes sure
 *		it is clear on processor start up.
 *
 * o CR0.TS	The 'Task Switched' bit. When this is turned on, a floating
 *		point operation will generate a #NM. An fwait will as well,
 *		depending on the value in CR0.MP.
 *
 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
 * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a
 * more complicated role. Historically it has been used to allow running
 * systems to restore the FPU registers lazily. This will be discussed in
 * greater depth later on.
 *
 * %cr4 is also used as part of the FPU control. Specifically we need to
 * worry about the following bits in the system:
 *
 * o CR4.OSFXSR		This bit is used to indicate that the OS understands
 *			and supports the execution of the fxsave and fxrstor
 *			instructions. This bit is required to be set to
 *			enable the use of the SSE->SSE4 instructions.
 *
 * o CR4.OSXMMEXCPT	This bit is used to indicate that the OS can
 *			understand and take a SIMD floating point exception
 *			(#XM). This bit is always enabled by the system.
 *
 * o CR4.OSXSAVE	This bit is used to indicate that the OS understands
 *			and supports the execution of the xsave and xrstor
 *			family of instructions. This bit is required to use
 *			any of the AVX and newer feature sets.
 *
 * Because all supported processors are 64-bit, they'll always support the
 * XMM extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in
 * boot. CR4.OSXSAVE will be enabled and used whenever xsave is reported in
 * cpuid.
 *
 * %xcr0 is used to manage the behavior of the xsave feature set and is only
 * present on the system if xsave is supported. %xcr0 is read and written
 * via the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a
 * different component of the xsave state and controls whether or not that
 * information is saved and restored. For newer feature sets like AVX and
 * MPX, it also controls whether or not the corresponding instructions can
 * be executed (much like CR4.OSFXSR does for the SSE feature sets).
 *
 * Everything in %xcr0 covers features available to user land. There is also
 * the IA32_XSS MSR, which is used to control supervisor-only features that
 * are still part of the xsave state. Bits that can be set in %xcr0 are
 * reserved in IA32_XSS and vice versa. This is an important property that
 * is particularly relevant to how the xsave instructions operate.
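 *
 * As a purely illustrative sketch (not code from this file), %xcr0 is XCR
 * number 0: xgetbv takes the XCR number in %ecx and returns its value in
 * %edx:%eax, while xsetbv writes it from the same register pair:
 *
 *	uint64_t
 *	example_get_xcr0(void)
 *	{
 *		uint32_t lo, hi;
 *
 *		__asm__ __volatile__("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
 *		return ((uint64_t)hi << 32 | lo);
 *	}
 *
 * Within the kernel proper, the get_xcr() helper (used later in this file
 * with XFEATURE_ENABLED_MASK) is the interface actually used for this.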
 *
 * Save Mechanisms
 * ---------------
 *
 * When switching between running threads the FPU state needs to be saved
 * and restored by the OS. If this state was not saved, users would
 * rightfully complain about corrupt state. There are three mechanisms that
 * exist on the processor for saving and restoring these state images:
 *
 * o fsave
 * o fxsave
 * o xsave
 *
 * fsave saves and restores only the x87 FPU and is the oldest of these
 * mechanisms. This mechanism is never used in the kernel today because we
 * are always running on systems that support fxsave.
 *
 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
 * state to be saved and restored to and from a struct fxsave_state. This is
 * the default mechanism that is used to save and restore the FPU on amd64.
 * An important aspect of fxsave that was different from the original i386
 * fsave mechanism is that restoring FPU state with pending exceptions will
 * not generate an exception; it will be deferred to the next use of the
 * FPU.
 *
 * The final and by far the most complex mechanism is that of the xsave set.
 * xsave allows for saving and restoring all of the traditional x86 pieces
 * (x87 and SSE), while allowing for extensions that will save the %ymm,
 * %zmm, etc. registers.
 *
 * Data is saved and restored into and out of a struct xsave_state. The
 * first part of the struct xsave_state is equivalent to the struct
 * fxsave_state. After that, there is a header which is used to describe the
 * remaining portions of the state. The header is a 64-byte value of which
 * the first two uint64_t values are defined and the rest are reserved and
 * must be zero. The first uint64_t is the xstate_bv member. This describes
 * which values in the xsave_state are actually valid and present. This is
 * updated on a save and used on restore. The second member is the xcomp_bv
 * member. Its last bit determines whether or not a compressed version of
 * the structure is used.
 *
 * When the uncompressed structure is used (currently the only format we
 * support), then each state component is at a fixed offset in the
 * structure, even if it is not being used. For example, if you only saved
 * the AVX related state, but did not save the MPX related state, the offset
 * would not change for any component. With the compressed format,
 * components that aren't used are all elided (though the x87 and SSE state
 * are always there).
 *
 * Unlike fxsave which saves all state, the xsave family does not always
 * save and restore all the state that could be covered by the xsave_state.
 * The instructions all take an argument which is a mask of what to
 * consider. This is the same mask that will be used in the xstate_bv vector
 * and it is also the same values that are present in %xcr0 and IA32_XSS,
 * though IA32_XSS is only considered with the xsaves and xrstors
 * instructions.
 *
 * When a save or restore is requested, a bitwise AND is performed between
 * the requested bits and those that have been enabled in %xcr0. Only the
 * bits that match are then saved or restored. Others will be silently
 * ignored by the processor. This idea is used often in the OS. We will
 * always request that we save and restore all of the state, but only those
 * portions that are actually enabled in %xcr0 will be touched.
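 *
 * As an illustrative sketch (simplified, not the actual definitions used by
 * this file), the uncompressed layout and the effective mask work roughly
 * like this:
 *
 *	struct xsave_state {			sketch only
 *		struct fxsave_state xs_fxsave;	legacy x87/SSE area, 512 bytes
 *		uint64_t xs_xstate_bv;		which components hold valid data
 *		uint64_t xs_xcomp_bv;		bit 63 selects the compressed form
 *		...48 reserved bytes...		rest of the 64-byte header
 *		...extended components...	at fixed offsets when uncompressed
 *	};
 *
 *	effective = requested_mask & xcr0;		xsave/xrstor and friends
 *	effective |= requested_mask & ia32_xss;		xsaves/xrstors only
 *
 * Only the components named in 'effective' are actually written or loaded.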
220 * 221 * If a feature has been asked to be restored that is not set in the xstate_bv 222 * feature vector of the save state, then it will be set to its initial state by 223 * the processor (usually zeros). Also, when asked to save state, the processor 224 * may not write out data that is in its initial state as an optimization. This 225 * optimization only applies to saving data and not to restoring data. 226 * 227 * There are a few different variants of the xsave and xrstor instruction. They 228 * are: 229 * 230 * o xsave This is the original save instruction. It will save all of the 231 * requested data in the xsave state structure. It only saves data 232 * in the uncompressed (xcomp_bv[63] is zero) format. It may be 233 * executed at all privilege levels. 234 * 235 * o xrstor This is the original restore instruction. It will restore all of 236 * the requested data. The xrstor function can handle both the 237 * compressed and uncompressed formats. It may be executed at all 238 * privilege levels. 239 * 240 * o xsaveopt This is a variant of the xsave instruction that employs 241 * optimizations to try and only write out state that has been 242 * modified since the last time an xrstor instruction was called. 243 * The processor tracks a tuple of information about the last 244 * xrstor and tries to ensure that the same buffer is being used 245 * when this optimization is being used. However, because of the 246 * way that it tracks the xrstor buffer based on the address of it, 247 * it is not suitable for use if that buffer can be easily reused. 248 * The most common case is trying to save data to the stack in 249 * rtld. It may be executed at all privilege levels. 250 * 251 * o xsavec This is a variant of the xsave instruction that writes out the 252 * compressed form of the xsave_state. Otherwise it behaves as 253 * xsave. It may be executed at all privilege levels. 254 * 255 * o xsaves This is a variant of the xsave instruction. It is similar to 256 * xsavec in that it always writes the compressed form of the 257 * buffer. Unlike all the other forms, this instruction looks at 258 * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine 259 * what to save and restore. xsaves also implements the same 260 * optimization that xsaveopt does around modified pieces. User 261 * land may not execute the instruction. 262 * 263 * o xrstors This is a variant of the xrstor instruction. Similar to xsaves 264 * it can save and restore both the user and privileged states. 265 * Unlike xrstor it can only operate on the compressed form. 266 * User land may not execute the instruction. 267 * 268 * Based on all of these, the kernel has a precedence for what it will use. 269 * Basically, xsaves (not supported) is preferred to xsaveopt, which is 270 * preferred to xsave. A similar scheme is used when informing rtld (more later) 271 * about what it should use. xsavec is preferred to xsave. xsaveopt is not 272 * recommended due to the modified optimization not being appropriate for this 273 * use. 274 * 275 * Finally, there is one last gotcha with the xsave state. Importantly some AMD 276 * processors did not always save and restore some of the FPU exception state in 277 * some cases like Intel did. In those cases the OS will make up for this fact 278 * itself. 279 * 280 * FPU Initialization 281 * ------------------ 282 * 283 * One difference with the FPU registers is that not all threads have FPU state, 284 * only those that have an lwp. 
 * Generally this means kernel threads, which all share p0 and its lwp, do
 * not have FPU state, though there are definitely exceptions such as
 * kcfpoold. In the rest of this discussion we'll use thread and lwp
 * interchangeably; just think of thread as meaning a thread that has an
 * lwp.
 *
 * Each lwp has its FPU state allocated in its pcb (process control block).
 * The actual storage comes from the fpsave_cachep kmem cache. This cache is
 * sized dynamically at start up based on the save mechanism that we're
 * using and the amount of memory required for it. This is dynamic because
 * the xsave_state size varies based on the supported feature set.
 *
 * The hardware side of the FPU is initialized early in boot before we mount
 * the root file system. This is effectively done in fpu_probe(). This is
 * where we make the final decision about which save and restore mechanisms
 * we should use, create the fpsave_cachep kmem cache, and initialize a
 * number of function pointers that implement the saving and restoring
 * logic.
 *
 * The thread/lwp side is a little more involved. There are two different
 * things that we need to concern ourselves with. The first is how the FPU
 * resources are allocated and the second is how the FPU state is
 * initialized for a given lwp.
 *
 * We allocate the FPU save state from our kmem cache as part of
 * lwp_fp_init(). This is called unconditionally by the system as part of
 * creating an LWP.
 *
 * There are three different initialization paths that we deal with. The
 * first is when we are executing a new process. As part of exec all of the
 * register state is reset. The exec case is particularly important because
 * init is born like Athena, sprouting from the head of the kernel, without
 * any true parent to fork from. The second is used whenever we fork or
 * create a new lwp. The third is to deal with special lwps like the agent
 * lwp.
 *
 * During exec, we will call fp_exec() which will initialize and set up the
 * FPU state for the process. That will fill in the initial state for the
 * FPU and also set that state in the FPU itself. As part of fp_exec() we
 * also install a thread context operations vector that takes care of
 * dealing with the saving and restoring of the FPU. These context handlers
 * will also be called whenever an lwp is created or forked. In those cases,
 * to initialize the FPU we will call fp_new_lwp(). Like fp_exec(),
 * fp_new_lwp() will install a context operations vector for the new thread.
 *
 * Next we'll end up in the context operation fp_new_lwp(). This saves the
 * current thread's state, initializes the new thread's state, and copies
 * over the relevant parts of the originating thread's state. It's at this
 * point that we also install the FPU context operations into the new
 * thread, which ensures that all future threads that are descendants of the
 * current one get the thread context operations (unless they call exec).
 *
 * To deal with some things like the agent lwp, we double check the state of
 * the FPU in sys_rtt_common() to make sure that it has been enabled before
 * returning to user land. In general, this path should be rare, but it's
 * useful for the odd lwp here and there.
 *
 * The FPU state will remain valid most of the time. There are times that
 * the state will be rewritten.
 * For example, in restorecontext, due to /proc, or when the lwp calls
 * exec(). Whether the context is being freed or we are resetting the state,
 * we will call fp_free() to disable the FPU and our context.
 *
 * Finally, when the lwp is destroyed, it will actually destroy and free the
 * FPU state by calling fp_lwp_cleanup().
 *
 * Kernel FPU Multiplexing
 * -----------------------
 *
 * Just as the kernel has to maintain all of the general purpose registers
 * when switching between scheduled threads, the same is true of the FPU
 * registers.
 *
 * When a thread has FPU state, it also has a set of context operations
 * installed. These context operations take care of making sure that the FPU
 * is properly saved and restored during a context switch (fpsave_ctxt and
 * fprestore_ctxt respectively). This means that the current implementation
 * of the FPU is 'eager': when a thread is running, the CPU will have its
 * FPU state loaded. While this is always true when executing in userland,
 * there are a few cases where this is not true in the kernel.
 *
 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore
 * was employed. This meant that the FPU would be saved on a context switch
 * and the CR0.TS bit would be set. When a thread next tried to use the FPU,
 * it would then take a #NM trap, at which point we would restore the FPU
 * from the save area and return to user land. Given the frequency of use of
 * the FPU alone by libc, there's no point returning to user land just to
 * trap again.
 *
 * There are a few cases though where the FPU state may need to be changed
 * for a thread on its behalf. The most notable cases are processes using
 * /proc, restorecontext, forking, etc. In all of these cases the kernel
 * will force a thread's FPU state to be saved into the PCB through the
 * fp_save() function. Whenever the FPU is saved, the FPU_VALID flag is set
 * on the pcb. This indicates that the save state holds currently valid
 * data. As a side effect of this, CR0.TS will be set. To make sure that all
 * of the state is updated before returning to user land, in these cases, we
 * set a flag on the PCB that says the FPU needs to be updated. This will
 * make sure that we take the slow path out of a system call to fix things
 * up for the thread. Because this is a rather rare case, effectively
 * setting the equivalent of t_postsys is acceptable.
 *
 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 * Generally this means it will be cleared immediately by the new thread
 * that is running in a context switch. However, this isn't the case for
 * kernel threads. They currently operate with CR0.TS set as no kernel state
 * is restored for them. This means that using the FPU will cause a #NM and
 * panic.
 *
 * The FPU_VALID flag on the currently executing thread's pcb is meant to
 * track what the value of CR0.TS should be. If it is set, then CR0.TS will
 * be set. However, because we eagerly restore, the only time that CR0.TS
 * should be set for a non-kernel thread is during operations where it will
 * be cleared before returning to user land and, importantly, the only data
 * that is in it is its own.
 *
 * Kernel FPU Usage
 * ----------------
 *
 * Traditionally the kernel never used the FPU since it had no need for
 * floating point operations.
 * However, modern FPU hardware supports a variety of SIMD extensions which
 * can speed up code such as parity calculations or encryption.
 *
 * To allow the kernel to take advantage of these features, the
 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
 * around any usage of the FPU by the kernel to ensure that user-level
 * context is properly saved/restored, as well as to properly set up the FPU
 * for use by the kernel. There are a variety of ways this wrapping can be
 * used, as discussed below.
 *
 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
 * operations, the kernel_fpu_alloc() function should be used to allocate a
 * kfpu_state_t structure that is used to save/restore the thread's kernel
 * FPU state. This structure is not tied to any thread. That is, different
 * threads can reuse the same kfpu_state_t structure, although not
 * concurrently. A kfpu_state_t structure is freed by the kernel_fpu_free()
 * function.
 *
 * In some cases, the kernel may need to use the FPU for a short operation
 * without the overhead of managing a kfpu_state_t structure and without
 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
 * parameter. This indicates that there is no kfpu_state_t. When used this
 * way, kernel preemption should be disabled by the caller
 * (kpreempt_disable) before calling kernel_fpu_begin(), and re-enabled
 * after calling kernel_fpu_end(). For this usage, it is important to limit
 * the kernel's FPU use to short operations. The tradeoff between using the
 * FPU without a kfpu_state_t structure vs. the overhead of allowing a
 * context switch while using the FPU should be carefully considered on a
 * case by case basis.
 *
 * In other cases, kernel threads have an LWP, but never execute in user
 * space. In this situation, the LWP's pcb_fpu area can be used to
 * save/restore the kernel's FPU state if the thread is context switched,
 * instead of having to allocate and manage a kfpu_state_t structure. The
 * KFPU_USE_LWP bit in the kernel_fpu_begin() and kernel_fpu_end() flags
 * parameter is used to enable this behavior. It is the caller's
 * responsibility to ensure that this is only used for a kernel thread which
 * never executes in user space.
 *
 * FPU Exceptions
 * --------------
 *
 * Certain operations can cause the kernel to take traps due to FPU
 * activity. Generally these events will cause a user process to receive a
 * SIGFPE and, if the kernel receives it in kernel context, we will die.
 * Traditionally the #NM (Device Not Available / No Math) exception
 * generated by CR0.TS would have caused us to restore the FPU. Now it is a
 * fatal event regardless of whether or not user land causes it.
 *
 * While there are some cases where the kernel uses the FPU, it is up to the
 * kernel to use the FPU in a way such that it cannot receive a trap or to
 * use the appropriate trap protection mechanisms.
 *
 * Hypervisors
 * -----------
 *
 * When providing support for hypervisors, things are a little bit more
 * complicated because the FPU is not virtualized at all. This means that
 * they need to save and restore the FPU and %xcr0 across entry and exit to
 * the guest. To facilitate this, we provide a series of APIs in
 * <sys/hma.h>.
These 457 * allow us to use the full native state to make sure that we are always saving 458 * and restoring the full FPU that the host sees, even when the guest is using a 459 * subset. 460 * 461 * One tricky aspect of this is that the guest may be using a subset of %xcr0 462 * and therefore changing our %xcr0 on the fly. It is vital that when we're 463 * saving and restoring the FPU that we always use the largest %xcr0 contents 464 * otherwise we will end up leaving behind data in it. 465 * 466 * ELF PLT Support 467 * --------------- 468 * 469 * rtld has to preserve a subset of the FPU when it is saving and restoring 470 * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for 471 * more information. As a result, we set up an aux vector that contains 472 * information about what save and restore mechanisms it should be using and 473 * the sizing thereof based on what the kernel supports. This is passed down in 474 * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is 475 * initialized in fpu_subr.c. 476 */ 477 478 kmem_cache_t *fpsave_cachep; 479 480 /* Legacy fxsave layout + xsave header + ymm */ 481 #define AVX_XSAVE_SIZE (512 + 64 + 256) 482 483 /* 484 * Various sanity checks. 485 */ 486 CTASSERT(sizeof (struct fxsave_state) == 512); 487 CTASSERT(sizeof (struct fnsave_state) == 108); 488 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0); 489 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE); 490 491 /* 492 * This structure is the x86 implementation of the kernel FPU that is defined in 493 * uts/common/sys/kfpu.h. 494 */ 495 496 typedef enum kfpu_flags { 497 /* 498 * This indicates that the save state has initial FPU data. 499 */ 500 KFPU_F_INITIALIZED = 0x01 501 } kfpu_flags_t; 502 503 struct kfpu_state { 504 fpu_ctx_t kfpu_ctx; 505 kfpu_flags_t kfpu_flags; 506 kthread_t *kfpu_curthread; 507 }; 508 509 /* 510 * Initial kfpu state for SSE/SSE2 used by fpinit() 511 */ 512 const struct fxsave_state sse_initial = { 513 FPU_CW_INIT, /* fx_fcw */ 514 0, /* fx_fsw */ 515 0, /* fx_fctw */ 516 0, /* fx_fop */ 517 0, /* fx_rip */ 518 0, /* fx_rdp */ 519 SSE_MXCSR_INIT /* fx_mxcsr */ 520 /* rest of structure is zero */ 521 }; 522 523 /* 524 * Initial kfpu state for AVX used by fpinit() 525 */ 526 const struct xsave_state avx_initial = { 527 /* 528 * The definition below needs to be identical with sse_initial 529 * defined above. 530 */ 531 { 532 FPU_CW_INIT, /* fx_fcw */ 533 0, /* fx_fsw */ 534 0, /* fx_fctw */ 535 0, /* fx_fop */ 536 0, /* fx_rip */ 537 0, /* fx_rdp */ 538 SSE_MXCSR_INIT /* fx_mxcsr */ 539 /* rest of structure is zero */ 540 }, 541 /* 542 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid, 543 * and CPU should initialize XMM/YMM. 544 */ 545 1, 546 0 /* xs_xcomp_bv */ 547 /* rest of structure is zero */ 548 }; 549 550 /* 551 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid 552 * the #gp exception caused by setting unsupported bits in the 553 * MXCSR register 554 */ 555 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT; 556 557 /* 558 * Initial kfpu state for x87 used by fpinit() 559 */ 560 const struct fnsave_state x87_initial = { 561 FPU_CW_INIT, /* f_fcw */ 562 0, /* __f_ign0 */ 563 0, /* f_fsw */ 564 0, /* __f_ign1 */ 565 0xffff, /* f_ftw */ 566 /* rest of structure is zero */ 567 }; 568 569 /* 570 * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we 571 * have an XSAVE-capable chip in fpu_probe. 
572 */ 573 void (*fpsave_ctxt)(void *) = fpxsave_ctxt; 574 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt; 575 576 /* 577 * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable. 578 */ 579 void (*xsavep)(struct xsave_state *, uint64_t) = xsave; 580 581 static int fpe_sicode(uint_t); 582 static int fpe_simd_sicode(uint_t); 583 static void fp_new_lwp(void *, void *); 584 static void fp_free_ctx(void *, int); 585 586 static struct ctxop * 587 fp_ctxop_allocate(struct fpu_ctx *fp) 588 { 589 const struct ctxop_template tpl = { 590 .ct_rev = CTXOP_TPL_REV, 591 .ct_save = fpsave_ctxt, 592 .ct_restore = fprestore_ctxt, 593 .ct_fork = fp_new_lwp, 594 .ct_lwp_create = fp_new_lwp, 595 .ct_free = fp_free_ctx, 596 }; 597 return (ctxop_allocate(&tpl, fp)); 598 } 599 600 /* 601 * Copy the state of parent lwp's floating point context into the new lwp. 602 * Invoked for both fork() and lwp_create(). 603 * 604 * Note that we inherit -only- the control state (e.g. exception masks, 605 * rounding, precision control, etc.); the FPU registers are otherwise 606 * reset to their initial state. 607 */ 608 static void 609 fp_new_lwp(void *parent, void *child) 610 { 611 kthread_id_t t = parent, ct = child; 612 struct fpu_ctx *fp; /* parent fpu context */ 613 struct fpu_ctx *cfp; /* new fpu context */ 614 struct fxsave_state *fx, *cfx; 615 struct xsave_state *cxs; 616 617 ASSERT(fp_kind != FP_NO); 618 619 fp = &t->t_lwp->lwp_pcb.pcb_fpu; 620 cfp = &ct->t_lwp->lwp_pcb.pcb_fpu; 621 622 /* 623 * If the parent FPU state is still in the FPU hw then save it; 624 * conveniently, fp_save() already does this for us nicely. 625 */ 626 fp_save(fp); 627 628 cfp->fpu_flags = FPU_EN | FPU_VALID; 629 cfp->fpu_regs.kfpu_status = 0; 630 cfp->fpu_regs.kfpu_xstatus = 0; 631 632 /* 633 * Make sure that the child's FPU is cleaned up and made ready for user 634 * land. 635 */ 636 PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb); 637 638 switch (fp_save_mech) { 639 case FP_FXSAVE: 640 fx = fp->fpu_regs.kfpu_u.kfpu_fx; 641 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx; 642 bcopy(&sse_initial, cfx, sizeof (*cfx)); 643 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; 644 cfx->fx_fcw = fx->fx_fcw; 645 break; 646 647 case FP_XSAVE: 648 cfp->fpu_xsave_mask = fp->fpu_xsave_mask; 649 650 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL); 651 652 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; 653 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs; 654 cfx = &cxs->xs_fxsave; 655 656 bcopy(&avx_initial, cxs, sizeof (*cxs)); 657 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; 658 cfx->fx_fcw = fx->fx_fcw; 659 cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) & 660 XFEATURE_FP_INITIAL); 661 break; 662 default: 663 panic("Invalid fp_save_mech"); 664 /*NOTREACHED*/ 665 } 666 667 /* 668 * Mark that both the parent and child need to have the FPU cleaned up 669 * before returning to user land. 670 */ 671 672 ctxop_attach(ct, fp_ctxop_allocate(cfp)); 673 } 674 675 /* 676 * Free any state associated with floating point context. 
677 * Fp_free can be called in three cases: 678 * 1) from reaper -> thread_free -> freectx-> fp_free 679 * fp context belongs to a thread on deathrow 680 * nothing to do, thread will never be resumed 681 * thread calling ctxfree is reaper 682 * 683 * 2) from exec -> freectx -> fp_free 684 * fp context belongs to the current thread 685 * must disable fpu, thread calling ctxfree is curthread 686 * 687 * 3) from restorecontext -> setfpregs -> fp_free 688 * we have a modified context in the memory (lwp->pcb_fpu) 689 * disable fpu and release the fp context for the CPU 690 * 691 */ 692 void 693 fp_free(struct fpu_ctx *fp) 694 { 695 ASSERT(fp_kind != FP_NO); 696 697 if (fp->fpu_flags & FPU_VALID) 698 return; 699 700 kpreempt_disable(); 701 /* 702 * We want to do fpsave rather than fpdisable so that we can 703 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit 704 */ 705 fp->fpu_flags |= FPU_VALID; 706 /* If for current thread disable FP to track FPU_VALID */ 707 if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) { 708 /* Clear errors if any to prevent frstor from complaining */ 709 (void) fperr_reset(); 710 if (fp_kind & __FP_SSE) 711 (void) fpxerr_reset(); 712 fpdisable(); 713 } 714 kpreempt_enable(); 715 } 716 717 /* 718 * Wrapper for freectx to make the types line up for fp_free() 719 */ 720 static void 721 fp_free_ctx(void *arg, int isexec __unused) 722 { 723 fp_free((struct fpu_ctx *)arg); 724 } 725 726 /* 727 * Store the floating point state and disable the floating point unit. 728 */ 729 void 730 fp_save(struct fpu_ctx *fp) 731 { 732 ASSERT(fp_kind != FP_NO); 733 734 kpreempt_disable(); 735 if (!fp || fp->fpu_flags & FPU_VALID || 736 (fp->fpu_flags & FPU_EN) == 0) { 737 kpreempt_enable(); 738 return; 739 } 740 ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu); 741 742 switch (fp_save_mech) { 743 case FP_FXSAVE: 744 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx); 745 break; 746 747 case FP_XSAVE: 748 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); 749 break; 750 default: 751 panic("Invalid fp_save_mech"); 752 /*NOTREACHED*/ 753 } 754 755 fp->fpu_flags |= FPU_VALID; 756 757 /* 758 * We save the FPU as part of forking, execing, modifications via /proc, 759 * restorecontext, etc. As such, we need to make sure that we return to 760 * userland with valid state in the FPU. If we're context switched out 761 * before we hit sys_rtt_common() we'll end up having restored the FPU 762 * as part of the context ops operations. The restore logic always makes 763 * sure that FPU_VALID is set before doing a restore so we don't restore 764 * it a second time. 765 */ 766 PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb); 767 768 kpreempt_enable(); 769 } 770 771 /* 772 * Restore the FPU context for the thread: 773 * The possibilities are: 774 * 1. No active FPU context: Load the new context into the FPU hw 775 * and enable the FPU. 776 */ 777 void 778 fp_restore(struct fpu_ctx *fp) 779 { 780 switch (fp_save_mech) { 781 case FP_FXSAVE: 782 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx); 783 break; 784 785 case FP_XSAVE: 786 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); 787 break; 788 default: 789 panic("Invalid fp_save_mech"); 790 /*NOTREACHED*/ 791 } 792 793 fp->fpu_flags &= ~FPU_VALID; 794 } 795 796 /* 797 * Reset the FPU such that it is in a valid state for a new thread that is 798 * coming out of exec. The FPU will be in a usable state at this point. 
At this 799 * point we know that the FPU state has already been allocated and if this 800 * wasn't an init process, then it will have had fp_free() previously called. 801 */ 802 void 803 fp_exec(void) 804 { 805 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 806 807 if (fp_save_mech == FP_XSAVE) { 808 fp->fpu_xsave_mask = XFEATURE_FP_ALL; 809 } 810 811 struct ctxop *ctx = fp_ctxop_allocate(fp); 812 /* 813 * Make sure that we're not preempted in the middle of initializing the 814 * FPU on CPU. 815 */ 816 kpreempt_disable(); 817 ctxop_attach(curthread, ctx); 818 fpinit(); 819 fp->fpu_flags = FPU_EN; 820 kpreempt_enable(); 821 } 822 823 824 /* 825 * Seeds the initial state for the current thread. The possibilities are: 826 * 1. Another process has modified the FPU state before we have done any 827 * initialization: Load the FPU state from the LWP state. 828 * 2. The FPU state has not been externally modified: Load a clean state. 829 */ 830 void 831 fp_seed(void) 832 { 833 struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 834 835 ASSERT(curthread->t_preempt >= 1); 836 ASSERT((fp->fpu_flags & FPU_EN) == 0); 837 838 /* 839 * Always initialize a new context and initialize the hardware. 840 */ 841 if (fp_save_mech == FP_XSAVE) { 842 fp->fpu_xsave_mask = XFEATURE_FP_ALL; 843 } 844 845 ctxop_attach(curthread, fp_ctxop_allocate(fp)); 846 fpinit(); 847 848 /* 849 * If FPU_VALID is set, it means someone has modified registers via 850 * /proc. In this case, restore the current lwp's state. 851 */ 852 if (fp->fpu_flags & FPU_VALID) 853 fp_restore(fp); 854 855 ASSERT((fp->fpu_flags & FPU_VALID) == 0); 856 fp->fpu_flags = FPU_EN; 857 } 858 859 /* 860 * When using xsave/xrstor, these three functions are used by the lwp code to 861 * manage the memory for the xsave area. 862 */ 863 void 864 fp_lwp_init(struct _klwp *lwp) 865 { 866 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; 867 868 /* 869 * We keep a copy of the pointer in lwp_fpu so that we can restore the 870 * value in forklwp() after we duplicate the parent's LWP state. 871 */ 872 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = 873 kmem_cache_alloc(fpsave_cachep, KM_SLEEP); 874 875 if (fp_save_mech == FP_XSAVE) { 876 /* 877 * 878 * We bzero since the fpinit() code path will only 879 * partially initialize the xsave area using avx_inital. 880 */ 881 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state)); 882 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size()); 883 } 884 } 885 886 void 887 fp_lwp_cleanup(struct _klwp *lwp) 888 { 889 struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; 890 891 if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) { 892 kmem_cache_free(fpsave_cachep, 893 fp->fpu_regs.kfpu_u.kfpu_generic); 894 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL; 895 } 896 } 897 898 /* 899 * Called during the process of forklwp(). The kfpu_u pointer will have been 900 * overwritten while copying the parent's LWP structure. We have a valid copy 901 * stashed in the child's lwp_fpu which we use to restore the correct value. 
902 */ 903 void 904 fp_lwp_dup(struct _klwp *lwp) 905 { 906 void *xp = lwp->lwp_fpu; 907 size_t sz; 908 909 switch (fp_save_mech) { 910 case FP_FXSAVE: 911 sz = sizeof (struct fxsave_state); 912 break; 913 case FP_XSAVE: 914 sz = cpuid_get_xsave_size(); 915 break; 916 default: 917 panic("Invalid fp_save_mech"); 918 /*NOTREACHED*/ 919 } 920 921 /* copy the parent's values into the new lwp's struct */ 922 bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz); 923 /* now restore the pointer */ 924 lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp; 925 } 926 927 /* 928 * Handle a processor extension error fault 929 * Returns non zero for error. 930 */ 931 932 /*ARGSUSED*/ 933 int 934 fpexterrflt(struct regs *rp) 935 { 936 uint32_t fpcw, fpsw; 937 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 938 939 ASSERT(fp_kind != FP_NO); 940 941 /* 942 * Now we can enable the interrupts. 943 * (NOTE: x87 fp exceptions come thru interrupt gate) 944 */ 945 sti(); 946 947 if (!fpu_exists) 948 return (FPE_FLTINV); 949 950 /* 951 * Do an unconditional save of the FP state. If it's dirty (TS=0), 952 * it'll be saved into the fpu context area passed in (that of the 953 * current thread). If it's not dirty (it may not be, due to 954 * an intervening save due to a context switch between the sti(), 955 * above and here, then it's safe to just use the stored values in 956 * the context save area to determine the cause of the fault. 957 */ 958 fp_save(fp); 959 960 /* clear exception flags in saved state, as if by fnclex */ 961 switch (fp_save_mech) { 962 case FP_FXSAVE: 963 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; 964 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw; 965 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS; 966 break; 967 968 case FP_XSAVE: 969 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; 970 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw; 971 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS; 972 /* 973 * Always set LEGACY_FP as it may have been cleared by XSAVE 974 * instruction 975 */ 976 fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP; 977 break; 978 default: 979 panic("Invalid fp_save_mech"); 980 /*NOTREACHED*/ 981 } 982 983 fp->fpu_regs.kfpu_status = fpsw; 984 985 if ((fpsw & FPS_ES) == 0) 986 return (0); /* No exception */ 987 988 /* 989 * "and" the exception flags with the complement of the mask 990 * bits to determine which exception occurred 991 */ 992 return (fpe_sicode(fpsw & ~fpcw & 0x3f)); 993 } 994 995 /* 996 * Handle an SSE/SSE2 precise exception. 997 * Returns a non-zero sicode for error. 998 */ 999 /*ARGSUSED*/ 1000 int 1001 fpsimderrflt(struct regs *rp) 1002 { 1003 uint32_t mxcsr, xmask; 1004 fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; 1005 1006 ASSERT(fp_kind & __FP_SSE); 1007 1008 /* 1009 * NOTE: Interrupts are disabled during execution of this 1010 * function. They are enabled by the caller in trap.c. 1011 */ 1012 1013 /* 1014 * The only way we could have gotten here if there is no FP unit 1015 * is via a user executing an INT $19 instruction, so there is 1016 * no fault in that case. 1017 */ 1018 if (!fpu_exists) 1019 return (0); 1020 1021 /* 1022 * Do an unconditional save of the FP state. If it's dirty (TS=0), 1023 * it'll be saved into the fpu context area passed in (that of the 1024 * current thread). If it's not dirty, then it's safe to just use 1025 * the stored values in the context save area to determine the 1026 * cause of the fault. 
1027 */ 1028 fp_save(fp); /* save the FPU state */ 1029 1030 if (fp_save_mech == FP_XSAVE) { 1031 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr; 1032 fp->fpu_regs.kfpu_status = 1033 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; 1034 } else { 1035 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr; 1036 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; 1037 } 1038 fp->fpu_regs.kfpu_xstatus = mxcsr; 1039 1040 /* 1041 * compute the mask that determines which conditions can cause 1042 * a #xm exception, and use this to clean the status bits so that 1043 * we can identify the true cause of this one. 1044 */ 1045 xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS; 1046 return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask)); 1047 } 1048 1049 /* 1050 * In the unlikely event that someone is relying on this subcode being 1051 * FPE_FLTILL for denormalize exceptions, it can always be patched back 1052 * again to restore old behaviour. 1053 */ 1054 int fpe_fltden = FPE_FLTDEN; 1055 1056 /* 1057 * Map from the FPU status word to the FP exception si_code. 1058 */ 1059 static int 1060 fpe_sicode(uint_t sw) 1061 { 1062 if (sw & FPS_IE) 1063 return (FPE_FLTINV); 1064 if (sw & FPS_ZE) 1065 return (FPE_FLTDIV); 1066 if (sw & FPS_DE) 1067 return (fpe_fltden); 1068 if (sw & FPS_OE) 1069 return (FPE_FLTOVF); 1070 if (sw & FPS_UE) 1071 return (FPE_FLTUND); 1072 if (sw & FPS_PE) 1073 return (FPE_FLTRES); 1074 return (FPE_FLTINV); /* default si_code for other exceptions */ 1075 } 1076 1077 /* 1078 * Map from the SSE status word to the FP exception si_code. 1079 */ 1080 static int 1081 fpe_simd_sicode(uint_t sw) 1082 { 1083 if (sw & SSE_IE) 1084 return (FPE_FLTINV); 1085 if (sw & SSE_ZE) 1086 return (FPE_FLTDIV); 1087 if (sw & SSE_DE) 1088 return (FPE_FLTDEN); 1089 if (sw & SSE_OE) 1090 return (FPE_FLTOVF); 1091 if (sw & SSE_UE) 1092 return (FPE_FLTUND); 1093 if (sw & SSE_PE) 1094 return (FPE_FLTRES); 1095 return (FPE_FLTINV); /* default si_code for other exceptions */ 1096 } 1097 1098 /* 1099 * This routine is invoked as part of libc's __fpstart implementation 1100 * via sysi86(2). 1101 * 1102 * It may be called -before- any context has been assigned in which case 1103 * we try and avoid touching the hardware. Or it may be invoked well 1104 * after the context has been assigned and fiddled with, in which case 1105 * just tweak it directly. 1106 */ 1107 void 1108 fpsetcw(uint16_t fcw, uint32_t mxcsr) 1109 { 1110 struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1111 struct fxsave_state *fx; 1112 1113 if (!fpu_exists || fp_kind == FP_NO) 1114 return; 1115 1116 if ((fp->fpu_flags & FPU_EN) == 0) { 1117 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) { 1118 /* 1119 * Common case. Floating point unit not yet 1120 * enabled, and kernel already intends to initialize 1121 * the hardware the way the caller wants. 1122 */ 1123 return; 1124 } 1125 /* 1126 * Hmm. Userland wants a different default. 1127 * Do a fake "first trap" to establish the context, then 1128 * handle as if we already had a context before we came in. 1129 */ 1130 kpreempt_disable(); 1131 fp_seed(); 1132 kpreempt_enable(); 1133 } 1134 1135 /* 1136 * Ensure that the current hardware state is flushed back to the 1137 * pcb, then modify that copy. Next use of the fp will 1138 * restore the context. 
1139 */ 1140 fp_save(fp); 1141 1142 switch (fp_save_mech) { 1143 case FP_FXSAVE: 1144 fx = fp->fpu_regs.kfpu_u.kfpu_fx; 1145 fx->fx_fcw = fcw; 1146 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; 1147 break; 1148 1149 case FP_XSAVE: 1150 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; 1151 fx->fx_fcw = fcw; 1152 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; 1153 /* 1154 * Always set LEGACY_FP as it may have been cleared by XSAVE 1155 * instruction 1156 */ 1157 fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP; 1158 break; 1159 default: 1160 panic("Invalid fp_save_mech"); 1161 /*NOTREACHED*/ 1162 } 1163 } 1164 1165 static void 1166 kernel_fpu_fpstate_init(kfpu_state_t *kfpu) 1167 { 1168 struct xsave_state *xs; 1169 1170 switch (fp_save_mech) { 1171 case FP_FXSAVE: 1172 bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx, 1173 sizeof (struct fxsave_state)); 1174 kfpu->kfpu_ctx.fpu_xsave_mask = 0; 1175 break; 1176 case FP_XSAVE: 1177 xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs; 1178 bzero(xs, cpuid_get_xsave_size()); 1179 bcopy(&avx_initial, xs, sizeof (*xs)); 1180 xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; 1181 kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL; 1182 break; 1183 default: 1184 panic("invalid fp_save_mech"); 1185 } 1186 1187 /* 1188 * Set the corresponding flags that the system expects on the FPU state 1189 * to indicate that this is our state. The FPU_EN flag is required to 1190 * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly 1191 * not set below as it represents that this state is being suppressed 1192 * by the kernel. 1193 */ 1194 kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID; 1195 kfpu->kfpu_flags |= KFPU_F_INITIALIZED; 1196 } 1197 1198 kfpu_state_t * 1199 kernel_fpu_alloc(int kmflags) 1200 { 1201 kfpu_state_t *kfpu; 1202 1203 if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) { 1204 return (NULL); 1205 } 1206 1207 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic = 1208 kmem_cache_alloc(fpsave_cachep, kmflags); 1209 if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) { 1210 kmem_free(kfpu, sizeof (kfpu_state_t)); 1211 return (NULL); 1212 } 1213 1214 kernel_fpu_fpstate_init(kfpu); 1215 1216 return (kfpu); 1217 } 1218 1219 void 1220 kernel_fpu_free(kfpu_state_t *kfpu) 1221 { 1222 kmem_cache_free(fpsave_cachep, 1223 kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic); 1224 kmem_free(kfpu, sizeof (kfpu_state_t)); 1225 } 1226 1227 static void 1228 kernel_fpu_ctx_save(void *arg) 1229 { 1230 kfpu_state_t *kfpu = arg; 1231 fpu_ctx_t *pf; 1232 1233 if (kfpu == NULL) { 1234 /* 1235 * A NULL kfpu implies this is a kernel thread with an LWP and 1236 * no user-level FPU usage. Use the lwp fpu save area. 1237 */ 1238 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1239 1240 ASSERT(curthread->t_procp->p_flag & SSYS); 1241 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); 1242 1243 fp_save(pf); 1244 } else { 1245 pf = &kfpu->kfpu_ctx; 1246 1247 ASSERT3P(kfpu->kfpu_curthread, ==, curthread); 1248 ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); 1249 1250 /* 1251 * Note, we can't use fp_save because it assumes that we're 1252 * saving to the thread's PCB and not somewhere else. Because 1253 * this is a different FPU context, we instead have to do this 1254 * ourselves. 
1255 */ 1256 switch (fp_save_mech) { 1257 case FP_FXSAVE: 1258 fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx); 1259 break; 1260 case FP_XSAVE: 1261 xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask); 1262 break; 1263 default: 1264 panic("Invalid fp_save_mech"); 1265 } 1266 1267 /* 1268 * Because we have saved context here, our save state is no 1269 * longer valid and therefore needs to be reinitialized. 1270 */ 1271 kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED; 1272 } 1273 1274 pf->fpu_flags |= FPU_VALID; 1275 1276 /* 1277 * Clear KFPU flag. This allows swtch to check for improper kernel 1278 * usage of the FPU (i.e. switching to a new thread while the old 1279 * thread was in the kernel and using the FPU, but did not perform a 1280 * context save). 1281 */ 1282 curthread->t_flag &= ~T_KFPU; 1283 } 1284 1285 static void 1286 kernel_fpu_ctx_restore(void *arg) 1287 { 1288 kfpu_state_t *kfpu = arg; 1289 fpu_ctx_t *pf; 1290 1291 if (kfpu == NULL) { 1292 /* 1293 * A NULL kfpu implies this is a kernel thread with an LWP and 1294 * no user-level FPU usage. Use the lwp fpu save area. 1295 */ 1296 pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; 1297 1298 ASSERT(curthread->t_procp->p_flag & SSYS); 1299 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); 1300 } else { 1301 pf = &kfpu->kfpu_ctx; 1302 1303 ASSERT3P(kfpu->kfpu_curthread, ==, curthread); 1304 ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); 1305 } 1306 1307 fp_restore(pf); 1308 curthread->t_flag |= T_KFPU; 1309 } 1310 1311 /* 1312 * Validate that the thread is not switching off-cpu while actively using the 1313 * FPU within the kernel. 1314 */ 1315 void 1316 kernel_fpu_no_swtch(void) 1317 { 1318 if ((curthread->t_flag & T_KFPU) != 0) { 1319 panic("curthread swtch-ing while the kernel is using the FPU"); 1320 } 1321 } 1322 1323 static const struct ctxop_template kfpu_ctxop_tpl = { 1324 .ct_rev = CTXOP_TPL_REV, 1325 .ct_save = kernel_fpu_ctx_save, 1326 .ct_restore = kernel_fpu_ctx_restore, 1327 }; 1328 1329 void 1330 kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags) 1331 { 1332 klwp_t *pl = curthread->t_lwp; 1333 struct ctxop *ctx; 1334 1335 if ((curthread->t_flag & T_KFPU) != 0) { 1336 panic("curthread attempting to nest kernel FPU states"); 1337 } 1338 1339 /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */ 1340 ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) != 1341 (KFPU_USE_LWP | KFPU_NO_STATE)); 1342 1343 if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) { 1344 /* 1345 * Since we don't have a kfpu_state or usable lwp pcb_fpu to 1346 * hold our kernel FPU context, we depend on the caller doing 1347 * kpreempt_disable for the duration of our FPU usage. This 1348 * should only be done for very short periods of time. 1349 */ 1350 ASSERT(curthread->t_preempt > 0); 1351 ASSERT(kfpu == NULL); 1352 1353 if (pl != NULL) { 1354 /* 1355 * We might have already saved once so FPU_VALID could 1356 * be set. This is handled in fp_save. 1357 */ 1358 fp_save(&pl->lwp_pcb.pcb_fpu); 1359 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL; 1360 } 1361 1362 curthread->t_flag |= T_KFPU; 1363 1364 /* Always restore the fpu to the initial state. */ 1365 fpinit(); 1366 1367 return; 1368 } 1369 1370 /* 1371 * We either have a kfpu, or are using the LWP pcb_fpu for context ops. 
1372 */ 1373 1374 if ((flags & KFPU_USE_LWP) == 0) { 1375 if (kfpu->kfpu_curthread != NULL) 1376 panic("attempting to reuse kernel FPU state at %p when " 1377 "another thread already is using", kfpu); 1378 1379 if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0) 1380 kernel_fpu_fpstate_init(kfpu); 1381 1382 kfpu->kfpu_curthread = curthread; 1383 } 1384 1385 /* 1386 * Not all threads may have an active LWP. If they do and we're not 1387 * going to re-use the LWP, then we should go ahead and save the state. 1388 * We must also note that the fpu is now being used by the kernel and 1389 * therefore we do not want to manage the fpu state via the user-level 1390 * thread's context handlers. 1391 * 1392 * We might have already saved once (due to a prior use of the kernel 1393 * FPU or another code path) so FPU_VALID could be set. This is handled 1394 * by fp_save, as is the FPU_EN check. 1395 */ 1396 ctx = ctxop_allocate(&kfpu_ctxop_tpl, kfpu); 1397 kpreempt_disable(); 1398 if (pl != NULL) { 1399 if ((flags & KFPU_USE_LWP) == 0) 1400 fp_save(&pl->lwp_pcb.pcb_fpu); 1401 pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL; 1402 } 1403 1404 /* 1405 * Set the context operations for kernel FPU usage. Because kernel FPU 1406 * setup and ctxop attachment needs to happen under the protection of 1407 * kpreempt_disable(), we allocate the ctxop outside the guard so its 1408 * sleeping allocation will not cause a voluntary swtch(). This allows 1409 * the rest of the initialization to proceed, ensuring valid state for 1410 * the ctxop handlers. 1411 */ 1412 ctxop_attach(curthread, ctx); 1413 curthread->t_flag |= T_KFPU; 1414 1415 if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) { 1416 /* 1417 * For pure kernel threads with an LWP, we can use the LWP's 1418 * pcb_fpu to save/restore context. 1419 */ 1420 fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu; 1421 1422 VERIFY(curthread->t_procp->p_flag & SSYS); 1423 VERIFY(kfpu == NULL); 1424 ASSERT((pf->fpu_flags & FPU_EN) == 0); 1425 1426 /* Always restore the fpu to the initial state. */ 1427 if (fp_save_mech == FP_XSAVE) 1428 pf->fpu_xsave_mask = XFEATURE_FP_ALL; 1429 fpinit(); 1430 pf->fpu_flags = FPU_EN | FPU_KERNEL; 1431 } else { 1432 /* initialize the kfpu state */ 1433 kernel_fpu_ctx_restore(kfpu); 1434 } 1435 kpreempt_enable(); 1436 } 1437 1438 void 1439 kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags) 1440 { 1441 ulong_t iflags; 1442 1443 if ((curthread->t_flag & T_KFPU) == 0) { 1444 panic("curthread attempting to clear kernel FPU state " 1445 "without using it"); 1446 } 1447 1448 /* 1449 * General comments on why the rest of this function is structured the 1450 * way it is. Be aware that there is a lot of subtlety here. 1451 * 1452 * If a user-level thread ever uses the fpu while in the kernel, then 1453 * we cannot call fpdisable since that does STTS. That will set the 1454 * ts bit in %cr0 which will cause an exception if anything touches the 1455 * fpu. However, the user-level context switch handler (fpsave_ctxt) 1456 * needs to access the fpu to save the registers into the pcb. 1457 * fpsave_ctxt relies on CLTS having been done to clear the ts bit in 1458 * fprestore_ctxt when the thread context switched onto the CPU. 1459 * 1460 * Calling fpdisable only effects the current CPU's %cr0 register. 1461 * 1462 * During ctxop_remove and kpreempt_enable, we can voluntarily context 1463 * switch, so the CPU we were on when we entered this function might 1464 * not be the same one we're on when we return from ctxop_remove or end 1465 * the function. 
 * Note there can be user-level context switch handlers still installed if
 * this is a user-level thread.
 *
 * We also must be careful in the unlikely chance we're running in an
 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
 * incorrectly for the "real" thread to resume on this CPU.
 */

	if ((flags & KFPU_NO_STATE) == 0) {
		kpreempt_disable();
	} else {
		ASSERT(curthread->t_preempt > 0);
	}

	curthread->t_flag &= ~T_KFPU;

	/*
	 * When we are ending things, we explicitly don't save the current
	 * kernel FPU state back to the temporary state. The kfpu API is not
	 * intended to be a permanent save location.
	 *
	 * If this is a user-level thread and we were to context switch
	 * before returning to user-land, fpsave_ctxt will be a no-op since
	 * we already saved the user-level FPU state the first time we ran
	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
	 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also sets PCB_SET_UPDATE_FPU
	 * so fprestore_ctxt will be done in sys_rtt_common when the thread
	 * finally returns to user-land.
	 */

	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
	    curthread->t_intr == NULL) {
		/*
		 * A kernel thread which is not an interrupt thread, so we
		 * STTS now.
		 */
		fpdisable();
	}

	if ((flags & KFPU_NO_STATE) == 0) {
		ctxop_remove(curthread, &kfpu_ctxop_tpl, kfpu);

		if (kfpu != NULL) {
			if (kfpu->kfpu_curthread != curthread) {
				panic("attempting to end kernel FPU state "
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	if (curthread->t_lwp != NULL) {
		uint_t f;

		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}
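
/*
 * Illustrative sketch of the kernel FPU interfaces described in the big
 * theory statement above. This is not code from this file (the consumer
 * function and its work are hypothetical), but it shows the
 * kernel_fpu_alloc()/kernel_fpu_begin()/kernel_fpu_end()/kernel_fpu_free()
 * pattern for extended kernel FPU use with a private kfpu_state_t:
 *
 *	static int
 *	example_simd_consumer(void *buf, size_t len)
 *	{
 *		kfpu_state_t *kfpu;
 *
 *		if ((kfpu = kernel_fpu_alloc(KM_NOSLEEP)) == NULL)
 *			return (ENOMEM);
 *
 *		kernel_fpu_begin(kfpu, 0);
 *		(SIMD work on buf and len goes here; the thread may be
 *		context switched off-CPU and the kernel FPU state will be
 *		preserved in kfpu by the context operations)
 *		kernel_fpu_end(kfpu, 0);
 *
 *		kernel_fpu_free(kfpu);
 *		return (0);
 *	}
 *
 * For the short, non-preemptible form, the caller instead passes a NULL
 * kfpu_state_t and KFPU_NO_STATE, and brackets the region with
 * kpreempt_disable() and kpreempt_enable(), as described above.
 */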