/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2021 Joyent, Inc.
 * Copyright 2021 RackTop Systems, Inc.
 */

/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
/*	  All Rights Reserved						*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation			*/
/*	  All Rights Reserved						*/

/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/psw.h>
#include <sys/trap.h>
#include <sys/fault.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/pcb.h>
#include <sys/lwp.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/fp.h>
#include <sys/siginfo.h>
#include <sys/archsystm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/kfpu.h>

/*
 * FPU Management Overview
 * -----------------------
 *
 * The x86 FPU has evolved substantially since its days as the x87
 * coprocessor; however, many aspects of its life as a coprocessor are still
 * around in x86.
 *
 * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
 * While that state still exists, there is much more that is covered by the
 * FPU. Today, this includes not just traditional FPU state, but also
 * supervisor-only state. The following state is currently managed and covered
 * logically by the idea of the FPU registers:
 *
 *	o Traditional x87 FPU
 *	o Vector Registers (%xmm, %ymm, %zmm)
 *	o Memory Protection Extensions (MPX) Bounds Registers
 *	o Protected Key Rights Registers (PKRU)
 *	o Processor Trace data
 *
 * The rest of this covers how the FPU is managed and controlled, how state is
 * saved and restored between threads, interactions with hypervisors, and
 * other information exported to user land through aux vectors. A lot of
 * background information is here to synthesize major parts of the Intel SDM,
 * but unfortunately, it is not a replacement for reading it.
 *
 * FPU Control Registers
 * ---------------------
 *
 * Because the x87 FPU began its life as a co-processor and the FPU was
 * optional, there are several bits that show up in %cr0 that we have to
 * manipulate when dealing with the FPU. These are:
 *
 *	o CR0.ET	The 'extension type' bit. This was used originally to
 *			indicate that the FPU co-processor was present. Now it
 *			is forced on for compatibility. This is often used to
 *			verify whether or not the FPU is present.
 *
 *	o CR0.NE	The 'native error' bit. Used to indicate that native
 *			error mode should be enabled. This indicates that we
 *			should take traps on FPU errors. The OS enables this
 *			early in boot.
 *
 *	o CR0.MP	The 'Monitor Coprocessor' bit. Used to control whether
 *			or not wait/fwait instructions generate a #NM if
 *			CR0.TS is set.
 *
 *	o CR0.EM	The 'Emulation' bit. This is used to cause floating
 *			point operations (x87 through SSE4) to trap with a #UD
 *			so they can be emulated. The system never sets this
 *			bit, but makes sure it is clear on processor start up.
 *
 *	o CR0.TS	The 'Task Switched' bit. When this is turned on, a
 *			floating point operation will generate a #NM. An fwait
 *			will as well, depending on the value in CR0.MP.
 *
 * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
 * the system. Similarly, CR0.EM is always unset by the system. CR0.TS has a
 * more complicated role. Historically it has been used to allow running
 * systems to restore the FPU registers lazily. This will be discussed in
 * greater depth later on.
 *
 * %cr4 is also used as part of the FPU control. Specifically we need to worry
 * about the following bits in the system:
 *
 *	o CR4.OSFXSR		This bit is used to indicate that the OS
 *				understands and supports the execution of the
 *				fxsave and fxrstor instructions. This bit is
 *				required to be set to enable the use of the
 *				SSE->SSE4 instructions.
 *
 *	o CR4.OSXMMEXCPT	This bit is used to indicate that the OS can
 *				understand and take a SIMD floating point
 *				exception (#XM). This bit is always enabled by
 *				the system.
 *
 *	o CR4.OSXSAVE		This bit is used to indicate that the OS
 *				understands and supports the execution of the
 *				xsave and xrstor family of instructions. This
 *				bit is required to use any of the AVX and
 *				newer feature sets.
 *
 * Because all supported processors are 64-bit, they'll always support the XMM
 * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
 * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
 *
 * %xcr0 is used to manage the behavior of the xsave feature set and is only
 * present on the system if xsave is supported. %xcr0 is read and written via
 * the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a different
 * component of the xsave state and controls whether or not that information
 * is saved and restored. For newer feature sets like AVX and MPX, it also
 * controls whether or not the corresponding instructions can be executed
 * (much like CR4.OSFXSR does for the SSE feature sets).
 *
 * Everything in %xcr0 is around features available to users. There is also
 * the IA32_XSS MSR which is used to control supervisor-only features that
 * are still part of the xsave state. Bits that can be set in %xcr0 are
 * reserved in IA32_XSS and vice versa. This is an important property that is
 * particularly relevant to how the xsave instructions operate.
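 *
 * As an illustrative sketch (not a quote of the boot code; set_xcr() is
 * assumed here as the natural counterpart of the get_xcr() helper used
 * later in this file), enabling the xsave feature set conceptually looks
 * like:
 *
 *	setcr4(getcr4() | CR4_OSXSAVE);
 *	set_xcr(XFEATURE_ENABLED_MASK,
 *	    XFEATURE_LEGACY_FP | XFEATURE_SSE | XFEATURE_AVX);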
 *
 * Save Mechanisms
 * ---------------
 *
 * When switching between running threads the FPU state needs to be saved and
 * restored by the OS. If this state was not saved, users would rightfully
 * complain about corrupt state. There are three mechanisms that exist on the
 * processor for saving and restoring these state images:
 *
 *	o fsave
 *	o fxsave
 *	o xsave
 *
 * fsave saves and restores only the x87 FPU and is the oldest of these
 * mechanisms. This mechanism is never used in the kernel today because we
 * are always running on systems that support fxsave.
 *
 * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
 * state to be saved and restored to and from a struct fxsave_state. This is
 * the default mechanism that is used to save and restore the FPU on amd64.
 * An important aspect of fxsave that was different from the original i386
 * fsave mechanism is that restoring FPU state with pending exceptions will
 * not generate an exception; it will be deferred to the next use of the FPU.
 *
 * The final and by far the most complex mechanism is that of the xsave set.
 * xsave allows for saving and restoring all of the traditional x86 pieces
 * (x87 and SSE), while allowing for extensions that will save the %ymm,
 * %zmm, etc. registers.
 *
 * Data is saved and restored into and out of a struct xsave_state. The first
 * part of the struct xsave_state is equivalent to the struct fxsave_state.
 * After that, there is a header which is used to describe the remaining
 * portions of the state. The header is a 64-byte value of which the first
 * two uint64_t values are defined and the rest are reserved and must be
 * zero. The first uint64_t is the xstate_bv member. This describes which
 * values in the xsave_state are actually valid and present. This is updated
 * on a save and used on restore. The second member is the xcomp_bv member.
 * Its last bit determines whether or not a compressed version of the
 * structure is used.
 *
 * When the uncompressed structure is used (currently the only format we
 * support), then each state component is at a fixed offset in the structure,
 * even if it is not being used. For example, if you only saved the AVX
 * related state, but did not save the MPX related state, the offset would
 * not change for any component. With the compressed format, components that
 * aren't used are all elided (though the x87 and SSE state are always
 * there).
 *
 * Unlike fxsave which saves all state, the xsave family does not always save
 * and restore all the state that could be covered by the xsave_state. The
 * instructions all take an argument which is a mask of what to consider.
 * This is the same mask that will be used in the xstate_bv vector and it is
 * also the same set of values that are present in %xcr0 and IA32_XSS, though
 * IA32_XSS is only consulted by the xsaves and xrstors instructions.
 *
 * When a save or restore is requested, a bitwise AND is performed between
 * the requested bits and those that have been enabled in %xcr0. Only the
 * bits that match are then saved or restored. Others will be silently
 * ignored by the processor. This idea is used often in the OS. We will
 * always request that we save and restore all of the state, but only those
 * portions that are actually enabled in %xcr0 will be touched.
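 *
 * To make the masking concrete (an illustrative sketch, not code from this
 * file): a request such as
 *
 *	xsave(xs, XFEATURE_FP_ALL);
 *
 * behaves as if the hardware computed
 *
 *	saved = XFEATURE_FP_ALL & %xcr0;
 *
 * so asking for everything is always safe; components that are not enabled
 * in %xcr0 are simply skipped.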
 *
 * If a feature has been asked to be restored that is not set in the
 * xstate_bv feature vector of the save state, then it will be set to its
 * initial state by the processor (usually zeros). Also, when asked to save
 * state, the processor may not write out data that is in its initial state
 * as an optimization. This optimization only applies to saving data and not
 * to restoring data.
 *
 * There are a few different variants of the xsave and xrstor instruction.
 * They are:
 *
 *	o xsave		This is the original save instruction. It will save
 *			all of the requested data in the xsave state
 *			structure. It only saves data in the uncompressed
 *			(xcomp_bv[63] is zero) format. It may be executed at
 *			all privilege levels.
 *
 *	o xrstor	This is the original restore instruction. It will
 *			restore all of the requested data. The xrstor
 *			instruction can handle both the compressed and
 *			uncompressed formats. It may be executed at all
 *			privilege levels.
 *
 *	o xsaveopt	This is a variant of the xsave instruction that
 *			employs optimizations to try and only write out state
 *			that has been modified since the last time an xrstor
 *			instruction was called. The processor tracks a tuple
 *			of information about the last xrstor and tries to
 *			ensure that the same buffer is being used when this
 *			optimization is being used. However, because it
 *			tracks the xrstor buffer by its address, it is not
 *			suitable for use if that buffer can be easily reused.
 *			The most common case is trying to save data to the
 *			stack in rtld. It may be executed at all privilege
 *			levels.
 *
 *	o xsavec	This is a variant of the xsave instruction that
 *			writes out the compressed form of the xsave_state.
 *			Otherwise it behaves as xsave. It may be executed at
 *			all privilege levels.
 *
 *	o xsaves	This is a variant of the xsave instruction. It is
 *			similar to xsavec in that it always writes the
 *			compressed form of the buffer. Unlike all the other
 *			forms, this instruction looks at both the user
 *			(%xcr0) and supervisor (IA32_XSS MSR) masks to
 *			determine what to save and restore. xsaves also
 *			implements the same optimization that xsaveopt does
 *			around modified pieces. User land may not execute the
 *			instruction.
 *
 *	o xrstors	This is a variant of the xrstor instruction. Similar
 *			to xsaves it can save and restore both the user and
 *			privileged states. Unlike xrstor it can only operate
 *			on the compressed form. User land may not execute the
 *			instruction.
 *
 * Based on all of this, the kernel has a precedence for what it will use.
 * Basically, xsaves (which the kernel does not currently support) would be
 * preferred to xsaveopt, which is preferred to xsave. A similar scheme is
 * used when informing rtld (more later) about what it should use: xsavec is
 * preferred to xsave, and xsaveopt is not recommended because its
 * modified-state optimization is not appropriate for that use.
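 *
 * A minimal sketch of how that precedence might be applied (hypothetical
 * selection logic; the real decision is made in fpu_probe() and the
 * X86FSET_* feature names here are illustrative, not a quote of that code):
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_XSAVEOPT))
 *		xsavep = xsaveopt;
 *	else
 *		xsavep = xsave;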
 *
 * Finally, there is one last gotcha with the xsave state. Importantly, some
 * AMD processors did not always save and restore some of the FPU exception
 * state in the cases where Intel processors did. In those cases the OS will
 * make up for this fact itself.
 *
 * FPU Initialization
 * ------------------
 *
 * One difference with the FPU registers is that not all threads have FPU
 * state; only those that have an lwp do. Generally this means kernel
 * threads, which all share p0 and its lwp, do not have FPU state, though
 * there are definitely exceptions such as kcfpoold. In the rest of this
 * discussion we'll use thread and lwp interchangeably; just think of thread
 * meaning a thread that has an lwp.
 *
 * Each lwp has its FPU state allocated in its pcb (process control block).
 * The actual storage comes from the fpsave_cachep kmem cache. This cache is
 * sized dynamically at start up based on the save mechanism that we're using
 * and the amount of memory required for it. This is dynamic because the
 * xsave_state size varies based on the supported feature set.
 *
 * The hardware side of the FPU is initialized early in boot before we mount
 * the root file system. This is effectively done in fpu_probe(). This is
 * where we make the final decision about what the save and restore
 * mechanisms we should use are, create the fpsave_cachep kmem cache, and
 * initialize a number of function pointers that implement the save and
 * restore logic.
 *
 * The thread/lwp side is a little more involved. There are two different
 * things that we need to concern ourselves with. The first is how the FPU
 * resources are allocated and the second is how the FPU state is initialized
 * for a given lwp.
 *
 * We allocate the FPU save state from our kmem cache as part of
 * fp_lwp_init(). This is always called unconditionally by the system as part
 * of creating an LWP.
 *
 * There are three different initialization paths that we deal with. The
 * first is when we are executing a new process. As part of exec all of the
 * register state is reset. The exec case is particularly important because
 * init is born like Athena, sprouting from the head of the kernel, without
 * any true parent to fork from. The second is used whenever we fork or
 * create a new lwp. The third is to deal with special lwps like the agent
 * lwp.
 *
 * During exec, we will call fp_exec() which will initialize and set up the
 * FPU state for the process. That will fill in the initial state for the FPU
 * and also set that state in the FPU itself. As part of fp_exec() we also
 * install a thread context operations vector that takes care of dealing with
 * the saving and restoring of the FPU. These context handlers will also be
 * called whenever an lwp is created or forked. In those cases, to initialize
 * the FPU we will call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will
 * install a context operations vector for the new thread.
 *
 * Next we'll end up in the context operation fp_new_lwp(). This saves the
 * current thread's state, initializes the new thread's state, and copies
 * over the relevant parts of the originating thread's state. It's at this
 * point that we also install the FPU context operations into the new thread,
 * which ensures that all future threads that are descendants of the current
 * one get the thread context operations (unless they call exec).
 *
 * To deal with some things like the agent lwp, we double check the state of
 * the FPU in sys_rtt_common() to make sure that it has been enabled before
 * returning to user land. In general, this path should be rare, but it's
 * useful for the odd lwp here and there.
 *
 * The FPU state will remain valid most of the time. There are times that
 * the state will be rewritten, for example in restorecontext(), due to
 * /proc, or when the lwp calls exec(). Whether the context is being freed or
 * we are resetting the state, we will call fp_free() to disable the FPU and
 * our context.
 *
 * Finally, when the lwp is destroyed, it will actually destroy and free the
 * FPU state by calling fp_lwp_cleanup().
 *
 * Kernel FPU Multiplexing
 * -----------------------
 *
 * Just as the kernel has to maintain all of the general purpose registers
 * when switching between scheduled threads, the same is true of the FPU
 * registers.
 *
 * When a thread has FPU state, it also has a set of context operations
 * installed. These context operations take care of making sure that the FPU
 * is properly saved and restored during a context switch (fpsave_ctxt and
 * fprestore_ctxt respectively). This means that the current implementation
 * of the FPU is 'eager': when a thread is running, the CPU will have its FPU
 * state loaded. While this is always true when executing in userland, there
 * are a few cases where this is not true in the kernel.
 *
 * This was not always the case. Traditionally on x86 a 'lazy' FPU restore
 * was employed. This meant that the FPU would be saved on a context switch
 * and the CR0.TS bit would be set. When a thread next tried to use the FPU,
 * it would then take a #NM trap, at which point we would restore the FPU
 * from the save area and return to user land. Given the frequency of use of
 * the FPU alone by libc, there's no point returning to user land just to
 * trap again.
 *
 * There are a few cases though where the FPU state may need to be changed
 * for a thread on its behalf. The most notable cases are processes using
 * /proc, restorecontext, forking, etc. In all of these cases the kernel will
 * force a thread's FPU state to be saved into the PCB through the fp_save()
 * function. Whenever the FPU is saved, the FPU_VALID flag is set on the pcb.
 * This indicates that the save state holds currently valid data. As a side
 * effect of this, CR0.TS will be set. To make sure that all of the state is
 * updated before returning to user land, in these cases, we set a flag on
 * the PCB that says the FPU needs to be updated. This will make sure that we
 * take the slow path out of a system call to fix things up for the thread.
 * Because this is a rather rare case, effectively setting the equivalent of
 * t_postsys is acceptable.
 *
 * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 * Generally this means it will be cleared immediately by the new thread that
 * is running in a context switch. However, this isn't the case for kernel
 * threads. They currently operate with CR0.TS set as no kernel state is
 * restored for them. This means that using the FPU will cause a #NM and
 * panic.
 *
 * The FPU_VALID flag on the currently executing thread's pcb is meant to
 * track what the value of CR0.TS should be. If it is set, then CR0.TS will
 * be set. However, because we eagerly restore, the only time that CR0.TS
 * should be set for a non-kernel thread is during operations where it will
 * be cleared before returning to user land and, importantly, the only data
 * that is in it is its own.
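 *
 * Schematically, the eager path during a context switch looks like the
 * following (simplified; savectx() and restorectx() walk the installed
 * context operations):
 *
 *	swtch()
 *	    savectx(oldthread)    -> fpsave_ctxt(fpu_ctx)    (CR0.TS set)
 *	    restorectx(newthread) -> fprestore_ctxt(fpu_ctx) (CR0.TS cleared)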
 *
 * Kernel FPU Usage
 * ----------------
 *
 * Traditionally the kernel never used the FPU since it had no need for
 * floating point operations. However, modern FPU hardware supports a variety
 * of SIMD extensions which can speed up code such as parity calculations or
 * encryption.
 *
 * To allow the kernel to take advantage of these features, the
 * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
 * around any usage of the FPU by the kernel to ensure that user-level
 * context is properly saved/restored, as well as to properly set up the FPU
 * for use by the kernel. There are a variety of ways this wrapping can be
 * used, as discussed below.
 *
 * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
 * operations, the kernel_fpu_alloc() function should be used to allocate a
 * kfpu_state_t structure that is used to save/restore the thread's kernel
 * FPU state. This structure is not tied to any thread. That is, different
 * threads can reuse the same kfpu_state_t structure, although not
 * concurrently. A kfpu_state_t structure is freed by the kernel_fpu_free()
 * function.
 *
 * In some cases, the kernel may need to use the FPU for a short operation
 * without the overhead of managing a kfpu_state_t structure and without
 * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
 * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
 * parameter. This indicates that there is no kfpu_state_t. When used this
 * way, kernel preemption should be disabled by the caller
 * (kpreempt_disable) before calling kernel_fpu_begin(), and re-enabled
 * after calling kernel_fpu_end(). For this usage, it is important to limit
 * the kernel's FPU use to short operations. The tradeoff between using the
 * FPU without a kfpu_state_t structure vs. the overhead of allowing a
 * context switch while using the FPU should be carefully considered on a
 * case by case basis.
 *
 * In other cases, kernel threads have an LWP, but never execute in user
 * space. In this situation, the LWP's pcb_fpu area can be used to
 * save/restore the kernel's FPU state if the thread is context switched,
 * instead of having to allocate and manage a kfpu_state_t structure. The
 * KFPU_USE_LWP bit in the kernel_fpu_begin() and kernel_fpu_end() flags
 * parameter is used to enable this behavior. It is the caller's
 * responsibility to ensure that this is only used for a kernel thread which
 * never executes in user space.
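 *
 * As a minimal usage sketch (a hypothetical caller, with error handling
 * elided), the long-running form looks like:
 *
 *	kfpu_state_t *kf = kernel_fpu_alloc(KM_SLEEP);
 *	kernel_fpu_begin(kf, 0);
 *	... SIMD work here ...
 *	kernel_fpu_end(kf, 0);
 *	kernel_fpu_free(kf);
 *
 * while a short, non-preemptible use under KFPU_NO_STATE looks like:
 *
 *	kpreempt_disable();
 *	kernel_fpu_begin(NULL, KFPU_NO_STATE);
 *	... brief SIMD work here ...
 *	kernel_fpu_end(NULL, KFPU_NO_STATE);
 *	kpreempt_enable();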
 *
 * FPU Exceptions
 * --------------
 *
 * Certain operations can cause the kernel to take traps due to FPU activity.
 * Generally these events will cause a user process to receive a SIGFPE and
 * if the kernel receives it in kernel context, we will die. Traditionally
 * the #NM (Device Not Available / No Math) exception generated by CR0.TS
 * would have caused us to restore the FPU. Now it is a fatal event
 * regardless of whether or not user land causes it.
 *
 * While there are some cases where the kernel uses the FPU, it is up to the
 * kernel to use the FPU in a way such that it cannot receive a trap or to
 * use the appropriate trap protection mechanisms.
 *
 * Hypervisors
 * -----------
 *
 * When providing support for hypervisors things are a little bit more
 * complicated because the FPU is not virtualized at all. This means that
 * they need to save and restore the FPU and %xcr0 across entry and exit to
 * the guest. To facilitate this, we provide a series of APIs in <sys/hma.h>.
 * These allow us to use the full native state to make sure that we are
 * always saving and restoring the full FPU that the host sees, even when the
 * guest is using a subset.
 *
 * One tricky aspect of this is that the guest may be using a subset of %xcr0
 * and therefore changing our %xcr0 on the fly. It is vital that when we're
 * saving and restoring the FPU we always use the full %xcr0 contents;
 * otherwise we will end up leaving stale data behind.
 *
 * ELF PLT Support
 * ---------------
 *
 * rtld has to preserve a subset of the FPU when it is saving and restoring
 * registers due to the amd64 System V ABI. See cmd/sgs/rtld/amd64/boot_elf.s
 * for more information. As a result, we set up an aux vector that contains
 * information about what save and restore mechanisms it should be using and
 * the sizing thereof based on what the kernel supports. This is passed down
 * in a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This
 * information is initialized in fpu_subr.c.
 */

kmem_cache_t *fpsave_cachep;

/*
 * Legacy fxsave layout (512 bytes) + xsave header (64 bytes) + the high
 * halves of the 16 %ymm registers (16 x 16 = 256 bytes).
 */
#define	AVX_XSAVE_SIZE		(512 + 64 + 256)

/*
 * Various sanity checks.
 */
CTASSERT(sizeof (struct fxsave_state) == 512);
CTASSERT(sizeof (struct fnsave_state) == 108);
CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);

/*
 * This structure is the x86 implementation of the kernel FPU that is defined
 * in uts/common/sys/kfpu.h.
 */

typedef enum kfpu_flags {
	/*
	 * This indicates that the save state has initial FPU data.
	 */
	KFPU_F_INITIALIZED = 0x01
} kfpu_flags_t;

struct kfpu_state {
	fpu_ctx_t	kfpu_ctx;
	kfpu_flags_t	kfpu_flags;
	kthread_t	*kfpu_curthread;
};

/*
 * Initial kfpu state for SSE/SSE2 used by fpinit()
 */
const struct fxsave_state sse_initial = {
	FPU_CW_INIT,	/* fx_fcw */
	0,		/* fx_fsw */
	0,		/* fx_fctw */
	0,		/* fx_fop */
	0,		/* fx_rip */
	0,		/* fx_rdp */
	SSE_MXCSR_INIT	/* fx_mxcsr */
	/* rest of structure is zero */
};

/*
 * Initial kfpu state for AVX used by fpinit()
 */
const struct xsave_state avx_initial = {
	/*
	 * The definition below needs to be identical with sse_initial
	 * defined above.
	 */
	{
		FPU_CW_INIT,	/* fx_fcw */
		0,		/* fx_fsw */
		0,		/* fx_fctw */
		0,		/* fx_fop */
		0,		/* fx_rip */
		0,		/* fx_rdp */
		SSE_MXCSR_INIT	/* fx_mxcsr */
		/* rest of structure is zero */
	},
	/*
	 * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid,
	 * and CPU should initialize XMM/YMM.
	 */
	1,
	0	/* xs_xcomp_bv */
	/* rest of structure is zero */
};

/*
 * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
 * the #GP exception caused by setting unsupported bits in the
 * MXCSR register.
 */
uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;

/*
 * Initial kfpu state for x87 used by fpinit()
 */
const struct fnsave_state x87_initial = {
	FPU_CW_INIT,	/* f_fcw */
	0,		/* __f_ign0 */
	0,		/* f_fsw */
	0,		/* __f_ign1 */
	0xffff,		/* f_ftw */
	/* rest of structure is zero */
};

/*
 * These vectors are patched to xsave_ctxt() or xsaveopt_ctxt() if we
 * discover we have an XSAVE-capable chip in fpu_probe().
 */
void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;

/*
 * This function pointer is changed to xsaveopt if the CPU is xsaveopt
 * capable.
 */
void (*xsavep)(struct xsave_state *, uint64_t) = xsave;

static int fpe_sicode(uint_t);
static int fpe_simd_sicode(uint_t);

/*
 * Copy the state of the parent lwp's floating point context into the new
 * lwp. Invoked for both fork() and lwp_create().
 *
 * Note that we inherit -only- the control state (e.g. exception masks,
 * rounding, precision control, etc.); the FPU registers are otherwise
 * reset to their initial state.
 */
static void
fp_new_lwp(kthread_id_t t, kthread_id_t ct)
{
	struct fpu_ctx *fp;		/* parent fpu context */
	struct fpu_ctx *cfp;		/* new fpu context */
	struct fxsave_state *fx, *cfx;
	struct xsave_state *cxs;

	ASSERT(fp_kind != FP_NO);

	fp = &t->t_lwp->lwp_pcb.pcb_fpu;
	cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;

	/*
	 * If the parent FPU state is still in the FPU hw then save it;
	 * conveniently, fp_save() already does this for us nicely.
	 */
	fp_save(fp);

	cfp->fpu_flags = FPU_EN | FPU_VALID;
	cfp->fpu_regs.kfpu_status = 0;
	cfp->fpu_regs.kfpu_xstatus = 0;

	/*
	 * Make sure that the child's FPU is cleaned up and made ready for
	 * user land.
	 */
	PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
		bcopy(&sse_initial, cfx, sizeof (*cfx));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		break;

	case FP_XSAVE:
		cfp->fpu_xsave_mask = fp->fpu_xsave_mask;

		VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);

		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
		cfx = &cxs->xs_fxsave;

		bcopy(&avx_initial, cxs, sizeof (*cxs));
		cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
		cfx->fx_fcw = fx->fx_fcw;
		cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) &
		    XFEATURE_FP_INITIAL);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/*
	 * Install the FPU context operations so that both the parent and
	 * child have the FPU cleaned up before returning to user land.
	 */
	installctx(ct, cfp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
	    fp_new_lwp, NULL, fp_free, NULL);
}

/*
 * Free any state associated with floating point context.
 * fp_free() can be called in three cases:
 *
 * 1) From reaper -> thread_free -> freectx -> fp_free.
 *	The fp context belongs to a thread on deathrow; there is nothing to
 *	do, since the thread will never be resumed. The thread calling
 *	ctxfree is the reaper.
 *
 * 2) From exec -> freectx -> fp_free.
 *	The fp context belongs to the current thread; we must disable the
 *	FPU. The thread calling ctxfree is curthread.
 *
 * 3) From restorecontext -> setfpregs -> fp_free.
 *	We have a modified context in memory (lwp->pcb_fpu); disable the FPU
 *	and release the fp context for the CPU.
 */
/*ARGSUSED*/
void
fp_free(struct fpu_ctx *fp, int isexec)
{
	ASSERT(fp_kind != FP_NO);

	if (fp->fpu_flags & FPU_VALID)
		return;

	kpreempt_disable();
	/*
	 * We want to do fpsave rather than fpdisable so that we can
	 * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit.
	 */
	fp->fpu_flags |= FPU_VALID;
	/* If for the current thread, disable FP to track FPU_VALID. */
	if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
		/* Clear errors if any to prevent frstor from complaining. */
		(void) fperr_reset();
		if (fp_kind & __FP_SSE)
			(void) fpxerr_reset();
		fpdisable();
	}
	kpreempt_enable();
}

/*
 * Store the floating point state and disable the floating point unit.
 */
void
fp_save(struct fpu_ctx *fp)
{
	ASSERT(fp_kind != FP_NO);

	kpreempt_disable();
	if (!fp || fp->fpu_flags & FPU_VALID ||
	    (fp->fpu_flags & FPU_EN) == 0) {
		kpreempt_enable();
		return;
	}
	ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
		break;

	case FP_XSAVE:
		xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_flags |= FPU_VALID;

	/*
	 * We save the FPU as part of forking, execing, modifications via
	 * /proc, restorecontext, etc. As such, we need to make sure that we
	 * return to userland with valid state in the FPU. If we're context
	 * switched out before we hit sys_rtt_common() we'll end up having
	 * restored the FPU as part of the context ops operations. The
	 * restore logic always makes sure that FPU_VALID is set before doing
	 * a restore so we don't restore it a second time.
	 */
	PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);

	kpreempt_enable();
}

/*
 * Restore the FPU context for the thread. The possibilities are:
 *	1. No active FPU context: Load the new context into the FPU hardware
 *	   and enable the FPU.
 */
void
fp_restore(struct fpu_ctx *fp)
{
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
		break;

	case FP_XSAVE:
		xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_flags &= ~FPU_VALID;
}

/*
 * Reset the FPU such that it is in a valid state for a new thread that is
 * coming out of exec. The FPU will be in a usable state at this point. At
 * this point we know that the FPU state has already been allocated and if
 * this wasn't an init process, then it will have had fp_free() previously
 * called.
 */
void
fp_exec(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
	struct ctxop *ctx = installctx_preallocate();

	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	/*
	 * Make sure that we're not preempted in the middle of initializing
	 * the FPU on this CPU.
	 */
	kpreempt_disable();
	installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
	    fp_new_lwp, NULL, fp_free, ctx);
	fpinit();
	fp->fpu_flags = FPU_EN;
	kpreempt_enable();
}


/*
 * Seed the initial state for the current thread. The possibilities are:
 *	1. Another process has modified the FPU state before we have done
 *	   any initialization: Load the FPU state from the LWP state.
 *	2. The FPU state has not been externally modified: Load a clean
 *	   state.
 */
void
fp_seed(void)
{
	struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(curthread->t_preempt >= 1);
	ASSERT((fp->fpu_flags & FPU_EN) == 0);

	/*
	 * Always initialize a new context and initialize the hardware.
	 */
	if (fp_save_mech == FP_XSAVE) {
		fp->fpu_xsave_mask = XFEATURE_FP_ALL;
	}

	installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
	    fp_new_lwp, NULL, fp_free, NULL);
	fpinit();

	/*
	 * If FPU_VALID is set, it means someone has modified registers via
	 * /proc. In this case, restore the current lwp's state.
	 */
	if (fp->fpu_flags & FPU_VALID)
		fp_restore(fp);

	ASSERT((fp->fpu_flags & FPU_VALID) == 0);
	fp->fpu_flags = FPU_EN;
}

/*
 * When using xsave/xrstor, these three functions are used by the lwp code to
 * manage the memory for the xsave area.
 */
void
fp_lwp_init(struct _klwp *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	/*
	 * We keep a copy of the pointer in lwp_fpu so that we can restore
	 * the value in forklwp() after we duplicate the parent's LWP state.
	 */
	lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, KM_SLEEP);

	if (fp_save_mech == FP_XSAVE) {
		/*
		 * We bzero since the fpinit() code path will only
		 * partially initialize the xsave area using avx_initial.
		 */
		ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
		bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
	}
}

void
fp_lwp_cleanup(struct _klwp *lwp)
{
	struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;

	if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
		kmem_cache_free(fpsave_cachep,
		    fp->fpu_regs.kfpu_u.kfpu_generic);
		lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
	}
}
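
/*
 * To make the interplay with forklwp() concrete, the ordering is roughly
 * the following (an illustrative sketch; see forklwp() for the real
 * sequence):
 *
 *	fp_lwp_init(clwp);		allocate save area, stash in lwp_fpu
 *	bcopy(parent lwp -> child lwp);	clobbers the child's kfpu_u pointer
 *	fp_lwp_dup(clwp);		copy the FPU image, repair the pointer
 */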

/*
 * Called during the process of forklwp(). The kfpu_u pointer will have been
 * overwritten while copying the parent's LWP structure. We have a valid copy
 * stashed in the child's lwp_fpu which we use to restore the correct value.
 */
void
fp_lwp_dup(struct _klwp *lwp)
{
	void *xp = lwp->lwp_fpu;
	size_t sz;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		sz = sizeof (struct fxsave_state);
		break;
	case FP_XSAVE:
		sz = cpuid_get_xsave_size();
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	/* copy the parent's values into the new lwp's struct */
	bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
	/* now restore the pointer */
	lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
}

/*
 * Handle a processor extension error fault.
 * Returns non-zero for error.
 */
/*ARGSUSED*/
int
fpexterrflt(struct regs *rp)
{
	uint32_t fpcw, fpsw;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind != FP_NO);

	/*
	 * Now we can enable the interrupts.
	 * (NOTE: x87 fp exceptions come thru interrupt gate)
	 */
	sti();

	if (!fpu_exists)
		return (FPE_FLTINV);

	/*
	 * Do an unconditional save of the FP state. If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty (it may not be, due to an
	 * intervening save caused by a context switch between the sti()
	 * above and here), then it's safe to just use the stored values in
	 * the context save area to determine the cause of the fault.
	 */
	fp_save(fp);

	/* clear exception flags in saved state, as if by fnclex */
	switch (fp_save_mech) {
	case FP_FXSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
		break;

	case FP_XSAVE:
		fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
		fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
		/*
		 * Always set LEGACY_FP as it may have been cleared by the
		 * XSAVE instruction.
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}

	fp->fpu_regs.kfpu_status = fpsw;

	if ((fpsw & FPS_ES) == 0)
		return (0);	/* No exception */

	/*
	 * "and" the exception flags with the complement of the mask
	 * bits to determine which exception occurred.
	 */
	return (fpe_sicode(fpsw & ~fpcw & 0x3f));
}

/*
 * Handle an SSE/SSE2 precise exception.
 * Returns a non-zero sicode for error.
 */
/*ARGSUSED*/
int
fpsimderrflt(struct regs *rp)
{
	uint32_t mxcsr, xmask;
	fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;

	ASSERT(fp_kind & __FP_SSE);

	/*
	 * NOTE: Interrupts are disabled during execution of this
	 * function. They are enabled by the caller in trap.c.
	 */

	/*
	 * The only way we could have gotten here, if there is no FP unit,
	 * is via a user executing an INT $19 instruction, so there is no
	 * fault in that case.
	 */
	if (!fpu_exists)
		return (0);

	/*
	 * Do an unconditional save of the FP state. If it's dirty (TS=0),
	 * it'll be saved into the fpu context area passed in (that of the
	 * current thread). If it's not dirty, then it's safe to just use
	 * the stored values in the context save area to determine the
	 * cause of the fault.
	 */
	fp_save(fp);		/* save the FPU state */

	if (fp_save_mech == FP_XSAVE) {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
		fp->fpu_regs.kfpu_status =
		    fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
	} else {
		mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
		fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
	}
	fp->fpu_regs.kfpu_xstatus = mxcsr;

	/*
	 * Compute the mask that determines which conditions can cause
	 * a #XM exception, and use this to clean the status bits so that
	 * we can identify the true cause of this one.
	 */
	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
	return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
}

/*
 * In the unlikely event that someone is relying on this subcode being
 * FPE_FLTILL for denormalize exceptions, it can always be patched back
 * again to restore old behaviour.
 */
int fpe_fltden = FPE_FLTDEN;

/*
 * Map from the FPU status word to the FP exception si_code.
 */
static int
fpe_sicode(uint_t sw)
{
	if (sw & FPS_IE)
		return (FPE_FLTINV);
	if (sw & FPS_ZE)
		return (FPE_FLTDIV);
	if (sw & FPS_DE)
		return (fpe_fltden);
	if (sw & FPS_OE)
		return (FPE_FLTOVF);
	if (sw & FPS_UE)
		return (FPE_FLTUND);
	if (sw & FPS_PE)
		return (FPE_FLTRES);
	return (FPE_FLTINV);	/* default si_code for other exceptions */
}

/*
 * Map from the SSE status word to the FP exception si_code.
 */
static int
fpe_simd_sicode(uint_t sw)
{
	if (sw & SSE_IE)
		return (FPE_FLTINV);
	if (sw & SSE_ZE)
		return (FPE_FLTDIV);
	if (sw & SSE_DE)
		return (FPE_FLTDEN);
	if (sw & SSE_OE)
		return (FPE_FLTOVF);
	if (sw & SSE_UE)
		return (FPE_FLTUND);
	if (sw & SSE_PE)
		return (FPE_FLTRES);
	return (FPE_FLTINV);	/* default si_code for other exceptions */
}
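
/*
 * For example (an illustrative scenario): a user thread that unmasks the
 * x87 divide-by-zero exception in its control word and then divides by zero
 * takes a #MF trap. The trap handler calls fpexterrflt(), which finds FPS_ZE
 * set and unmasked, so fpe_sicode() returns FPE_FLTDIV, and that value
 * becomes the si_code of the SIGFPE delivered to the process.
 */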

/*
 * This routine is invoked as part of libc's __fpstart implementation
 * via sysi86(2).
 *
 * It may be called -before- any context has been assigned in which case
 * we try and avoid touching the hardware. Or it may be invoked well
 * after the context has been assigned and fiddled with, in which case
 * just tweak it directly.
 */
void
fpsetcw(uint16_t fcw, uint32_t mxcsr)
{
	struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
	struct fxsave_state *fx;

	if (!fpu_exists || fp_kind == FP_NO)
		return;

	if ((fp->fpu_flags & FPU_EN) == 0) {
		if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
			/*
			 * Common case. Floating point unit not yet
			 * enabled, and kernel already intends to initialize
			 * the hardware the way the caller wants.
			 */
			return;
		}
		/*
		 * Hmm. Userland wants a different default.
		 * Do a fake "first trap" to establish the context, then
		 * handle as if we already had a context before we came in.
		 */
		kpreempt_disable();
		fp_seed();
		kpreempt_enable();
	}

	/*
	 * Ensure that the current hardware state is flushed back to the
	 * pcb, then modify that copy. The next use of the fp will restore
	 * the context.
	 */
	fp_save(fp);

	switch (fp_save_mech) {
	case FP_FXSAVE:
		fx = fp->fpu_regs.kfpu_u.kfpu_fx;
		fx->fx_fcw = fcw;
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		break;

	case FP_XSAVE:
		fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
		fx->fx_fcw = fcw;
		fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
		/*
		 * Always set LEGACY_FP as it may have been cleared by the
		 * XSAVE instruction.
		 */
		fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
		break;
	default:
		panic("Invalid fp_save_mech");
		/*NOTREACHED*/
	}
}

static void
kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
{
	struct xsave_state *xs;

	switch (fp_save_mech) {
	case FP_FXSAVE:
		bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
		    sizeof (struct fxsave_state));
		kfpu->kfpu_ctx.fpu_xsave_mask = 0;
		break;
	case FP_XSAVE:
		xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
		bzero(xs, cpuid_get_xsave_size());
		bcopy(&avx_initial, xs, sizeof (*xs));
		xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
		kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
		break;
	default:
		panic("invalid fp_save_mech");
	}

	/*
	 * Set the corresponding flags that the system expects on the FPU
	 * state to indicate that this is our state. The FPU_EN flag is
	 * required to indicate that FPU usage is allowed. The FPU_KERNEL
	 * flag is explicitly not set below as it represents that this state
	 * is being suppressed by the kernel.
	 */
	kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
	kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
}

kfpu_state_t *
kernel_fpu_alloc(int kmflags)
{
	kfpu_state_t *kfpu;

	if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
		return (NULL);
	}

	kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
	    kmem_cache_alloc(fpsave_cachep, kmflags);
	if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
		kmem_free(kfpu, sizeof (kfpu_state_t));
		return (NULL);
	}

	kernel_fpu_fpstate_init(kfpu);

	return (kfpu);
}

void
kernel_fpu_free(kfpu_state_t *kfpu)
{
	kmem_cache_free(fpsave_cachep,
	    kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
	kmem_free(kfpu, sizeof (kfpu_state_t));
}

static void
kernel_fpu_ctx_save(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP
		 * and no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		fp_save(pf);
	} else {
		pf = &kfpu->kfpu_ctx;

		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);

		/*
		 * Note, we can't use fp_save because it assumes that we're
		 * saving to the thread's PCB and not somewhere else. Because
		 * this is a different FPU context, we instead have to do
		 * this ourselves.
		 */
		switch (fp_save_mech) {
		case FP_FXSAVE:
			fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
			break;
		case FP_XSAVE:
			xsavep(pf->fpu_regs.kfpu_u.kfpu_xs,
			    pf->fpu_xsave_mask);
			break;
		default:
			panic("Invalid fp_save_mech");
		}

		/*
		 * Because we have saved context here, our save state is no
		 * longer valid and therefore needs to be reinitialized.
		 */
		kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
	}

	pf->fpu_flags |= FPU_VALID;

	/*
	 * Clear the KFPU flag. This allows swtch to check for improper
	 * kernel usage of the FPU (i.e. switching to a new thread while the
	 * old thread was in the kernel and using the FPU, but did not
	 * perform a context save).
	 */
	curthread->t_flag &= ~T_KFPU;
}

static void
kernel_fpu_ctx_restore(void *arg)
{
	kfpu_state_t *kfpu = arg;
	fpu_ctx_t *pf;

	if (kfpu == NULL) {
		/*
		 * A NULL kfpu implies this is a kernel thread with an LWP
		 * and no user-level FPU usage. Use the lwp fpu save area.
		 */
		pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;

		ASSERT(curthread->t_procp->p_flag & SSYS);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	} else {
		pf = &kfpu->kfpu_ctx;

		ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
		ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
	}

	fp_restore(pf);
	curthread->t_flag |= T_KFPU;
}

/*
 * Validate that the thread is not switching off-cpu while actively using the
 * FPU within the kernel.
 */
void
kernel_fpu_no_swtch(void)
{
	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread swtch-ing while the kernel is using the FPU");
	}
}

void
kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
{
	klwp_t *pl = curthread->t_lwp;
	struct ctxop *ctx;

	if ((curthread->t_flag & T_KFPU) != 0) {
		panic("curthread attempting to nest kernel FPU states");
	}

	/* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
	ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
	    (KFPU_USE_LWP | KFPU_NO_STATE));

	if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
		/*
		 * Since we don't have a kfpu_state or usable lwp pcb_fpu to
		 * hold our kernel FPU context, we depend on the caller doing
		 * kpreempt_disable for the duration of our FPU usage. This
		 * should only be done for very short periods of time.
		 */
		ASSERT(curthread->t_preempt > 0);
		ASSERT(kfpu == NULL);

		if (pl != NULL) {
			/*
			 * We might have already saved once so FPU_VALID
			 * could be set. This is handled in fp_save.
			 */
			fp_save(&pl->lwp_pcb.pcb_fpu);
			pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
		}

		curthread->t_flag |= T_KFPU;

		/* Always restore the fpu to the initial state. */
		fpinit();

		return;
	}

	/*
	 * We either have a kfpu, or are using the LWP pcb_fpu for context
	 * ops.
	 */

	if ((flags & KFPU_USE_LWP) == 0) {
		if (kfpu->kfpu_curthread != NULL)
			panic("attempting to reuse kernel FPU state at %p "
			    "while another thread is already using it", kfpu);

		if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
			kernel_fpu_fpstate_init(kfpu);

		kfpu->kfpu_curthread = curthread;
	}

	/*
	 * Not all threads may have an active LWP. If they do and we're not
	 * going to re-use the LWP, then we should go ahead and save the
	 * state. We must also note that the fpu is now being used by the
	 * kernel and therefore we do not want to manage the fpu state via
	 * the user-level thread's context handlers.
	 *
	 * We might have already saved once (due to a prior use of the kernel
	 * FPU or another code path) so FPU_VALID could be set. This is
	 * handled by fp_save, as is the FPU_EN check.
	 */
	ctx = installctx_preallocate();
	kpreempt_disable();
	if (pl != NULL) {
		if ((flags & KFPU_USE_LWP) == 0)
			fp_save(&pl->lwp_pcb.pcb_fpu);
		pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
	}

	/*
	 * Set the context operations for kernel FPU usage. Note that this is
	 * done with a preallocated buffer and under kpreempt_disable because
	 * without a preallocated buffer, installctx does a sleeping
	 * allocation. We haven't finished initializing our kernel FPU state
	 * yet, and in the rare case that we happen to save/restore just as
	 * installctx() exits its own kpreempt_enable() internal call, we
	 * guard against restoring an uninitialized buffer (0xbaddcafe).
	 */
	installctx(curthread, kfpu, kernel_fpu_ctx_save,
	    kernel_fpu_ctx_restore, NULL, NULL, NULL, NULL, ctx);

	curthread->t_flag |= T_KFPU;

	if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
		/*
		 * For pure kernel threads with an LWP, we can use the LWP's
		 * pcb_fpu to save/restore context.
		 */
		fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;

		VERIFY(curthread->t_procp->p_flag & SSYS);
		VERIFY(kfpu == NULL);
		ASSERT((pf->fpu_flags & FPU_EN) == 0);

		/* Always restore the fpu to the initial state. */
		if (fp_save_mech == FP_XSAVE)
			pf->fpu_xsave_mask = XFEATURE_FP_ALL;
		fpinit();
		pf->fpu_flags = FPU_EN | FPU_KERNEL;
	} else {
		/* initialize the kfpu state */
		kernel_fpu_ctx_restore(kfpu);
	}
	kpreempt_enable();
}

void
kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
{
	if ((curthread->t_flag & T_KFPU) == 0) {
		panic("curthread attempting to clear kernel FPU state "
		    "without using it");
	}

	/*
	 * General comments on why the rest of this function is structured
	 * the way it is. Be aware that there is a lot of subtlety here.
	 *
	 * If a user-level thread ever uses the fpu while in the kernel, then
	 * we cannot call fpdisable since that does STTS. That will set the
	 * ts bit in %cr0 which will cause an exception if anything touches
	 * the fpu. However, the user-level context switch handler
	 * (fpsave_ctxt) needs to access the fpu to save the registers into
	 * the pcb. fpsave_ctxt relies on CLTS having been done to clear the
	 * ts bit in fprestore_ctxt when the thread context switched onto the
	 * CPU.
	 *
	 * Calling fpdisable only affects the current CPU's %cr0 register.
	 *
	 * During removectx and kpreempt_enable, we can voluntarily context
	 * switch, so the CPU we were on when we entered this function might
	 * not be the same one we're on when we return from removectx or end
	 * the function. Note there can be user-level context switch handlers
	 * still installed if this is a user-level thread.
	 *
	 * We also must be careful in the unlikely chance we're running in an
	 * interrupt thread, since we can't leave the CPU's %cr0 TS state set
	 * incorrectly for the "real" thread to resume on this CPU.
	 */

	if ((flags & KFPU_NO_STATE) == 0) {
		kpreempt_disable();
	} else {
		ASSERT(curthread->t_preempt > 0);
	}

	curthread->t_flag &= ~T_KFPU;

	/*
	 * When we are ending things, we explicitly don't save the current
	 * kernel FPU state back to the temporary state. The kfpu API is not
	 * intended to be a permanent save location.
	 *
	 * If this is a user-level thread and we were to context switch
	 * before returning to user-land, fpsave_ctxt will be a no-op since
	 * we already saved the user-level FPU state the first time we ran
	 * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
	 * the user-level fpu state). The fpsave_ctxt functions only save if
	 * FPU_VALID is not already set. fp_save also sets
	 * PCB_SET_UPDATE_FPU so fprestore_ctxt will be done in
	 * sys_rtt_common when the thread finally returns to user-land.
	 */

	if ((curthread->t_procp->p_flag & SSYS) != 0 &&
	    curthread->t_intr == NULL) {
		/*
		 * A kernel thread which is not an interrupt thread, so we
		 * STTS now.
		 */
		fpdisable();
	}

	if ((flags & KFPU_NO_STATE) == 0) {
		removectx(curthread, kfpu, kernel_fpu_ctx_save,
		    kernel_fpu_ctx_restore, NULL, NULL, NULL, NULL);

		if (kfpu != NULL) {
			if (kfpu->kfpu_curthread != curthread) {
				panic("attempting to end kernel FPU state "
				    "for %p, but active thread is not "
				    "curthread", kfpu);
			} else {
				kfpu->kfpu_curthread = NULL;
			}
		}

		kpreempt_enable();
	}

	if (curthread->t_lwp != NULL) {
		uint_t f;

		if (flags & KFPU_USE_LWP) {
			f = FPU_EN | FPU_KERNEL;
		} else {
			f = FPU_KERNEL;
		}
		curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
	}
}