1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * xsave/xrstor support. 4 * 5 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 6 */ 7 #include <linux/bitops.h> 8 #include <linux/compat.h> 9 #include <linux/cpu.h> 10 #include <linux/mman.h> 11 #include <linux/nospec.h> 12 #include <linux/pkeys.h> 13 #include <linux/seq_file.h> 14 #include <linux/proc_fs.h> 15 #include <linux/vmalloc.h> 16 #include <linux/coredump.h> 17 18 #include <asm/fpu/api.h> 19 #include <asm/fpu/regset.h> 20 #include <asm/fpu/signal.h> 21 #include <asm/fpu/xcr.h> 22 23 #include <asm/cpuid.h> 24 #include <asm/tlbflush.h> 25 #include <asm/prctl.h> 26 #include <asm/elf.h> 27 28 #include <uapi/asm/elf.h> 29 30 #include "context.h" 31 #include "internal.h" 32 #include "legacy.h" 33 #include "xstate.h" 34 35 #define for_each_extended_xfeature(bit, mask) \ 36 (bit) = FIRST_EXTENDED_XFEATURE; \ 37 for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) 38 39 /* 40 * Although we spell it out in here, the Processor Trace 41 * xfeature is completely unused. We use other mechanisms 42 * to save/restore PT state in Linux. 43 */ 44 static const char *xfeature_names[] = 45 { 46 "x87 floating point registers", 47 "SSE registers", 48 "AVX registers", 49 "MPX bounds registers", 50 "MPX CSR", 51 "AVX-512 opmask", 52 "AVX-512 Hi256", 53 "AVX-512 ZMM_Hi256", 54 "Processor Trace (unused)", 55 "Protection Keys User registers", 56 "PASID state", 57 "Control-flow User registers", 58 "Control-flow Kernel registers (unused)", 59 "unknown xstate feature", 60 "unknown xstate feature", 61 "unknown xstate feature", 62 "unknown xstate feature", 63 "AMX Tile config", 64 "AMX Tile data", 65 "unknown xstate feature", 66 }; 67 68 static unsigned short xsave_cpuid_features[] __initdata = { 69 [XFEATURE_FP] = X86_FEATURE_FPU, 70 [XFEATURE_SSE] = X86_FEATURE_XMM, 71 [XFEATURE_YMM] = X86_FEATURE_AVX, 72 [XFEATURE_BNDREGS] = X86_FEATURE_MPX, 73 [XFEATURE_BNDCSR] = X86_FEATURE_MPX, 74 [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, 75 [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, 76 [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, 77 [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, 78 [XFEATURE_PKRU] = X86_FEATURE_OSPKE, 79 [XFEATURE_PASID] = X86_FEATURE_ENQCMD, 80 [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, 81 [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, 82 [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, 83 }; 84 85 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = 86 { [ 0 ... XFEATURE_MAX - 1] = -1}; 87 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = 88 { [ 0 ... XFEATURE_MAX - 1] = -1}; 89 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; 90 91 #define XSTATE_FLAG_SUPERVISOR BIT(0) 92 #define XSTATE_FLAG_ALIGNED64 BIT(1) 93 94 /* 95 * Return whether the system supports a given xfeature. 96 * 97 * Also return the name of the (most advanced) feature that the caller requested: 98 */ 99 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) 100 { 101 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; 102 103 if (unlikely(feature_name)) { 104 long xfeature_idx, max_idx; 105 u64 xfeatures_print; 106 /* 107 * So we use FLS here to be able to print the most advanced 108 * feature that was requested but is missing. So if a driver 109 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the 110 * missing AVX feature - this is the most informative message 111 * to users: 112 */ 113 if (xfeatures_missing) 114 xfeatures_print = xfeatures_missing; 115 else 116 xfeatures_print = xfeatures_needed; 117 118 xfeature_idx = fls64(xfeatures_print)-1; 119 max_idx = ARRAY_SIZE(xfeature_names)-1; 120 xfeature_idx = min(xfeature_idx, max_idx); 121 122 *feature_name = xfeature_names[xfeature_idx]; 123 } 124 125 if (xfeatures_missing) 126 return 0; 127 128 return 1; 129 } 130 EXPORT_SYMBOL_GPL(cpu_has_xfeatures); 131 132 static bool xfeature_is_aligned64(int xfeature_nr) 133 { 134 return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; 135 } 136 137 static bool xfeature_is_supervisor(int xfeature_nr) 138 { 139 return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; 140 } 141 142 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) 143 { 144 unsigned int offs, i; 145 146 /* 147 * Non-compacted format and legacy features use the cached fixed 148 * offsets. 149 */ 150 if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || 151 xfeature <= XFEATURE_SSE) 152 return xstate_offsets[xfeature]; 153 154 /* 155 * Compacted format offsets depend on the actual content of the 156 * compacted xsave area which is determined by the xcomp_bv header 157 * field. 158 */ 159 offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; 160 for_each_extended_xfeature(i, xcomp_bv) { 161 if (xfeature_is_aligned64(i)) 162 offs = ALIGN(offs, 64); 163 if (i == xfeature) 164 break; 165 offs += xstate_sizes[i]; 166 } 167 return offs; 168 } 169 170 /* 171 * Enable the extended processor state save/restore feature. 172 * Called once per CPU onlining. 173 */ 174 void fpu__init_cpu_xstate(void) 175 { 176 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) 177 return; 178 179 cr4_set_bits(X86_CR4_OSXSAVE); 180 181 /* 182 * Must happen after CR4 setup and before xsetbv() to allow KVM 183 * lazy passthrough. Write independent of the dynamic state static 184 * key as that does not work on the boot CPU. This also ensures 185 * that any stale state is wiped out from XFD. Reset the per CPU 186 * xfd cache too. 187 */ 188 if (cpu_feature_enabled(X86_FEATURE_XFD)) 189 xfd_set_state(init_fpstate.xfd); 190 191 /* 192 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features 193 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user 194 * states can be set here. 195 */ 196 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 197 198 /* 199 * MSR_IA32_XSS sets supervisor states managed by XSAVES. 200 */ 201 if (boot_cpu_has(X86_FEATURE_XSAVES)) { 202 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 203 xfeatures_mask_independent()); 204 } 205 } 206 207 static bool xfeature_enabled(enum xfeature xfeature) 208 { 209 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); 210 } 211 212 /* 213 * Record the offsets and sizes of various xstates contained 214 * in the XSAVE state memory layout. 215 */ 216 static void __init setup_xstate_cache(void) 217 { 218 u32 eax, ebx, ecx, edx, i; 219 /* start at the beginning of the "extended state" */ 220 unsigned int last_good_offset = offsetof(struct xregs_state, 221 extended_state_area); 222 /* 223 * The FP xstates and SSE xstates are legacy states. They are always 224 * in the fixed offsets in the xsave area in either compacted form 225 * or standard form. 226 */ 227 xstate_offsets[XFEATURE_FP] = 0; 228 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, 229 xmm_space); 230 231 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; 232 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, 233 xmm_space); 234 235 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 236 cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); 237 238 xstate_sizes[i] = eax; 239 xstate_flags[i] = ecx; 240 241 /* 242 * If an xfeature is supervisor state, the offset in EBX is 243 * invalid, leave it to -1. 244 */ 245 if (xfeature_is_supervisor(i)) 246 continue; 247 248 xstate_offsets[i] = ebx; 249 250 /* 251 * In our xstate size checks, we assume that the highest-numbered 252 * xstate feature has the highest offset in the buffer. Ensure 253 * it does. 254 */ 255 WARN_ONCE(last_good_offset > xstate_offsets[i], 256 "x86/fpu: misordered xstate at %d\n", last_good_offset); 257 258 last_good_offset = xstate_offsets[i]; 259 } 260 } 261 262 /* 263 * Print out all the supported xstate features: 264 */ 265 static void __init print_xstate_features(void) 266 { 267 int i; 268 269 for (i = 0; i < XFEATURE_MAX; i++) { 270 u64 mask = BIT_ULL(i); 271 const char *name; 272 273 if (cpu_has_xfeatures(mask, &name)) 274 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); 275 } 276 } 277 278 /* 279 * This check is important because it is easy to get XSTATE_* 280 * confused with XSTATE_BIT_*. 281 */ 282 #define CHECK_XFEATURE(nr) do { \ 283 WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ 284 WARN_ON(nr >= XFEATURE_MAX); \ 285 } while (0) 286 287 /* 288 * Print out xstate component offsets and sizes 289 */ 290 static void __init print_xstate_offset_size(void) 291 { 292 int i; 293 294 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 295 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", 296 i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), 297 i, xstate_sizes[i]); 298 } 299 } 300 301 /* 302 * This function is called only during boot time when x86 caps are not set 303 * up and alternative can not be used yet. 304 */ 305 static __init void os_xrstor_booting(struct xregs_state *xstate) 306 { 307 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; 308 u32 lmask = mask; 309 u32 hmask = mask >> 32; 310 int err; 311 312 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) 313 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); 314 else 315 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); 316 317 /* 318 * We should never fault when copying from a kernel buffer, and the FPU 319 * state we set at boot time should be valid. 320 */ 321 WARN_ON_FPU(err); 322 } 323 324 /* 325 * All supported features have either init state all zeros or are 326 * handled in setup_init_fpu() individually. This is an explicit 327 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch 328 * newly added supported features at build time and make people 329 * actually look at the init state for the new feature. 330 */ 331 #define XFEATURES_INIT_FPSTATE_HANDLED \ 332 (XFEATURE_MASK_FP | \ 333 XFEATURE_MASK_SSE | \ 334 XFEATURE_MASK_YMM | \ 335 XFEATURE_MASK_OPMASK | \ 336 XFEATURE_MASK_ZMM_Hi256 | \ 337 XFEATURE_MASK_Hi16_ZMM | \ 338 XFEATURE_MASK_PKRU | \ 339 XFEATURE_MASK_BNDREGS | \ 340 XFEATURE_MASK_BNDCSR | \ 341 XFEATURE_MASK_PASID | \ 342 XFEATURE_MASK_CET_USER | \ 343 XFEATURE_MASK_XTILE) 344 345 /* 346 * setup the xstate image representing the init state 347 */ 348 static void __init setup_init_fpu_buf(void) 349 { 350 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | 351 XFEATURE_MASK_SUPERVISOR_SUPPORTED) != 352 XFEATURES_INIT_FPSTATE_HANDLED); 353 354 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 355 return; 356 357 print_xstate_features(); 358 359 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); 360 361 /* 362 * Init all the features state with header.xfeatures being 0x0 363 */ 364 os_xrstor_booting(&init_fpstate.regs.xsave); 365 366 /* 367 * All components are now in init state. Read the state back so 368 * that init_fpstate contains all non-zero init state. This only 369 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because 370 * those use the init optimization which skips writing data for 371 * components in init state. 372 * 373 * XSAVE could be used, but that would require to reshuffle the 374 * data when XSAVEC/S is available because XSAVEC/S uses xstate 375 * compaction. But doing so is a pointless exercise because most 376 * components have an all zeros init state except for the legacy 377 * ones (FP and SSE). Those can be saved with FXSAVE into the 378 * legacy area. Adding new features requires to ensure that init 379 * state is all zeroes or if not to add the necessary handling 380 * here. 381 */ 382 fxsave(&init_fpstate.regs.fxsave); 383 } 384 385 int xfeature_size(int xfeature_nr) 386 { 387 u32 eax, ebx, ecx, edx; 388 389 CHECK_XFEATURE(xfeature_nr); 390 cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); 391 return eax; 392 } 393 394 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ 395 static int validate_user_xstate_header(const struct xstate_header *hdr, 396 struct fpstate *fpstate) 397 { 398 /* No unknown or supervisor features may be set */ 399 if (hdr->xfeatures & ~fpstate->user_xfeatures) 400 return -EINVAL; 401 402 /* Userspace must use the uncompacted format */ 403 if (hdr->xcomp_bv) 404 return -EINVAL; 405 406 /* 407 * If 'reserved' is shrunken to add a new field, make sure to validate 408 * that new field here! 409 */ 410 BUILD_BUG_ON(sizeof(hdr->reserved) != 48); 411 412 /* No reserved bits may be set */ 413 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) 414 return -EINVAL; 415 416 return 0; 417 } 418 419 static void __init __xstate_dump_leaves(void) 420 { 421 int i; 422 u32 eax, ebx, ecx, edx; 423 static int should_dump = 1; 424 425 if (!should_dump) 426 return; 427 should_dump = 0; 428 /* 429 * Dump out a few leaves past the ones that we support 430 * just in case there are some goodies up there 431 */ 432 for (i = 0; i < XFEATURE_MAX + 10; i++) { 433 cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); 434 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", 435 CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); 436 } 437 } 438 439 #define XSTATE_WARN_ON(x, fmt, ...) do { \ 440 if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ 441 __xstate_dump_leaves(); \ 442 } \ 443 } while (0) 444 445 #define XCHECK_SZ(sz, nr, __struct) ({ \ 446 if (WARN_ONCE(sz != sizeof(__struct), \ 447 "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ 448 xfeature_names[nr], sizeof(__struct), sz)) { \ 449 __xstate_dump_leaves(); \ 450 } \ 451 true; \ 452 }) 453 454 455 /** 456 * check_xtile_data_against_struct - Check tile data state size. 457 * 458 * Calculate the state size by multiplying the single tile size which is 459 * recorded in a C struct, and the number of tiles that the CPU informs. 460 * Compare the provided size with the calculation. 461 * 462 * @size: The tile data state size 463 * 464 * Returns: 0 on success, -EINVAL on mismatch. 465 */ 466 static int __init check_xtile_data_against_struct(int size) 467 { 468 u32 max_palid, palid, state_size; 469 u32 eax, ebx, ecx, edx; 470 u16 max_tile; 471 472 /* 473 * Check the maximum palette id: 474 * eax: the highest numbered palette subleaf. 475 */ 476 cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); 477 478 /* 479 * Cross-check each tile size and find the maximum number of 480 * supported tiles. 481 */ 482 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { 483 u16 tile_size, max; 484 485 /* 486 * Check the tile size info: 487 * eax[31:16]: bytes per title 488 * ebx[31:16]: the max names (or max number of tiles) 489 */ 490 cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); 491 tile_size = eax >> 16; 492 max = ebx >> 16; 493 494 if (tile_size != sizeof(struct xtile_data)) { 495 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", 496 __stringify(XFEATURE_XTILE_DATA), 497 sizeof(struct xtile_data), tile_size); 498 __xstate_dump_leaves(); 499 return -EINVAL; 500 } 501 502 if (max > max_tile) 503 max_tile = max; 504 } 505 506 state_size = sizeof(struct xtile_data) * max_tile; 507 if (size != state_size) { 508 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", 509 __stringify(XFEATURE_XTILE_DATA), state_size, size); 510 __xstate_dump_leaves(); 511 return -EINVAL; 512 } 513 return 0; 514 } 515 516 /* 517 * We have a C struct for each 'xstate'. We need to ensure 518 * that our software representation matches what the CPU 519 * tells us about the state's size. 520 */ 521 static bool __init check_xstate_against_struct(int nr) 522 { 523 /* 524 * Ask the CPU for the size of the state. 525 */ 526 int sz = xfeature_size(nr); 527 528 /* 529 * Match each CPU state with the corresponding software 530 * structure. 531 */ 532 switch (nr) { 533 case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); 534 case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); 535 case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); 536 case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); 537 case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); 538 case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); 539 case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); 540 case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); 541 case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); 542 case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); 543 case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; 544 default: 545 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); 546 return false; 547 } 548 549 return true; 550 } 551 552 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) 553 { 554 unsigned int topmost = fls64(xfeatures) - 1; 555 unsigned int offset = xstate_offsets[topmost]; 556 557 if (topmost <= XFEATURE_SSE) 558 return sizeof(struct xregs_state); 559 560 if (compacted) 561 offset = xfeature_get_offset(xfeatures, topmost); 562 return offset + xstate_sizes[topmost]; 563 } 564 565 /* 566 * This essentially double-checks what the cpu told us about 567 * how large the XSAVE buffer needs to be. We are recalculating 568 * it to be safe. 569 * 570 * Independent XSAVE features allocate their own buffers and are not 571 * covered by these checks. Only the size of the buffer for task->fpu 572 * is checked here. 573 */ 574 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) 575 { 576 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); 577 bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); 578 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; 579 int i; 580 581 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { 582 if (!check_xstate_against_struct(i)) 583 return false; 584 /* 585 * Supervisor state components can be managed only by 586 * XSAVES. 587 */ 588 if (!xsaves && xfeature_is_supervisor(i)) { 589 XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); 590 return false; 591 } 592 } 593 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); 594 XSTATE_WARN_ON(size != kernel_size, 595 "size %u != kernel_size %u\n", size, kernel_size); 596 return size == kernel_size; 597 } 598 599 /* 600 * Get total size of enabled xstates in XCR0 | IA32_XSS. 601 * 602 * Note the SDM's wording here. "sub-function 0" only enumerates 603 * the size of the *user* states. If we use it to size a buffer 604 * that we use 'XSAVES' on, we could potentially overflow the 605 * buffer because 'XSAVES' saves system states too. 606 * 607 * This also takes compaction into account. So this works for 608 * XSAVEC as well. 609 */ 610 static unsigned int __init get_compacted_size(void) 611 { 612 unsigned int eax, ebx, ecx, edx; 613 /* 614 * - CPUID function 0DH, sub-function 1: 615 * EBX enumerates the size (in bytes) required by 616 * the XSAVES instruction for an XSAVE area 617 * containing all the state components 618 * corresponding to bits currently set in 619 * XCR0 | IA32_XSS. 620 * 621 * When XSAVES is not available but XSAVEC is (virt), then there 622 * are no supervisor states, but XSAVEC still uses compacted 623 * format. 624 */ 625 cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); 626 return ebx; 627 } 628 629 /* 630 * Get the total size of the enabled xstates without the independent supervisor 631 * features. 632 */ 633 static unsigned int __init get_xsave_compacted_size(void) 634 { 635 u64 mask = xfeatures_mask_independent(); 636 unsigned int size; 637 638 if (!mask) 639 return get_compacted_size(); 640 641 /* Disable independent features. */ 642 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); 643 644 /* 645 * Ask the hardware what size is required of the buffer. 646 * This is the size required for the task->fpu buffer. 647 */ 648 size = get_compacted_size(); 649 650 /* Re-enable independent features so XSAVES will work on them again. */ 651 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); 652 653 return size; 654 } 655 656 static unsigned int __init get_xsave_size_user(void) 657 { 658 unsigned int eax, ebx, ecx, edx; 659 /* 660 * - CPUID function 0DH, sub-function 0: 661 * EBX enumerates the size (in bytes) required by 662 * the XSAVE instruction for an XSAVE area 663 * containing all the *user* state components 664 * corresponding to bits currently set in XCR0. 665 */ 666 cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); 667 return ebx; 668 } 669 670 static int __init init_xstate_size(void) 671 { 672 /* Recompute the context size for enabled features: */ 673 unsigned int user_size, kernel_size, kernel_default_size; 674 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); 675 676 /* Uncompacted user space size */ 677 user_size = get_xsave_size_user(); 678 679 /* 680 * XSAVES kernel size includes supervisor states and uses compacted 681 * format. XSAVEC uses compacted format, but does not save 682 * supervisor states. 683 * 684 * XSAVE[OPT] do not support supervisor states so kernel and user 685 * size is identical. 686 */ 687 if (compacted) 688 kernel_size = get_xsave_compacted_size(); 689 else 690 kernel_size = user_size; 691 692 kernel_default_size = 693 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); 694 695 if (!paranoid_xstate_size_valid(kernel_size)) 696 return -EINVAL; 697 698 fpu_kernel_cfg.max_size = kernel_size; 699 fpu_user_cfg.max_size = user_size; 700 701 fpu_kernel_cfg.default_size = kernel_default_size; 702 fpu_user_cfg.default_size = 703 xstate_calculate_size(fpu_user_cfg.default_features, false); 704 705 return 0; 706 } 707 708 /* 709 * We enabled the XSAVE hardware, but something went wrong and 710 * we can not use it. Disable it. 711 */ 712 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) 713 { 714 fpu_kernel_cfg.max_features = 0; 715 cr4_clear_bits(X86_CR4_OSXSAVE); 716 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 717 718 /* Restore the legacy size.*/ 719 fpu_kernel_cfg.max_size = legacy_size; 720 fpu_kernel_cfg.default_size = legacy_size; 721 fpu_user_cfg.max_size = legacy_size; 722 fpu_user_cfg.default_size = legacy_size; 723 724 /* 725 * Prevent enabling the static branch which enables writes to the 726 * XFD MSR. 727 */ 728 init_fpstate.xfd = 0; 729 730 fpstate_reset(¤t->thread.fpu); 731 } 732 733 /* 734 * Enable and initialize the xsave feature. 735 * Called once per system bootup. 736 */ 737 void __init fpu__init_system_xstate(unsigned int legacy_size) 738 { 739 unsigned int eax, ebx, ecx, edx; 740 u64 xfeatures; 741 int err; 742 int i; 743 744 if (!boot_cpu_has(X86_FEATURE_FPU)) { 745 pr_info("x86/fpu: No FPU detected\n"); 746 return; 747 } 748 749 if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 750 pr_info("x86/fpu: x87 FPU will use %s\n", 751 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); 752 return; 753 } 754 755 /* 756 * Find user xstates supported by the processor. 757 */ 758 cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); 759 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); 760 761 /* 762 * Find supervisor xstates supported by the processor. 763 */ 764 cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); 765 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); 766 767 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { 768 /* 769 * This indicates that something really unexpected happened 770 * with the enumeration. Disable XSAVE and try to continue 771 * booting without it. This is too early to BUG(). 772 */ 773 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", 774 fpu_kernel_cfg.max_features); 775 goto out_disable; 776 } 777 778 fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & 779 XFEATURE_MASK_INDEPENDENT; 780 781 /* 782 * Clear XSAVE features that are disabled in the normal CPUID. 783 */ 784 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { 785 unsigned short cid = xsave_cpuid_features[i]; 786 787 /* Careful: X86_FEATURE_FPU is 0! */ 788 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) 789 fpu_kernel_cfg.max_features &= ~BIT_ULL(i); 790 } 791 792 if (!cpu_feature_enabled(X86_FEATURE_XFD)) 793 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; 794 795 if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) 796 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; 797 else 798 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | 799 XFEATURE_MASK_SUPERVISOR_SUPPORTED; 800 801 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; 802 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; 803 804 /* Clean out dynamic features from default */ 805 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; 806 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 807 808 fpu_user_cfg.default_features = fpu_user_cfg.max_features; 809 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; 810 811 /* Store it for paranoia check at the end */ 812 xfeatures = fpu_kernel_cfg.max_features; 813 814 /* 815 * Initialize the default XFD state in initfp_state and enable the 816 * dynamic sizing mechanism if dynamic states are available. The 817 * static key cannot be enabled here because this runs before 818 * jump_label_init(). This is delayed to an initcall. 819 */ 820 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; 821 822 /* Set up compaction feature bit */ 823 if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || 824 cpu_feature_enabled(X86_FEATURE_XSAVES)) 825 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); 826 827 /* Enable xstate instructions to be able to continue with initialization: */ 828 fpu__init_cpu_xstate(); 829 830 /* Cache size, offset and flags for initialization */ 831 setup_xstate_cache(); 832 833 err = init_xstate_size(); 834 if (err) 835 goto out_disable; 836 837 /* Reset the state for the current task */ 838 fpstate_reset(¤t->thread.fpu); 839 840 /* 841 * Update info used for ptrace frames; use standard-format size and no 842 * supervisor xstates: 843 */ 844 update_regset_xstate_info(fpu_user_cfg.max_size, 845 fpu_user_cfg.max_features); 846 847 /* 848 * init_fpstate excludes dynamic states as they are large but init 849 * state is zero. 850 */ 851 init_fpstate.size = fpu_kernel_cfg.default_size; 852 init_fpstate.xfeatures = fpu_kernel_cfg.default_features; 853 854 if (init_fpstate.size > sizeof(init_fpstate.regs)) { 855 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", 856 sizeof(init_fpstate.regs), init_fpstate.size); 857 goto out_disable; 858 } 859 860 setup_init_fpu_buf(); 861 862 /* 863 * Paranoia check whether something in the setup modified the 864 * xfeatures mask. 865 */ 866 if (xfeatures != fpu_kernel_cfg.max_features) { 867 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", 868 xfeatures, fpu_kernel_cfg.max_features); 869 goto out_disable; 870 } 871 872 /* 873 * CPU capabilities initialization runs before FPU init. So 874 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely 875 * functional, set the feature bit so depending code works. 876 */ 877 setup_force_cpu_cap(X86_FEATURE_OSXSAVE); 878 879 print_xstate_offset_size(); 880 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", 881 fpu_kernel_cfg.max_features, 882 fpu_kernel_cfg.max_size, 883 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); 884 return; 885 886 out_disable: 887 /* something went wrong, try to boot without any XSAVE support */ 888 fpu__init_disable_system_xstate(legacy_size); 889 } 890 891 /* 892 * Restore minimal FPU state after suspend: 893 */ 894 void fpu__resume_cpu(void) 895 { 896 /* 897 * Restore XCR0 on xsave capable CPUs: 898 */ 899 if (cpu_feature_enabled(X86_FEATURE_XSAVE)) 900 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); 901 902 /* 903 * Restore IA32_XSS. The same CPUID bit enumerates support 904 * of XSAVES and MSR_IA32_XSS. 905 */ 906 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { 907 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | 908 xfeatures_mask_independent()); 909 } 910 911 if (fpu_state_size_dynamic()) 912 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); 913 } 914 915 /* 916 * Given an xstate feature nr, calculate where in the xsave 917 * buffer the state is. Callers should ensure that the buffer 918 * is valid. 919 */ 920 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 921 { 922 u64 xcomp_bv = xsave->header.xcomp_bv; 923 924 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 925 return NULL; 926 927 if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { 928 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) 929 return NULL; 930 } 931 932 return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); 933 } 934 935 /* 936 * Given the xsave area and a state inside, this function returns the 937 * address of the state. 938 * 939 * This is the API that is called to get xstate address in either 940 * standard format or compacted format of xsave area. 941 * 942 * Note that if there is no data for the field in the xsave buffer 943 * this will return NULL. 944 * 945 * Inputs: 946 * xstate: the thread's storage area for all FPU data 947 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, 948 * XFEATURE_SSE, etc...) 949 * Output: 950 * address of the state in the xsave area, or NULL if the 951 * field is not present in the xsave buffer. 952 */ 953 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) 954 { 955 /* 956 * Do we even *have* xsave state? 957 */ 958 if (!boot_cpu_has(X86_FEATURE_XSAVE)) 959 return NULL; 960 961 /* 962 * We should not ever be requesting features that we 963 * have not enabled. 964 */ 965 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 966 return NULL; 967 968 /* 969 * This assumes the last 'xsave*' instruction to 970 * have requested that 'xfeature_nr' be saved. 971 * If it did not, we might be seeing and old value 972 * of the field in the buffer. 973 * 974 * This can happen because the last 'xsave' did not 975 * request that this feature be saved (unlikely) 976 * or because the "init optimization" caused it 977 * to not be saved. 978 */ 979 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) 980 return NULL; 981 982 return __raw_xsave_addr(xsave, xfeature_nr); 983 } 984 EXPORT_SYMBOL_GPL(get_xsave_addr); 985 986 /* 987 * Given an xstate feature nr, calculate where in the xsave buffer the state is. 988 * The xsave buffer should be in standard format, not compacted (e.g. user mode 989 * signal frames). 990 */ 991 void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) 992 { 993 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 994 return NULL; 995 996 return (void __user *)xsave + xstate_offsets[xfeature_nr]; 997 } 998 999 #ifdef CONFIG_ARCH_HAS_PKEYS 1000 1001 /* 1002 * This will go out and modify PKRU register to set the access 1003 * rights for @pkey to @init_val. 1004 */ 1005 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 1006 unsigned long init_val) 1007 { 1008 u32 old_pkru, new_pkru_bits = 0; 1009 int pkey_shift; 1010 1011 /* 1012 * This check implies XSAVE support. OSPKE only gets 1013 * set if we enable XSAVE and we enable PKU in XCR0. 1014 */ 1015 if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) 1016 return -EINVAL; 1017 1018 /* 1019 * This code should only be called with valid 'pkey' 1020 * values originating from in-kernel users. Complain 1021 * if a bad value is observed. 1022 */ 1023 if (WARN_ON_ONCE(pkey >= arch_max_pkey())) 1024 return -EINVAL; 1025 1026 /* Set the bits we need in PKRU: */ 1027 if (init_val & PKEY_DISABLE_ACCESS) 1028 new_pkru_bits |= PKRU_AD_BIT; 1029 if (init_val & PKEY_DISABLE_WRITE) 1030 new_pkru_bits |= PKRU_WD_BIT; 1031 1032 /* Shift the bits in to the correct place in PKRU for pkey: */ 1033 pkey_shift = pkey * PKRU_BITS_PER_PKEY; 1034 new_pkru_bits <<= pkey_shift; 1035 1036 /* Get old PKRU and mask off any old bits in place: */ 1037 old_pkru = read_pkru(); 1038 old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 1039 1040 /* Write old part along with new part: */ 1041 write_pkru(old_pkru | new_pkru_bits); 1042 1043 return 0; 1044 } 1045 #endif /* ! CONFIG_ARCH_HAS_PKEYS */ 1046 1047 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, 1048 void *init_xstate, unsigned int size) 1049 { 1050 membuf_write(to, from_xstate ? xstate : init_xstate, size); 1051 } 1052 1053 /** 1054 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1055 * @to: membuf descriptor 1056 * @fpstate: The fpstate buffer from which to copy 1057 * @xfeatures: The mask of xfeatures to save (XSAVE mode only) 1058 * @pkru_val: The PKRU value to store in the PKRU component 1059 * @copy_mode: The requested copy mode 1060 * 1061 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1062 * format, i.e. from the kernel internal hardware dependent storage format 1063 * to the requested @mode. UABI XSTATE is always uncompacted! 1064 * 1065 * It supports partial copy but @to.pos always starts from zero. 1066 */ 1067 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, 1068 u64 xfeatures, u32 pkru_val, 1069 enum xstate_copy_mode copy_mode) 1070 { 1071 const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); 1072 struct xregs_state *xinit = &init_fpstate.regs.xsave; 1073 struct xregs_state *xsave = &fpstate->regs.xsave; 1074 struct xstate_header header; 1075 unsigned int zerofrom; 1076 u64 mask; 1077 int i; 1078 1079 memset(&header, 0, sizeof(header)); 1080 header.xfeatures = xsave->header.xfeatures; 1081 1082 /* Mask out the feature bits depending on copy mode */ 1083 switch (copy_mode) { 1084 case XSTATE_COPY_FP: 1085 header.xfeatures &= XFEATURE_MASK_FP; 1086 break; 1087 1088 case XSTATE_COPY_FX: 1089 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; 1090 break; 1091 1092 case XSTATE_COPY_XSAVE: 1093 header.xfeatures &= fpstate->user_xfeatures & xfeatures; 1094 break; 1095 } 1096 1097 /* Copy FP state up to MXCSR */ 1098 copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, 1099 &xinit->i387, off_mxcsr); 1100 1101 /* Copy MXCSR when SSE or YMM are set in the feature mask */ 1102 copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), 1103 &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, 1104 MXCSR_AND_FLAGS_SIZE); 1105 1106 /* Copy the remaining FP state */ 1107 copy_feature(header.xfeatures & XFEATURE_MASK_FP, 1108 &to, &xsave->i387.st_space, &xinit->i387.st_space, 1109 sizeof(xsave->i387.st_space)); 1110 1111 /* Copy the SSE state - shared with YMM, but independently managed */ 1112 copy_feature(header.xfeatures & XFEATURE_MASK_SSE, 1113 &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, 1114 sizeof(xsave->i387.xmm_space)); 1115 1116 if (copy_mode != XSTATE_COPY_XSAVE) 1117 goto out; 1118 1119 /* Zero the padding area */ 1120 membuf_zero(&to, sizeof(xsave->i387.padding)); 1121 1122 /* Copy xsave->i387.sw_reserved */ 1123 membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); 1124 1125 /* Copy the user space relevant state of @xsave->header */ 1126 membuf_write(&to, &header, sizeof(header)); 1127 1128 zerofrom = offsetof(struct xregs_state, extended_state_area); 1129 1130 /* 1131 * This 'mask' indicates which states to copy from fpstate. 1132 * Those extended states that are not present in fpstate are 1133 * either disabled or initialized: 1134 * 1135 * In non-compacted format, disabled features still occupy 1136 * state space but there is no state to copy from in the 1137 * compacted init_fpstate. The gap tracking will zero these 1138 * states. 1139 * 1140 * The extended features have an all zeroes init state. Thus, 1141 * remove them from 'mask' to zero those features in the user 1142 * buffer instead of retrieving them from init_fpstate. 1143 */ 1144 mask = header.xfeatures; 1145 1146 for_each_extended_xfeature(i, mask) { 1147 /* 1148 * If there was a feature or alignment gap, zero the space 1149 * in the destination buffer. 1150 */ 1151 if (zerofrom < xstate_offsets[i]) 1152 membuf_zero(&to, xstate_offsets[i] - zerofrom); 1153 1154 if (i == XFEATURE_PKRU) { 1155 struct pkru_state pkru = {0}; 1156 /* 1157 * PKRU is not necessarily up to date in the 1158 * XSAVE buffer. Use the provided value. 1159 */ 1160 pkru.pkru = pkru_val; 1161 membuf_write(&to, &pkru, sizeof(pkru)); 1162 } else { 1163 membuf_write(&to, 1164 __raw_xsave_addr(xsave, i), 1165 xstate_sizes[i]); 1166 } 1167 /* 1168 * Keep track of the last copied state in the non-compacted 1169 * target buffer for gap zeroing. 1170 */ 1171 zerofrom = xstate_offsets[i] + xstate_sizes[i]; 1172 } 1173 1174 out: 1175 if (to.left) 1176 membuf_zero(&to, to.left); 1177 } 1178 1179 /** 1180 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer 1181 * @to: membuf descriptor 1182 * @tsk: The task from which to copy the saved xstate 1183 * @copy_mode: The requested copy mode 1184 * 1185 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming 1186 * format, i.e. from the kernel internal hardware dependent storage format 1187 * to the requested @mode. UABI XSTATE is always uncompacted! 1188 * 1189 * It supports partial copy but @to.pos always starts from zero. 1190 */ 1191 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, 1192 enum xstate_copy_mode copy_mode) 1193 { 1194 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, 1195 tsk->thread.fpu.fpstate->user_xfeatures, 1196 tsk->thread.pkru, copy_mode); 1197 } 1198 1199 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, 1200 const void *kbuf, const void __user *ubuf) 1201 { 1202 if (kbuf) { 1203 memcpy(dst, kbuf + offset, size); 1204 } else { 1205 if (copy_from_user(dst, ubuf + offset, size)) 1206 return -EFAULT; 1207 } 1208 return 0; 1209 } 1210 1211 1212 /** 1213 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate 1214 * @fpstate: The fpstate buffer to copy to 1215 * @kbuf: The UABI format buffer, if it comes from the kernel 1216 * @ubuf: The UABI format buffer, if it comes from userspace 1217 * @pkru: The location to write the PKRU value to 1218 * 1219 * Converts from the UABI format into the kernel internal hardware 1220 * dependent format. 1221 * 1222 * This function ultimately has three different callers with distinct PKRU 1223 * behavior. 1224 * 1. When called from sigreturn the PKRU register will be restored from 1225 * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to 1226 * @fpstate is sufficient to cover this case, but the caller will also 1227 * pass a pointer to the thread_struct's pkru field in @pkru and updating 1228 * it is harmless. 1229 * 2. When called from ptrace the PKRU register will be restored from the 1230 * thread_struct's pkru field. A pointer to that is passed in @pkru. 1231 * The kernel will restore it manually, so the XRSTOR behavior that resets 1232 * the PKRU register to the hardware init value (0) if the corresponding 1233 * xfeatures bit is not set is emulated here. 1234 * 3. When called from KVM the PKRU register will be restored from the vcpu's 1235 * pkru field. A pointer to that is passed in @pkru. KVM hasn't used 1236 * XRSTOR and hasn't had the PKRU resetting behavior described above. To 1237 * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures 1238 * bit is not set. 1239 */ 1240 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, 1241 const void __user *ubuf, u32 *pkru) 1242 { 1243 struct xregs_state *xsave = &fpstate->regs.xsave; 1244 unsigned int offset, size; 1245 struct xstate_header hdr; 1246 u64 mask; 1247 int i; 1248 1249 offset = offsetof(struct xregs_state, header); 1250 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) 1251 return -EFAULT; 1252 1253 if (validate_user_xstate_header(&hdr, fpstate)) 1254 return -EINVAL; 1255 1256 /* Validate MXCSR when any of the related features is in use */ 1257 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; 1258 if (hdr.xfeatures & mask) { 1259 u32 mxcsr[2]; 1260 1261 offset = offsetof(struct fxregs_state, mxcsr); 1262 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) 1263 return -EFAULT; 1264 1265 /* Reserved bits in MXCSR must be zero. */ 1266 if (mxcsr[0] & ~mxcsr_feature_mask) 1267 return -EINVAL; 1268 1269 /* SSE and YMM require MXCSR even when FP is not in use. */ 1270 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { 1271 xsave->i387.mxcsr = mxcsr[0]; 1272 xsave->i387.mxcsr_mask = mxcsr[1]; 1273 } 1274 } 1275 1276 for (i = 0; i < XFEATURE_MAX; i++) { 1277 mask = BIT_ULL(i); 1278 1279 if (hdr.xfeatures & mask) { 1280 void *dst = __raw_xsave_addr(xsave, i); 1281 1282 offset = xstate_offsets[i]; 1283 size = xstate_sizes[i]; 1284 1285 if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) 1286 return -EFAULT; 1287 } 1288 } 1289 1290 if (hdr.xfeatures & XFEATURE_MASK_PKRU) { 1291 struct pkru_state *xpkru; 1292 1293 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); 1294 *pkru = xpkru->pkru; 1295 } else { 1296 /* 1297 * KVM may pass NULL here to indicate that it does not need 1298 * PKRU updated. 1299 */ 1300 if (pkru) 1301 *pkru = 0; 1302 } 1303 1304 /* 1305 * The state that came in from userspace was user-state only. 1306 * Mask all the user states out of 'xfeatures': 1307 */ 1308 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; 1309 1310 /* 1311 * Add back in the features that came in from userspace: 1312 */ 1313 xsave->header.xfeatures |= hdr.xfeatures; 1314 1315 return 0; 1316 } 1317 1318 /* 1319 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] 1320 * format and copy to the target thread. Used by ptrace and KVM. 1321 */ 1322 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) 1323 { 1324 return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); 1325 } 1326 1327 /* 1328 * Convert from a sigreturn standard-format user-space buffer to kernel 1329 * XSAVE[S] format and copy to the target thread. This is called from the 1330 * sigreturn() and rt_sigreturn() system calls. 1331 */ 1332 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, 1333 const void __user *ubuf) 1334 { 1335 return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); 1336 } 1337 1338 static bool validate_independent_components(u64 mask) 1339 { 1340 u64 xchk; 1341 1342 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) 1343 return false; 1344 1345 xchk = ~xfeatures_mask_independent(); 1346 1347 if (WARN_ON_ONCE(!mask || mask & xchk)) 1348 return false; 1349 1350 return true; 1351 } 1352 1353 /** 1354 * xsaves - Save selected components to a kernel xstate buffer 1355 * @xstate: Pointer to the buffer 1356 * @mask: Feature mask to select the components to save 1357 * 1358 * The @xstate buffer must be 64 byte aligned and correctly initialized as 1359 * XSAVES does not write the full xstate header. Before first use the 1360 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer 1361 * can #GP. 1362 * 1363 * The feature mask must be a subset of the independent features. 1364 */ 1365 void xsaves(struct xregs_state *xstate, u64 mask) 1366 { 1367 int err; 1368 1369 if (!validate_independent_components(mask)) 1370 return; 1371 1372 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); 1373 WARN_ON_ONCE(err); 1374 } 1375 1376 /** 1377 * xrstors - Restore selected components from a kernel xstate buffer 1378 * @xstate: Pointer to the buffer 1379 * @mask: Feature mask to select the components to restore 1380 * 1381 * The @xstate buffer must be 64 byte aligned and correctly initialized 1382 * otherwise XRSTORS from that buffer can #GP. 1383 * 1384 * Proper usage is to restore the state which was saved with 1385 * xsaves() into @xstate. 1386 * 1387 * The feature mask must be a subset of the independent features. 1388 */ 1389 void xrstors(struct xregs_state *xstate, u64 mask) 1390 { 1391 int err; 1392 1393 if (!validate_independent_components(mask)) 1394 return; 1395 1396 XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); 1397 WARN_ON_ONCE(err); 1398 } 1399 1400 #if IS_ENABLED(CONFIG_KVM) 1401 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) 1402 { 1403 void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); 1404 1405 if (addr) 1406 memset(addr, 0, xstate_sizes[xfeature]); 1407 } 1408 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); 1409 #endif 1410 1411 #ifdef CONFIG_X86_64 1412 1413 #ifdef CONFIG_X86_DEBUG_FPU 1414 /* 1415 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask 1416 * can safely operate on the @fpstate buffer. 1417 */ 1418 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) 1419 { 1420 u64 xfd = __this_cpu_read(xfd_state); 1421 1422 if (fpstate->xfd == xfd) 1423 return true; 1424 1425 /* 1426 * The XFD MSR does not match fpstate->xfd. That's invalid when 1427 * the passed in fpstate is current's fpstate. 1428 */ 1429 if (fpstate->xfd == current->thread.fpu.fpstate->xfd) 1430 return false; 1431 1432 /* 1433 * XRSTOR(S) from init_fpstate are always correct as it will just 1434 * bring all components into init state and not read from the 1435 * buffer. XSAVE(S) raises #PF after init. 1436 */ 1437 if (fpstate == &init_fpstate) 1438 return rstor; 1439 1440 /* 1441 * XSAVE(S): clone(), fpu_swap_kvm_fpstate() 1442 * XRSTORS(S): fpu_swap_kvm_fpstate() 1443 */ 1444 1445 /* 1446 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch 1447 * the buffer area for XFD-disabled state components. 1448 */ 1449 mask &= ~xfd; 1450 1451 /* 1452 * Remove features which are valid in fpstate. They 1453 * have space allocated in fpstate. 1454 */ 1455 mask &= ~fpstate->xfeatures; 1456 1457 /* 1458 * Any remaining state components in 'mask' might be written 1459 * by XSAVE/XRSTOR. Fail validation it found. 1460 */ 1461 return !mask; 1462 } 1463 1464 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) 1465 { 1466 WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); 1467 } 1468 #endif /* CONFIG_X86_DEBUG_FPU */ 1469 1470 static int __init xfd_update_static_branch(void) 1471 { 1472 /* 1473 * If init_fpstate.xfd has bits set then dynamic features are 1474 * available and the dynamic sizing must be enabled. 1475 */ 1476 if (init_fpstate.xfd) 1477 static_branch_enable(&__fpu_state_size_dynamic); 1478 return 0; 1479 } 1480 arch_initcall(xfd_update_static_branch) 1481 1482 void fpstate_free(struct fpu *fpu) 1483 { 1484 if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) 1485 vfree(fpu->fpstate); 1486 } 1487 1488 /** 1489 * fpstate_realloc - Reallocate struct fpstate for the requested new features 1490 * 1491 * @xfeatures: A bitmap of xstate features which extend the enabled features 1492 * of that task 1493 * @ksize: The required size for the kernel buffer 1494 * @usize: The required size for user space buffers 1495 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations 1496 * 1497 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer 1498 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks 1499 * with large states are likely to live longer. 1500 * 1501 * Returns: 0 on success, -ENOMEM on allocation error. 1502 */ 1503 static int fpstate_realloc(u64 xfeatures, unsigned int ksize, 1504 unsigned int usize, struct fpu_guest *guest_fpu) 1505 { 1506 struct fpu *fpu = ¤t->thread.fpu; 1507 struct fpstate *curfps, *newfps = NULL; 1508 unsigned int fpsize; 1509 bool in_use; 1510 1511 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); 1512 1513 newfps = vzalloc(fpsize); 1514 if (!newfps) 1515 return -ENOMEM; 1516 newfps->size = ksize; 1517 newfps->user_size = usize; 1518 newfps->is_valloc = true; 1519 1520 /* 1521 * When a guest FPU is supplied, use @guest_fpu->fpstate 1522 * as reference independent whether it is in use or not. 1523 */ 1524 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; 1525 1526 /* Determine whether @curfps is the active fpstate */ 1527 in_use = fpu->fpstate == curfps; 1528 1529 if (guest_fpu) { 1530 newfps->is_guest = true; 1531 newfps->is_confidential = curfps->is_confidential; 1532 newfps->in_use = curfps->in_use; 1533 guest_fpu->xfeatures |= xfeatures; 1534 guest_fpu->uabi_size = usize; 1535 } 1536 1537 fpregs_lock(); 1538 /* 1539 * If @curfps is in use, ensure that the current state is in the 1540 * registers before swapping fpstate as that might invalidate it 1541 * due to layout changes. 1542 */ 1543 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) 1544 fpregs_restore_userregs(); 1545 1546 newfps->xfeatures = curfps->xfeatures | xfeatures; 1547 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; 1548 newfps->xfd = curfps->xfd & ~xfeatures; 1549 1550 /* Do the final updates within the locked region */ 1551 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); 1552 1553 if (guest_fpu) { 1554 guest_fpu->fpstate = newfps; 1555 /* If curfps is active, update the FPU fpstate pointer */ 1556 if (in_use) 1557 fpu->fpstate = newfps; 1558 } else { 1559 fpu->fpstate = newfps; 1560 } 1561 1562 if (in_use) 1563 xfd_update_state(fpu->fpstate); 1564 fpregs_unlock(); 1565 1566 /* Only free valloc'ed state */ 1567 if (curfps && curfps->is_valloc) 1568 vfree(curfps); 1569 1570 return 0; 1571 } 1572 1573 static int validate_sigaltstack(unsigned int usize) 1574 { 1575 struct task_struct *thread, *leader = current->group_leader; 1576 unsigned long framesize = get_sigframe_size(); 1577 1578 lockdep_assert_held(¤t->sighand->siglock); 1579 1580 /* get_sigframe_size() is based on fpu_user_cfg.max_size */ 1581 framesize -= fpu_user_cfg.max_size; 1582 framesize += usize; 1583 for_each_thread(leader, thread) { 1584 if (thread->sas_ss_size && thread->sas_ss_size < framesize) 1585 return -ENOSPC; 1586 } 1587 return 0; 1588 } 1589 1590 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) 1591 { 1592 /* 1593 * This deliberately does not exclude !XSAVES as we still might 1594 * decide to optionally context switch XCR0 or talk the silicon 1595 * vendors into extending XFD for the pre AMX states, especially 1596 * AVX512. 1597 */ 1598 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); 1599 struct fpu *fpu = ¤t->group_leader->thread.fpu; 1600 struct fpu_state_perm *perm; 1601 unsigned int ksize, usize; 1602 u64 mask; 1603 int ret = 0; 1604 1605 /* Check whether fully enabled */ 1606 if ((permitted & requested) == requested) 1607 return 0; 1608 1609 /* Calculate the resulting kernel state size */ 1610 mask = permitted | requested; 1611 /* Take supervisor states into account on the host */ 1612 if (!guest) 1613 mask |= xfeatures_mask_supervisor(); 1614 ksize = xstate_calculate_size(mask, compacted); 1615 1616 /* Calculate the resulting user state size */ 1617 mask &= XFEATURE_MASK_USER_SUPPORTED; 1618 usize = xstate_calculate_size(mask, false); 1619 1620 if (!guest) { 1621 ret = validate_sigaltstack(usize); 1622 if (ret) 1623 return ret; 1624 } 1625 1626 perm = guest ? &fpu->guest_perm : &fpu->perm; 1627 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ 1628 WRITE_ONCE(perm->__state_perm, mask); 1629 /* Protected by sighand lock */ 1630 perm->__state_size = ksize; 1631 perm->__user_state_size = usize; 1632 return ret; 1633 } 1634 1635 /* 1636 * Permissions array to map facilities with more than one component 1637 */ 1638 static const u64 xstate_prctl_req[XFEATURE_MAX] = { 1639 [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, 1640 }; 1641 1642 static int xstate_request_perm(unsigned long idx, bool guest) 1643 { 1644 u64 permitted, requested; 1645 int ret; 1646 1647 if (idx >= XFEATURE_MAX) 1648 return -EINVAL; 1649 1650 /* 1651 * Look up the facility mask which can require more than 1652 * one xstate component. 1653 */ 1654 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); 1655 requested = xstate_prctl_req[idx]; 1656 if (!requested) 1657 return -EOPNOTSUPP; 1658 1659 if ((fpu_user_cfg.max_features & requested) != requested) 1660 return -EOPNOTSUPP; 1661 1662 /* Lockless quick check */ 1663 permitted = xstate_get_group_perm(guest); 1664 if ((permitted & requested) == requested) 1665 return 0; 1666 1667 /* Protect against concurrent modifications */ 1668 spin_lock_irq(¤t->sighand->siglock); 1669 permitted = xstate_get_group_perm(guest); 1670 1671 /* First vCPU allocation locks the permissions. */ 1672 if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) 1673 ret = -EBUSY; 1674 else 1675 ret = __xstate_request_perm(permitted, requested, guest); 1676 spin_unlock_irq(¤t->sighand->siglock); 1677 return ret; 1678 } 1679 1680 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) 1681 { 1682 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; 1683 struct fpu_state_perm *perm; 1684 unsigned int ksize, usize; 1685 struct fpu *fpu; 1686 1687 if (!xfd_event) { 1688 if (!guest_fpu) 1689 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); 1690 return 0; 1691 } 1692 1693 /* Protect against concurrent modifications */ 1694 spin_lock_irq(¤t->sighand->siglock); 1695 1696 /* If not permitted let it die */ 1697 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { 1698 spin_unlock_irq(¤t->sighand->siglock); 1699 return -EPERM; 1700 } 1701 1702 fpu = ¤t->group_leader->thread.fpu; 1703 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; 1704 ksize = perm->__state_size; 1705 usize = perm->__user_state_size; 1706 1707 /* 1708 * The feature is permitted. State size is sufficient. Dropping 1709 * the lock is safe here even if more features are added from 1710 * another task, the retrieved buffer sizes are valid for the 1711 * currently requested feature(s). 1712 */ 1713 spin_unlock_irq(¤t->sighand->siglock); 1714 1715 /* 1716 * Try to allocate a new fpstate. If that fails there is no way 1717 * out. 1718 */ 1719 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) 1720 return -EFAULT; 1721 return 0; 1722 } 1723 1724 int xfd_enable_feature(u64 xfd_err) 1725 { 1726 return __xfd_enable_feature(xfd_err, NULL); 1727 } 1728 1729 #else /* CONFIG_X86_64 */ 1730 static inline int xstate_request_perm(unsigned long idx, bool guest) 1731 { 1732 return -EPERM; 1733 } 1734 #endif /* !CONFIG_X86_64 */ 1735 1736 u64 xstate_get_guest_group_perm(void) 1737 { 1738 return xstate_get_group_perm(true); 1739 } 1740 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); 1741 1742 /** 1743 * fpu_xstate_prctl - xstate permission operations 1744 * @option: A subfunction of arch_prctl() 1745 * @arg2: option argument 1746 * Return: 0 if successful; otherwise, an error code 1747 * 1748 * Option arguments: 1749 * 1750 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info 1751 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info 1752 * ARCH_REQ_XCOMP_PERM: Facility number requested 1753 * 1754 * For facilities which require more than one XSTATE component, the request 1755 * must be the highest state component number related to that facility, 1756 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and 1757 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 1758 */ 1759 long fpu_xstate_prctl(int option, unsigned long arg2) 1760 { 1761 u64 __user *uptr = (u64 __user *)arg2; 1762 u64 permitted, supported; 1763 unsigned long idx = arg2; 1764 bool guest = false; 1765 1766 switch (option) { 1767 case ARCH_GET_XCOMP_SUPP: 1768 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; 1769 return put_user(supported, uptr); 1770 1771 case ARCH_GET_XCOMP_PERM: 1772 /* 1773 * Lockless snapshot as it can also change right after the 1774 * dropping the lock. 1775 */ 1776 permitted = xstate_get_host_group_perm(); 1777 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1778 return put_user(permitted, uptr); 1779 1780 case ARCH_GET_XCOMP_GUEST_PERM: 1781 permitted = xstate_get_guest_group_perm(); 1782 permitted &= XFEATURE_MASK_USER_SUPPORTED; 1783 return put_user(permitted, uptr); 1784 1785 case ARCH_REQ_XCOMP_GUEST_PERM: 1786 guest = true; 1787 fallthrough; 1788 1789 case ARCH_REQ_XCOMP_PERM: 1790 if (!IS_ENABLED(CONFIG_X86_64)) 1791 return -EOPNOTSUPP; 1792 1793 return xstate_request_perm(idx, guest); 1794 1795 default: 1796 return -EINVAL; 1797 } 1798 } 1799 1800 #ifdef CONFIG_PROC_PID_ARCH_STATUS 1801 /* 1802 * Report the amount of time elapsed in millisecond since last AVX512 1803 * use in the task. 1804 */ 1805 static void avx512_status(struct seq_file *m, struct task_struct *task) 1806 { 1807 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); 1808 long delta; 1809 1810 if (!timestamp) { 1811 /* 1812 * Report -1 if no AVX512 usage 1813 */ 1814 delta = -1; 1815 } else { 1816 delta = (long)(jiffies - timestamp); 1817 /* 1818 * Cap to LONG_MAX if time difference > LONG_MAX 1819 */ 1820 if (delta < 0) 1821 delta = LONG_MAX; 1822 delta = jiffies_to_msecs(delta); 1823 } 1824 1825 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); 1826 seq_putc(m, '\n'); 1827 } 1828 1829 /* 1830 * Report architecture specific information 1831 */ 1832 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, 1833 struct pid *pid, struct task_struct *task) 1834 { 1835 /* 1836 * Report AVX512 state if the processor and build option supported. 1837 */ 1838 if (cpu_feature_enabled(X86_FEATURE_AVX512F)) 1839 avx512_status(m, task); 1840 1841 return 0; 1842 } 1843 #endif /* CONFIG_PROC_PID_ARCH_STATUS */ 1844 1845 #ifdef CONFIG_COREDUMP 1846 static const char owner_name[] = "LINUX"; 1847 1848 /* 1849 * Dump type, size, offset and flag values for every xfeature that is present. 1850 */ 1851 static int dump_xsave_layout_desc(struct coredump_params *cprm) 1852 { 1853 int num_records = 0; 1854 int i; 1855 1856 for_each_extended_xfeature(i, fpu_user_cfg.max_features) { 1857 struct x86_xfeat_component xc = { 1858 .type = i, 1859 .size = xstate_sizes[i], 1860 .offset = xstate_offsets[i], 1861 /* reserved for future use */ 1862 .flags = 0, 1863 }; 1864 1865 if (!dump_emit(cprm, &xc, sizeof(xc))) 1866 return 0; 1867 1868 num_records++; 1869 } 1870 return num_records; 1871 } 1872 1873 static u32 get_xsave_desc_size(void) 1874 { 1875 u32 cnt = 0; 1876 u32 i; 1877 1878 for_each_extended_xfeature(i, fpu_user_cfg.max_features) 1879 cnt++; 1880 1881 return cnt * (sizeof(struct x86_xfeat_component)); 1882 } 1883 1884 int elf_coredump_extra_notes_write(struct coredump_params *cprm) 1885 { 1886 int num_records = 0; 1887 struct elf_note en; 1888 1889 if (!fpu_user_cfg.max_features) 1890 return 0; 1891 1892 en.n_namesz = sizeof(owner_name); 1893 en.n_descsz = get_xsave_desc_size(); 1894 en.n_type = NT_X86_XSAVE_LAYOUT; 1895 1896 if (!dump_emit(cprm, &en, sizeof(en))) 1897 return 1; 1898 if (!dump_emit(cprm, owner_name, en.n_namesz)) 1899 return 1; 1900 if (!dump_align(cprm, 4)) 1901 return 1; 1902 1903 num_records = dump_xsave_layout_desc(cprm); 1904 if (!num_records) 1905 return 1; 1906 1907 /* Total size should be equal to the number of records */ 1908 if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) 1909 return 1; 1910 1911 return 0; 1912 } 1913 1914 int elf_coredump_extra_notes_size(void) 1915 { 1916 int size; 1917 1918 if (!fpu_user_cfg.max_features) 1919 return 0; 1920 1921 /* .note header */ 1922 size = sizeof(struct elf_note); 1923 /* Name plus alignment to 4 bytes */ 1924 size += roundup(sizeof(owner_name), 4); 1925 size += get_xsave_desc_size(); 1926 1927 return size; 1928 } 1929 #endif /* CONFIG_COREDUMP */ 1930