// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>

#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

#define for_each_extended_xfeature(bit, mask)				\
	(bit) = FIRST_EXTENDED_XFEATURE;				\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))

/*
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused. We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers",
	"SSE registers",
	"AVX registers",
	"MPX bounds registers",
	"MPX CSR",
	"AVX-512 opmask",
	"AVX-512 Hi256",
	"AVX-512 ZMM_Hi256",
	"Processor Trace (unused)",
	"Protection Keys User registers",
	"PASID state",
	"Control-flow User registers",
	"Control-flow Kernel registers (unused)",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"AMX Tile config",
	"AMX Tile data",
	"unknown xstate feature",
};

static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
};

static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)

/*
 * Return whether the system supports a given xfeature.
 *
 * Also return the name of the (most advanced) feature that the caller requested:
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;

	if (unlikely(feature_name)) {
		long xfeature_idx, max_idx;
		u64 xfeatures_print;
		/*
		 * So we use FLS here to be able to print the most advanced
		 * feature that was requested but is missing. So if a driver
		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
		 * missing AVX feature - this is the most informative message
		 * to users:
		 */
		if (xfeatures_missing)
			xfeatures_print = xfeatures_missing;
		else
			xfeatures_print = xfeatures_needed;

		xfeature_idx = fls64(xfeatures_print)-1;
		max_idx = ARRAY_SIZE(xfeature_names)-1;
		xfeature_idx = min(xfeature_idx, max_idx);

		*feature_name = xfeature_names[xfeature_idx];
	}

	if (xfeatures_missing)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
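
/*
 * Usage sketch (illustrative only, not taken from an in-tree caller):
 * a driver that needs AVX could probe support and report the missing
 * feature by name, e.g.:
 *
 *	const char *name;
 *
 *	if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &name))
 *		pr_info("my_driver: no '%s' support\n", name);
 */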

static bool xfeature_is_aligned64(int xfeature_nr)
{
	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
}

static bool xfeature_is_supervisor(int xfeature_nr)
{
	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
}

static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
	unsigned int offs, i;

	/*
	 * Non-compacted format and legacy features use the cached fixed
	 * offsets.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
	    xfeature <= XFEATURE_SSE)
		return xstate_offsets[xfeature];

	/*
	 * Compacted format offsets depend on the actual content of the
	 * compacted xsave area which is determined by the xcomp_bv header
	 * field.
	 */
	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	for_each_extended_xfeature(i, xcomp_bv) {
		if (xfeature_is_aligned64(i))
			offs = ALIGN(offs, 64);
		if (i == xfeature)
			break;
		offs += xstate_sizes[i];
	}
	return offs;
}
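
/*
 * Worked example (illustrative): in the compacted format, components are
 * packed in feature-bit order starting right after the legacy area and
 * header (FXSAVE_SIZE + XSAVE_HDR_SIZE = 576 bytes). With xcomp_bv
 * containing YMM and OPMASK and a 256-byte YMM component, OPMASK would
 * land at offset 576 + 256 = 832, bumped up to the next 64-byte boundary
 * first if its ALIGNED64 flag is set.
 */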

/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 */
void fpu__init_cpu_xstate(void)
{
	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
		return;

	cr4_set_bits(X86_CR4_OSXSAVE);

	/*
	 * Must happen after CR4 setup and before xsetbv() to allow KVM
	 * lazy passthrough. Write independent of the dynamic state static
	 * key as that does not work on the boot CPU. This also ensures
	 * that any stale state is wiped out from XFD. Reset the per CPU
	 * xfd cache too.
	 */
	if (cpu_feature_enabled(X86_FEATURE_XFD))
		xfd_set_state(init_fpstate.xfd);

	/*
	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
	 * states can be set here.
	 */
	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

	/*
	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
	 */
	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
				     xfeatures_mask_independent());
	}
}

static bool xfeature_enabled(enum xfeature xfeature)
{
	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}

/*
 * Record the offsets and sizes of various xstates contained
 * in the XSAVE state memory layout.
 */
static void __init setup_xstate_cache(void)
{
	u32 eax, ebx, ecx, edx, i;
	/* start at the beginning of the "extended state" */
	unsigned int last_good_offset = offsetof(struct xregs_state,
						 extended_state_area);
	/*
	 * The FP xstates and SSE xstates are legacy states. They are always
	 * in the fixed offsets in the xsave area in either compacted form
	 * or standard form.
	 */
	xstate_offsets[XFEATURE_FP]	= 0;
	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
						   xmm_space);

	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
						       xmm_space);

	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);

		xstate_sizes[i] = eax;
		xstate_flags[i] = ecx;

		/*
		 * If an xfeature is supervisor state, the offset in EBX is
		 * invalid, leave it at -1.
		 */
		if (xfeature_is_supervisor(i))
			continue;

		xstate_offsets[i] = ebx;

		/*
		 * In our xstate size checks, we assume that the highest-numbered
		 * xstate feature has the highest offset in the buffer. Ensure
		 * it does.
		 */
		WARN_ONCE(last_good_offset > xstate_offsets[i],
			  "x86/fpu: misordered xstate at %d\n", last_good_offset);

		last_good_offset = xstate_offsets[i];
	}
}
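
/*
 * Enumeration sketch (illustrative values): CPUID leaf 0xD, sub-leaf n
 * describes extended component n. EAX is the component size, EBX its
 * standard-format offset (valid for user states only), and ECX carries
 * the supervisor/alignment flags cached above. For YMM (n = 2) a typical
 * CPU reports EAX = 256 and EBX = 576.
 */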

static void __init print_xstate_feature(u64 xstate_mask)
{
	const char *feature_name;

	if (cpu_has_xfeatures(xstate_mask, &feature_name))
		pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}

/*
 * Print out all the supported xstate features:
 */
static void __init print_xstate_features(void)
{
	print_xstate_feature(XFEATURE_MASK_FP);
	print_xstate_feature(XFEATURE_MASK_SSE);
	print_xstate_feature(XFEATURE_MASK_YMM);
	print_xstate_feature(XFEATURE_MASK_BNDREGS);
	print_xstate_feature(XFEATURE_MASK_BNDCSR);
	print_xstate_feature(XFEATURE_MASK_OPMASK);
	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
	print_xstate_feature(XFEATURE_MASK_PKRU);
	print_xstate_feature(XFEATURE_MASK_PASID);
	print_xstate_feature(XFEATURE_MASK_CET_USER);
	print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
	print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}

/*
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 */
#define CHECK_XFEATURE(nr) do {			\
	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON(nr >= XFEATURE_MAX);		\
} while (0)

/*
 * Print out xstate component offsets and sizes
 */
static void __init print_xstate_offset_size(void)
{
	int i;

	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
			i, xstate_sizes[i]);
	}
}
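
/*
 * Sample boot output (values vary by CPU): on an AVX capable system the
 * loop above would log something like
 *
 *	x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
 */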

/*
 * This function is called only during boot time when x86 caps are not set
 * up and alternatives cannot be used yet.
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
	u32 lmask = mask;
	u32 hmask = mask >> 32;
	int err;

	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
	else
		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

	/*
	 * We should never fault when copying from a kernel buffer, and the FPU
	 * state we set at boot time should be valid.
	 */
	WARN_ON_FPU(err);
}

/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM	 |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_XTILE)

/*
 * setup the xstate image representing the init state
 */
static void __init setup_init_fpu_buf(void)
{
	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
		     XFEATURES_INIT_FPSTATE_HANDLED);

	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return;

	print_xstate_features();

	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

	/*
	 * Init all the features state with header.xfeatures being 0x0
	 */
	os_xrstor_booting(&init_fpstate.regs.xsave);

	/*
	 * All components are now in init state. Read the state back so
	 * that init_fpstate contains all non-zero init state. This only
	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
	 * those use the init optimization which skips writing data for
	 * components in init state.
	 *
	 * XSAVE could be used, but that would require to reshuffle the
	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
	 * compaction. But doing so is a pointless exercise because most
	 * components have an all zeros init state except for the legacy
	 * ones (FP and SSE). Those can be saved with FXSAVE into the
	 * legacy area. Adding new features requires to ensure that init
	 * state is all zeroes or if not to add the necessary handling
	 * here.
	 */
	fxsave(&init_fpstate.regs.fxsave);
}
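
/*
 * For reference (architectural reset values, not derived from this code):
 * the legacy components are the only ones with a non-zero init state,
 * e.g. FCW = 0x037f and MXCSR = 0x1f80, which is why the FXSAVE above is
 * sufficient to capture them.
 */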

int xfeature_size(int xfeature_nr)
{
	u32 eax, ebx, ecx, edx;

	CHECK_XFEATURE(xfeature_nr);
	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
	return eax;
}

/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
static int validate_user_xstate_header(const struct xstate_header *hdr,
				       struct fpstate *fpstate)
{
	/* No unknown or supervisor features may be set */
	if (hdr->xfeatures & ~fpstate->user_xfeatures)
		return -EINVAL;

	/* Userspace must use the uncompacted format */
	if (hdr->xcomp_bv)
		return -EINVAL;

	/*
	 * If 'reserved' is shrunken to add a new field, make sure to validate
	 * that new field here!
	 */
	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);

	/* No reserved bits may be set */
	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
		return -EINVAL;

	return 0;
}
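
/*
 * Example (illustrative): a sigreturn frame whose header has
 * xcomp_bv != 0 (compacted format) or a feature bit set beyond
 * fpstate->user_xfeatures is rejected above with -EINVAL before any
 * state is touched.
 */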

static void __init __xstate_dump_leaves(void)
{
	int i;
	u32 eax, ebx, ecx, edx;
	static int should_dump = 1;

	if (!should_dump)
		return;
	should_dump = 0;
	/*
	 * Dump out a few leaves past the ones that we support
	 * just in case there are some goodies up there
	 */
	for (i = 0; i < XFEATURE_MAX + 10; i++) {
		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
			XSTATE_CPUID, i, eax, ebx, ecx, edx);
	}
}

#define XSTATE_WARN_ON(x, fmt, ...) do {					\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)

#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE(sz != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})


/**
 * check_xtile_data_against_struct - Check tile data state size.
 *
 * Calculate the state size by multiplying the single tile size which is
 * recorded in a C struct, and the number of tiles reported by the CPU.
 * Compare the provided size with the calculation.
 *
 * @size:	The tile data state size
 *
 * Returns:	0 on success, -EINVAL on mismatch.
 */
static int __init check_xtile_data_against_struct(int size)
{
	u32 max_palid, palid, state_size;
	u32 eax, ebx, ecx, edx;
	u16 max_tile;

	/*
	 * Check the maximum palette id:
	 *   eax: the highest numbered palette subleaf.
	 */
	cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);

	/*
	 * Cross-check each tile size and find the maximum number of
	 * supported tiles.
	 */
	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
		u16 tile_size, max;

		/*
		 * Check the tile size info:
		 *   eax[31:16]:  bytes per tile
		 *   ebx[31:16]:  the max names (or max number of tiles)
		 */
		cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
		tile_size = eax >> 16;
		max = ebx >> 16;

		if (tile_size != sizeof(struct xtile_data)) {
			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
			       __stringify(XFEATURE_XTILE_DATA),
			       sizeof(struct xtile_data), tile_size);
			__xstate_dump_leaves();
			return -EINVAL;
		}

		if (max > max_tile)
			max_tile = max;
	}

	state_size = sizeof(struct xtile_data) * max_tile;
	if (size != state_size) {
		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
		__xstate_dump_leaves();
		return -EINVAL;
	}
	return 0;
}

/*
 * We have a C struct for each 'xstate'. We need to ensure
 * that our software representation matches what the CPU
 * tells us about the state's size.
 */
static bool __init check_xstate_against_struct(int nr)
{
	/*
	 * Ask the CPU for the size of the state.
	 */
	int sz = xfeature_size(nr);

	/*
	 * Match each CPU state with the corresponding software
	 * structure.
	 */
	switch (nr) {
	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
	default:
		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
		return false;
	}

	return true;
}

static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
	unsigned int topmost = fls64(xfeatures) - 1;
	unsigned int offset = xstate_offsets[topmost];

	if (topmost <= XFEATURE_SSE)
		return sizeof(struct xregs_state);

	if (compacted)
		offset = xfeature_get_offset(xfeatures, topmost);
	return offset + xstate_sizes[topmost];
}
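
/*
 * Worked example (illustrative): on current AMX hardware palette 1
 * reports 8 tiles of 1024 bytes each, so XTILE_DATA is checked as
 * 8 * 1024 = 8192 bytes. Likewise xstate_calculate_size() is just
 * "offset of the topmost enabled component plus its size", e.g. a
 * standard-format buffer topping out at YMM ends at 576 + 256 = 832
 * bytes.
 */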

/*
 * This essentially double-checks what the cpu told us about
 * how large the XSAVE buffer needs to be. We are recalculating
 * it to be safe.
 *
 * Independent XSAVE features allocate their own buffers and are not
 * covered by these checks. Only the size of the buffer for task->fpu
 * is checked here.
 */
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	int i;

	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
		if (!check_xstate_against_struct(i))
			return false;
		/*
		 * Supervisor state components can be managed only by
		 * XSAVES.
		 */
		if (!xsaves && xfeature_is_supervisor(i)) {
			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
			return false;
		}
	}
	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
	XSTATE_WARN_ON(size != kernel_size,
		       "size %u != kernel_size %u\n", size, kernel_size);
	return size == kernel_size;
}

/*
 * Get total size of enabled xstates in XCR0 | IA32_XSS.
 *
 * Note the SDM's wording here.  "sub-function 0" only enumerates
 * the size of the *user* states.  If we use it to size a buffer
 * that we use 'XSAVES' on, we could potentially overflow the
 * buffer because 'XSAVES' saves system states too.
 *
 * This also takes compaction into account. So this works for
 * XSAVEC as well.
 */
static unsigned int __init get_compacted_size(void)
{
	unsigned int eax, ebx, ecx, edx;
	/*
	 * - CPUID function 0DH, sub-function 1:
	 *    EBX enumerates the size (in bytes) required by
	 *    the XSAVES instruction for an XSAVE area
	 *    containing all the state components
	 *    corresponding to bits currently set in
	 *    XCR0 | IA32_XSS.
	 *
	 * When XSAVES is not available but XSAVEC is (virt), then there
	 * are no supervisor states, but XSAVEC still uses compacted
	 * format.
	 */
	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
	return ebx;
}

/*
 * Get the total size of the enabled xstates without the independent supervisor
 * features.
 */
static unsigned int __init get_xsave_compacted_size(void)
{
	u64 mask = xfeatures_mask_independent();
	unsigned int size;

	if (!mask)
		return get_compacted_size();

	/* Disable independent features. */
	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());

	/*
	 * Ask the hardware what size is required of the buffer.
	 * This is the size required for the task->fpu buffer.
	 */
	size = get_compacted_size();

	/* Re-enable independent features so XSAVES will work on them again. */
	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);

	return size;
}

static unsigned int __init get_xsave_size_user(void)
{
	unsigned int eax, ebx, ecx, edx;
	/*
	 * - CPUID function 0DH, sub-function 0:
	 *    EBX enumerates the size (in bytes) required by
	 *    the XSAVE instruction for an XSAVE area
	 *    containing all the *user* state components
	 *    corresponding to bits currently set in XCR0.
	 */
	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
	return ebx;
}

static int __init init_xstate_size(void)
{
	/* Recompute the context size for enabled features: */
	unsigned int user_size, kernel_size, kernel_default_size;
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);

	/* Uncompacted user space size */
	user_size = get_xsave_size_user();

	/*
	 * XSAVES kernel size includes supervisor states and uses compacted
	 * format. XSAVEC uses compacted format, but does not save
	 * supervisor states.
	 *
	 * XSAVE[OPT] do not support supervisor states so kernel and user
	 * size is identical.
	 */
	if (compacted)
		kernel_size = get_xsave_compacted_size();
	else
		kernel_size = user_size;

	kernel_default_size =
		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);

	if (!paranoid_xstate_size_valid(kernel_size))
		return -EINVAL;

	fpu_kernel_cfg.max_size = kernel_size;
	fpu_user_cfg.max_size = user_size;

	fpu_kernel_cfg.default_size = kernel_default_size;
	fpu_user_cfg.default_size =
		xstate_calculate_size(fpu_user_cfg.default_features, false);

	return 0;
}
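
/*
 * Size sketch (hypothetical numbers): with XCR0 topping out at AVX-512,
 * CPUID(0xD, 0).EBX might report a 2688-byte standard-format user buffer,
 * while CPUID(0xD, 1).EBX reports a smaller compacted kernel buffer,
 * since disabled components between enabled ones occupy no space there.
 */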

/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it.  Disable it.
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
	fpu_kernel_cfg.max_features = 0;
	cr4_clear_bits(X86_CR4_OSXSAVE);
	setup_clear_cpu_cap(X86_FEATURE_XSAVE);

	/* Restore the legacy size.*/
	fpu_kernel_cfg.max_size = legacy_size;
	fpu_kernel_cfg.default_size = legacy_size;
	fpu_user_cfg.max_size = legacy_size;
	fpu_user_cfg.default_size = legacy_size;

	/*
	 * Prevent enabling the static branch which enables writes to the
	 * XFD MSR.
	 */
	init_fpstate.xfd = 0;

	fpstate_reset(&current->thread.fpu);
}

/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
	unsigned int eax, ebx, ecx, edx;
	u64 xfeatures;
	int err;
	int i;

	if (!boot_cpu_has(X86_FEATURE_FPU)) {
		pr_info("x86/fpu: No FPU detected\n");
		return;
	}

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		pr_info("x86/fpu: x87 FPU will use %s\n",
			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
		return;
	}

	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
		WARN_ON_FPU(1);
		return;
	}

	/*
	 * Find user xstates supported by the processor.
	 */
	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

	/*
	 * Find supervisor xstates supported by the processor.
	 */
	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
		/*
		 * This indicates that something really unexpected happened
		 * with the enumeration.  Disable XSAVE and try to continue
		 * booting without it.  This is too early to BUG().
		 */
		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
		       fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * Clear XSAVE features that are disabled in the normal CPUID.
	 */
	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
		unsigned short cid = xsave_cpuid_features[i];

		/* Careful: X86_FEATURE_FPU is 0! */
		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
	}

	if (!cpu_feature_enabled(X86_FEATURE_XFD))
		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
	else
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
					       XFEATURE_MASK_SUPERVISOR_SUPPORTED;

	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

	/* Clean out dynamic features from default */
	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Store it for paranoia check at the end */
	xfeatures = fpu_kernel_cfg.max_features;

	/*
	 * Initialize the default XFD state in init_fpstate and enable the
	 * dynamic sizing mechanism if dynamic states are available.  The
	 * static key cannot be enabled here because this runs before
	 * jump_label_init().  This is delayed to an initcall.
	 */
	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

	/* Set up compaction feature bit */
	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
	    cpu_feature_enabled(X86_FEATURE_XSAVES))
		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

	/* Enable xstate instructions to be able to continue with initialization: */
	fpu__init_cpu_xstate();

	/* Cache size, offset and flags for initialization */
	setup_xstate_cache();

	err = init_xstate_size();
	if (err)
		goto out_disable;

	/* Reset the state for the current task */
	fpstate_reset(&current->thread.fpu);

	/*
	 * Update info used for ptrace frames; use standard-format size and no
	 * supervisor xstates:
	 */
	update_regset_xstate_info(fpu_user_cfg.max_size,
				  fpu_user_cfg.max_features);

	/*
	 * init_fpstate excludes dynamic states as they are large but init
	 * state is zero.
	 */
	init_fpstate.size = fpu_kernel_cfg.default_size;
	init_fpstate.xfeatures = fpu_kernel_cfg.default_features;

	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
			sizeof(init_fpstate.regs), init_fpstate.size);
		goto out_disable;
	}

	setup_init_fpu_buf();

	/*
	 * Paranoia check whether something in the setup modified the
	 * xfeatures mask.
	 */
	if (xfeatures != fpu_kernel_cfg.max_features) {
		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
		       xfeatures, fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * CPU capabilities initialization runs before FPU init. So
	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
	 * functional, set the feature bit so depending code works.
	 */
	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

	print_xstate_offset_size();
	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
		fpu_kernel_cfg.max_features,
		fpu_kernel_cfg.max_size,
		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
	return;

out_disable:
	/* something went wrong, try to boot without any XSAVE support */
	fpu__init_disable_system_xstate(legacy_size);
}

/*
 * Restore minimal FPU state after suspend:
 */
void fpu__resume_cpu(void)
{
	/*
	 * Restore XCR0 on xsave capable CPUs:
	 */
	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

	/*
	 * Restore IA32_XSS. The same CPUID bit enumerates support
	 * of XSAVES and MSR_IA32_XSS.
	 */
	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
				     xfeatures_mask_independent());
	}

	if (fpu_state_size_dynamic())
		wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}

/*
 * Given an xstate feature nr, calculate where in the xsave
 * buffer the state is.  Callers should ensure that the buffer
 * is valid.
 */
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
	u64 xcomp_bv = xsave->header.xcomp_bv;

	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
		return NULL;

	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
			return NULL;
	}

	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
}

/*
 * Given the xsave area and a state inside, this function returns the
 * address of the state.
 *
 * This is the API that is called to get xstate address in either
 * standard format or compacted format of xsave area.
 *
 * Note that if there is no data for the field in the xsave buffer
 * this will return NULL.
 *
 * Inputs:
 *	xstate: the thread's storage area for all FPU data
 *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
 *	XFEATURE_SSE, etc...)
 * Output:
 *	address of the state in the xsave area, or NULL if the
 *	field is not present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
	/*
	 * Do we even *have* xsave state?
	 */
	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return NULL;

	/*
	 * We should not ever be requesting features that we
	 * have not enabled.
	 */
	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
		return NULL;

	/*
	 * This assumes the last 'xsave*' instruction to
	 * have requested that 'xfeature_nr' be saved.
	 * If it did not, we might be seeing an old value
	 * of the field in the buffer.
	 *
	 * This can happen because the last 'xsave' did not
	 * request that this feature be saved (unlikely)
	 * or because the "init optimization" caused it
	 * to not be saved.
	 */
	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
		return NULL;

	return __raw_xsave_addr(xsave, xfeature_nr);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
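
/*
 * Usage sketch (illustrative): a caller that wants the saved PKRU value
 * must handle the init-optimization NULL case, e.g.:
 *
 *	struct pkru_state *pk = get_xsave_addr(xsave, XFEATURE_PKRU);
 *	u32 pkru = pk ? pk->pkru : 0;
 */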

#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * This will go out and modify PKRU register to set the access
 * rights for @pkey to @init_val.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users.  Complain
	 * if a bad value is observed.
	 */
	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
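
/*
 * Worked example (arithmetic only): each pkey owns two PKRU bits (AD, WD).
 * Disabling writes for pkey 2 sets PKRU_WD_BIT (bit 1) shifted by
 * 2 * PKRU_BITS_PER_PKEY = 4, i.e. PKRU |= 0x20, after the old AD/WD bits
 * for that key have been cleared.
 */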

static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
			 void *init_xstate, unsigned int size)
{
	membuf_write(to, from_xstate ? xstate : init_xstate, size);
}

/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @fpstate:	The fpstate buffer from which to copy
 * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:	The PKRU value to store in the PKRU component
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
			       u64 xfeatures, u32 pkru_val,
			       enum xstate_copy_mode copy_mode)
{
	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
	struct xregs_state *xinit = &init_fpstate.regs.xsave;
	struct xregs_state *xsave = &fpstate->regs.xsave;
	struct xstate_header header;
	unsigned int zerofrom;
	u64 mask;
	int i;

	memset(&header, 0, sizeof(header));
	header.xfeatures = xsave->header.xfeatures;

	/* Mask out the feature bits depending on copy mode */
	switch (copy_mode) {
	case XSTATE_COPY_FP:
		header.xfeatures &= XFEATURE_MASK_FP;
		break;

	case XSTATE_COPY_FX:
		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
		break;

	case XSTATE_COPY_XSAVE:
		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
		break;
	}

	/* Copy FP state up to MXCSR */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
		     &xinit->i387, off_mxcsr);

	/* Copy MXCSR when SSE or YMM are set in the feature mask */
	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
		     MXCSR_AND_FLAGS_SIZE);

	/* Copy the remaining FP state */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
		     sizeof(xsave->i387.st_space));

	/* Copy the SSE state - shared with YMM, but independently managed */
	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
		     sizeof(xsave->i387.xmm_space));

	if (copy_mode != XSTATE_COPY_XSAVE)
		goto out;

	/* Zero the padding area */
	membuf_zero(&to, sizeof(xsave->i387.padding));

	/* Copy xsave->i387.sw_reserved */
	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

	/* Copy the user space relevant state of @xsave->header */
	membuf_write(&to, &header, sizeof(header));

	zerofrom = offsetof(struct xregs_state, extended_state_area);

	/*
	 * This 'mask' indicates which states to copy from fpstate.
	 * Those extended states that are not present in fpstate are
	 * either disabled or initialized:
	 *
	 * In non-compacted format, disabled features still occupy
	 * state space but there is no state to copy from in the
	 * compacted init_fpstate. The gap tracking will zero these
	 * states.
	 *
	 * The extended features have an all zeroes init state. Thus,
	 * remove them from 'mask' to zero those features in the user
	 * buffer instead of retrieving them from init_fpstate.
	 */
	mask = header.xfeatures;

	for_each_extended_xfeature(i, mask) {
		/*
		 * If there was a feature or alignment gap, zero the space
		 * in the destination buffer.
		 */
		if (zerofrom < xstate_offsets[i])
			membuf_zero(&to, xstate_offsets[i] - zerofrom);

		if (i == XFEATURE_PKRU) {
			struct pkru_state pkru = {0};
			/*
			 * PKRU is not necessarily up to date in the
			 * XSAVE buffer. Use the provided value.
			 */
			pkru.pkru = pkru_val;
			membuf_write(&to, &pkru, sizeof(pkru));
		} else {
			membuf_write(&to,
				     __raw_xsave_addr(xsave, i),
				     xstate_sizes[i]);
		}
		/*
		 * Keep track of the last copied state in the non-compacted
		 * target buffer for gap zeroing.
		 */
		zerofrom = xstate_offsets[i] + xstate_sizes[i];
	}

out:
	if (to.left)
		membuf_zero(&to, to.left);
}
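
/*
 * Gap example (illustrative): if only FP/SSE and OPMASK are present,
 * everything between the end of the header (offset 576) and OPMASK's
 * standard-format offset is emitted as zeroes by the membuf_zero() above,
 * so the UABI buffer always has the fixed, standard-format offsets.
 */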

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @tsk:	The task from which to copy the saved xstate
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
			     enum xstate_copy_mode copy_mode)
{
	__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
				  tsk->thread.fpu.fpstate->user_xfeatures,
				  tsk->thread.pkru, copy_mode);
}

static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
			    const void *kbuf, const void __user *ubuf)
{
	if (kbuf) {
		memcpy(dst, kbuf + offset, size);
	} else {
		if (copy_from_user(dst, ubuf + offset, size))
			return -EFAULT;
	}
	return 0;
}


/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:	The fpstate buffer to copy to
 * @kbuf:	The UABI format buffer, if it comes from the kernel
 * @ubuf:	The UABI format buffer, if it comes from userspace
 * @pkru:	The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.	When called from sigreturn the PKRU register will be restored from
 *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *	@fpstate is sufficient to cover this case, but the caller will also
 *	pass a pointer to the thread_struct's pkru field in @pkru and updating
 *	it is harmless.
 * 2.	When called from ptrace the PKRU register will be restored from the
 *	thread_struct's pkru field. A pointer to that is passed in @pkru.
 *	The kernel will restore it manually, so the XRSTOR behavior that resets
 *	the PKRU register to the hardware init value (0) if the corresponding
 *	xfeatures bit is not set is emulated here.
 * 3.	When called from KVM the PKRU register will be restored from the vcpu's
 *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *	bit is not set.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
			       const void __user *ubuf, u32 *pkru)
{
	struct xregs_state *xsave = &fpstate->regs.xsave;
	unsigned int offset, size;
	struct xstate_header hdr;
	u64 mask;
	int i;

	offset = offsetof(struct xregs_state, header);
	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
		return -EFAULT;

	if (validate_user_xstate_header(&hdr, fpstate))
		return -EINVAL;

	/* Validate MXCSR when any of the related features is in use */
	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
	if (hdr.xfeatures & mask) {
		u32 mxcsr[2];

		offset = offsetof(struct fxregs_state, mxcsr);
		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
			return -EFAULT;

		/* Reserved bits in MXCSR must be zero. */
		if (mxcsr[0] & ~mxcsr_feature_mask)
			return -EINVAL;

		/* SSE and YMM require MXCSR even when FP is not in use. */
		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
			xsave->i387.mxcsr = mxcsr[0];
			xsave->i387.mxcsr_mask = mxcsr[1];
		}
	}

	for (i = 0; i < XFEATURE_MAX; i++) {
		mask = BIT_ULL(i);

		if (hdr.xfeatures & mask) {
			void *dst = __raw_xsave_addr(xsave, i);

			offset = xstate_offsets[i];
			size = xstate_sizes[i];

			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
				return -EFAULT;
		}
	}

	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
		struct pkru_state *xpkru;

		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
		*pkru = xpkru->pkru;
	} else {
		/*
		 * KVM may pass NULL here to indicate that it does not need
		 * PKRU updated.
		 */
		if (pkru)
			*pkru = 0;
	}

	/*
	 * The state that came in from userspace was user-state only.
	 * Mask all the user states out of 'xfeatures':
	 */
	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

	/*
	 * Add back in the features that came in from userspace:
	 */
	xsave->header.xfeatures |= hdr.xfeatures;

	return 0;
}

/*
 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
 * format and copy to the target thread. Used by ptrace and KVM.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}

/*
 * Convert from a sigreturn standard-format user-space buffer to kernel
 * XSAVE[S] format and copy to the target thread. This is called from the
 * sigreturn() and rt_sigreturn() system calls.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
				      const void __user *ubuf)
{
	return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}

static bool validate_independent_components(u64 mask)
{
	u64 xchk;

	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
		return false;

	xchk = ~xfeatures_mask_independent();

	if (WARN_ON_ONCE(!mask || mask & xchk))
		return false;

	return true;
}

/**
 * xsaves - Save selected components to a kernel xstate buffer
 * @xstate:	Pointer to the buffer
 * @mask:	Feature mask to select the components to save
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized as
 * XSAVES does not write the full xstate header. Before first use the
 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
 * can #GP.
 *
 * The feature mask must be a subset of the independent features.
 */
void xsaves(struct xregs_state *xstate, u64 mask)
{
	int err;

	if (!validate_independent_components(mask))
		return;

	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
	WARN_ON_ONCE(err);
}

/**
 * xrstors - Restore selected components from a kernel xstate buffer
 * @xstate:	Pointer to the buffer
 * @mask:	Feature mask to select the components to restore
 *
 * The @xstate buffer must be 64 byte aligned and correctly initialized
 * otherwise XRSTORS from that buffer can #GP.
 *
 * Proper usage is to restore the state which was saved with
 * xsaves() into @xstate.
 *
 * The feature mask must be a subset of the independent features.
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
	int err;

	if (!validate_independent_components(mask))
		return;

	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
	WARN_ON_ONCE(err);
}

#if IS_ENABLED(CONFIG_KVM)
void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
{
	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);

	if (addr)
		memset(addr, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif
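
/*
 * Usage sketch (illustrative): perf manages architectural LBR state as an
 * independent component with a privately allocated, zeroed, 64-byte
 * aligned buffer, roughly:
 *
 *	xsaves(lbr_buf, XFEATURE_MASK_LBR);
 *	...
 *	xrstors(lbr_buf, XFEATURE_MASK_LBR);
 */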

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
	u64 xfd = __this_cpu_read(xfd_state);

	if (fpstate->xfd == xfd)
		return true;

	/*
	 * The XFD MSR does not match fpstate->xfd. That's invalid when
	 * the passed in fpstate is current's fpstate.
	 */
	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
		return false;

	/*
	 * XRSTOR(S) from init_fpstate are always correct as it will just
	 * bring all components into init state and not read from the
	 * buffer. XSAVE(S) raises #PF after init.
	 */
	if (fpstate == &init_fpstate)
		return rstor;

	/*
	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
	 * XRSTORS(S): fpu_swap_kvm_fpstate()
	 */

	/*
	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
	 * the buffer area for XFD-disabled state components.
	 */
	mask &= ~xfd;

	/*
	 * Remove features which are valid in fpstate. They
	 * have space allocated in fpstate.
	 */
	mask &= ~fpstate->xfeatures;

	/*
	 * Any remaining state components in 'mask' might be written
	 * by XSAVE/XRSTOR. Fail validation if found.
	 */
	return !mask;
}

void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
}
#endif /* CONFIG_X86_DEBUG_FPU */

static int __init xfd_update_static_branch(void)
{
	/*
	 * If init_fpstate.xfd has bits set then dynamic features are
	 * available and the dynamic sizing must be enabled.
	 */
	if (init_fpstate.xfd)
		static_branch_enable(&__fpu_state_size_dynamic);
	return 0;
}
arch_initcall(xfd_update_static_branch)

void fpstate_free(struct fpu *fpu)
{
	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
		vfree(fpu->fpstate);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:	A bitmap of xstate features which extend the enabled features
 *		of that task
 * @ksize:	The required size for the kernel buffer
 * @usize:	The required size for user space buffers
 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
			   unsigned int usize, struct fpu_guest *guest_fpu)
{
	struct fpu *fpu = &current->thread.fpu;
	struct fpstate *curfps, *newfps = NULL;
	unsigned int fpsize;
	bool in_use;

	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

	newfps = vzalloc(fpsize);
	if (!newfps)
		return -ENOMEM;
	newfps->size = ksize;
	newfps->user_size = usize;
	newfps->is_valloc = true;

	/*
	 * When a guest FPU is supplied, use @guest_fpu->fpstate
	 * as reference independent of whether it is in use or not.
	 */
	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

	/* Determine whether @curfps is the active fpstate */
	in_use = fpu->fpstate == curfps;

	if (guest_fpu) {
		newfps->is_guest = true;
		newfps->is_confidential = curfps->is_confidential;
		newfps->in_use = curfps->in_use;
		guest_fpu->xfeatures |= xfeatures;
		guest_fpu->uabi_size = usize;
	}

	fpregs_lock();
	/*
	 * If @curfps is in use, ensure that the current state is in the
	 * registers before swapping fpstate as that might invalidate it
	 * due to layout changes.
	 */
	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
		fpregs_restore_userregs();

	newfps->xfeatures = curfps->xfeatures | xfeatures;
	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
	newfps->xfd = curfps->xfd & ~xfeatures;

	/* Do the final updates within the locked region */
	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

	if (guest_fpu) {
		guest_fpu->fpstate = newfps;
		/* If curfps is active, update the FPU fpstate pointer */
		if (in_use)
			fpu->fpstate = newfps;
	} else {
		fpu->fpstate = newfps;
	}

	if (in_use)
		xfd_update_state(fpu->fpstate);
	fpregs_unlock();

	/* Only free valloc'ed state */
	if (curfps && curfps->is_valloc)
		vfree(curfps);

	return 0;
}

static int validate_sigaltstack(unsigned int usize)
{
	struct task_struct *thread, *leader = current->group_leader;
	unsigned long framesize = get_sigframe_size();

	lockdep_assert_held(&current->sighand->siglock);

	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
	framesize -= fpu_user_cfg.max_size;
	framesize += usize;
	for_each_thread(leader, thread) {
		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
			return -ENOSPC;
	}
	return 0;
}
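
/*
 * Example (illustrative): enabling AMX grows the largest possible signal
 * frame by roughly the 8K XTILE_DATA component. validate_sigaltstack()
 * recomputes that frame size with the new user state size and fails the
 * permission request with -ENOSPC if any thread in the process has
 * registered a sigaltstack smaller than the new frame.
 */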

static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
	/*
	 * This deliberately does not exclude !XSAVES as we still might
	 * decide to optionally context switch XCR0 or talk the silicon
	 * vendors into extending XFD for the pre AMX states, especially
	 * AVX512.
	 */
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
	struct fpu *fpu = &current->group_leader->thread.fpu;
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	u64 mask;
	int ret = 0;

	/* Check whether fully enabled */
	if ((permitted & requested) == requested)
		return 0;

	/* Calculate the resulting kernel state size */
	mask = permitted | requested;
	/* Take supervisor states into account on the host */
	if (!guest)
		mask |= xfeatures_mask_supervisor();
	ksize = xstate_calculate_size(mask, compacted);

	/* Calculate the resulting user state size */
	mask &= XFEATURE_MASK_USER_SUPPORTED;
	usize = xstate_calculate_size(mask, false);

	if (!guest) {
		ret = validate_sigaltstack(usize);
		if (ret)
			return ret;
	}

	perm = guest ? &fpu->guest_perm : &fpu->perm;
	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
	WRITE_ONCE(perm->__state_perm, mask);
	/* Protected by sighand lock */
	perm->__state_size = ksize;
	perm->__user_state_size = usize;
	return ret;
}

/*
 * Permissions array to map facilities with more than one component
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};

static int xstate_request_perm(unsigned long idx, bool guest)
{
	u64 permitted, requested;
	int ret;

	if (idx >= XFEATURE_MAX)
		return -EINVAL;

	/*
	 * Look up the facility mask which can require more than
	 * one xstate component.
	 */
	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
	requested = xstate_prctl_req[idx];
	if (!requested)
		return -EOPNOTSUPP;

	if ((fpu_user_cfg.max_features & requested) != requested)
		return -EOPNOTSUPP;

	/* Lockless quick check */
	permitted = xstate_get_group_perm(guest);
	if ((permitted & requested) == requested)
		return 0;

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);
	permitted = xstate_get_group_perm(guest);

	/* First vCPU allocation locks the permissions. */
	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
		ret = -EBUSY;
	else
		ret = __xstate_request_perm(permitted, requested, guest);
	spin_unlock_irq(&current->sighand->siglock);
	return ret;
}
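
/*
 * Userspace usage sketch (illustrative, see also the fpu_xstate_prctl()
 * kerneldoc below): a process opts in to AMX before touching tile state:
 *
 *	syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA);
 *
 * The first tile instruction afterwards raises #NM, which leads into
 * __xfd_enable_feature() below to allocate the larger fpstate.
 */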

int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	struct fpu *fpu;

	if (!xfd_event) {
		if (!guest_fpu)
			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
		return 0;
	}

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);

	/* If not permitted let it die */
	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
		spin_unlock_irq(&current->sighand->siglock);
		return -EPERM;
	}

	fpu = &current->group_leader->thread.fpu;
	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
	ksize = perm->__state_size;
	usize = perm->__user_state_size;

	/*
	 * The feature is permitted. State size is sufficient.  Dropping
	 * the lock is safe here even if more features are added from
	 * another task, the retrieved buffer sizes are valid for the
	 * currently requested feature(s).
	 */
	spin_unlock_irq(&current->sighand->siglock);

	/*
	 * Try to allocate a new fpstate. If that fails there is no way
	 * out.
	 */
	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
		return -EFAULT;
	return 0;
}

int xfd_enable_feature(u64 xfd_err)
{
	return __xfd_enable_feature(xfd_err, NULL);
}

#else /* CONFIG_X86_64 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	return -EPERM;
}
#endif  /* !CONFIG_X86_64 */

u64 xstate_get_guest_group_perm(void)
{
	return xstate_get_group_perm(true);
}
EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);

/**
 * fpu_xstate_prctl - xstate permission operations
 * @option:	A subfunction of arch_prctl()
 * @arg2:	option argument
 * Return:	0 if successful; otherwise, an error code
 *
 * Option arguments:
 *
 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
 * ARCH_REQ_XCOMP_PERM: Facility number requested
 *
 * For facilities which require more than one XSTATE component, the request
 * must be the highest state component number related to that facility,
 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
 */
long fpu_xstate_prctl(int option, unsigned long arg2)
{
	u64 __user *uptr = (u64 __user *)arg2;
	u64 permitted, supported;
	unsigned long idx = arg2;
	bool guest = false;

	switch (option) {
	case ARCH_GET_XCOMP_SUPP:
		supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
		return put_user(supported, uptr);

	case ARCH_GET_XCOMP_PERM:
		/*
		 * Lockless snapshot as it can also change right after
		 * dropping the lock.
		 */
		permitted = xstate_get_host_group_perm();
		permitted &= XFEATURE_MASK_USER_SUPPORTED;
		return put_user(permitted, uptr);

	case ARCH_GET_XCOMP_GUEST_PERM:
		permitted = xstate_get_guest_group_perm();
		permitted &= XFEATURE_MASK_USER_SUPPORTED;
		return put_user(permitted, uptr);

	case ARCH_REQ_XCOMP_GUEST_PERM:
		guest = true;
		fallthrough;

	case ARCH_REQ_XCOMP_PERM:
		if (!IS_ENABLED(CONFIG_X86_64))
			return -EOPNOTSUPP;

		return xstate_request_perm(idx, guest);

	default:
		return -EINVAL;
	}
}

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Report the amount of time elapsed in milliseconds since last AVX512
 * use in the task.
 */
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
	long delta;

	if (!timestamp) {
		/*
		 * Report -1 if no AVX512 usage
		 */
		delta = -1;
	} else {
		delta = (long)(jiffies - timestamp);
		/*
		 * Cap to LONG_MAX if time difference > LONG_MAX
		 */
		if (delta < 0)
			delta = LONG_MAX;
		delta = jiffies_to_msecs(delta);
	}

	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
	seq_putc(m, '\n');
}

/*
 * Report architecture specific information
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
			 struct pid *pid, struct task_struct *task)
{
	/*
	 * Report AVX512 state if the processor and build option support it.
	 */
	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
		avx512_status(m, task);

	return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */