xref: /linux/arch/x86/kernel/fpu/xstate.c (revision 566ab427f827b0256d3e8ce0235d088e6a9c28bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/nospec.h>
12 #include <linux/pkeys.h>
13 #include <linux/seq_file.h>
14 #include <linux/proc_fs.h>
15 #include <linux/vmalloc.h>
16 #include <linux/coredump.h>
17 
18 #include <asm/fpu/api.h>
19 #include <asm/fpu/regset.h>
20 #include <asm/fpu/signal.h>
21 #include <asm/fpu/xcr.h>
22 
23 #include <asm/tlbflush.h>
24 #include <asm/prctl.h>
25 #include <asm/elf.h>
26 
27 #include <uapi/asm/elf.h>
28 
29 #include "context.h"
30 #include "internal.h"
31 #include "legacy.h"
32 #include "xstate.h"
33 
/*
 * Iterate @bit over every bit set in @mask, starting the search at
 * FIRST_EXTENDED_XFEATURE.
 *
 * This expands to an assignment followed by a for_each_set_bit_from()
 * statement, i.e. to two statements: do not use it as the unbraced body
 * of an if/else. @mask must be an lvalue because its address is taken.
 */
#define for_each_extended_xfeature(bit, mask)				\
	(bit) = FIRST_EXTENDED_XFEATURE;				\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
37 
/*
 * Human readable xfeature names, indexed by xfeature number.
 * cpu_has_xfeatures() clamps indices past the end of this table to the
 * final "unknown xstate feature" entry.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers",
	"SSE registers",
	"AVX registers",
	"MPX bounds registers",
	"MPX CSR",
	"AVX-512 opmask",
	"AVX-512 Hi256",
	"AVX-512 ZMM_Hi256",
	"Processor Trace (unused)",
	"Protection Keys User registers",
	"PASID state",
	"Control-flow User registers",
	"Control-flow Kernel registers (unused)",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"AMX Tile config",
	"AMX Tile data",
	"unknown xstate feature",
};
66 
/*
 * Map each xfeature number to the X86_FEATURE_* bit that must be set for
 * the xfeature to be kept. fpu__init_system_xstate() walks this table and
 * clears every xfeature whose entry is zero (other than XFEATURE_FP, since
 * X86_FEATURE_FPU itself is 0) or whose CPU feature bit is not set.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
};
83 
/*
 * Per-xfeature layout cache, filled in by setup_xstate_cache():
 *  - xstate_offsets[]: offset as reported in CPUID EBX. Stays at the
 *    initial -1 for supervisor features and for features never probed.
 *  - xstate_sizes[]:   size as reported in CPUID EAX (initially -1).
 *  - xstate_flags[]:   raw CPUID ECX, tested with the XSTATE_FLAG_* bits.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits of xstate_flags[]; see xfeature_is_supervisor()/xfeature_is_aligned64() */
#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)
92 
93 /*
94  * Return whether the system supports a given xfeature.
95  *
96  * Also return the name of the (most advanced) feature that the caller requested:
97  */
98 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
99 {
100 	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
101 
102 	if (unlikely(feature_name)) {
103 		long xfeature_idx, max_idx;
104 		u64 xfeatures_print;
105 		/*
106 		 * So we use FLS here to be able to print the most advanced
107 		 * feature that was requested but is missing. So if a driver
108 		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
109 		 * missing AVX feature - this is the most informative message
110 		 * to users:
111 		 */
112 		if (xfeatures_missing)
113 			xfeatures_print = xfeatures_missing;
114 		else
115 			xfeatures_print = xfeatures_needed;
116 
117 		xfeature_idx = fls64(xfeatures_print)-1;
118 		max_idx = ARRAY_SIZE(xfeature_names)-1;
119 		xfeature_idx = min(xfeature_idx, max_idx);
120 
121 		*feature_name = xfeature_names[xfeature_idx];
122 	}
123 
124 	if (xfeatures_missing)
125 		return 0;
126 
127 	return 1;
128 }
129 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
130 
131 static bool xfeature_is_aligned64(int xfeature_nr)
132 {
133 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
134 }
135 
136 static bool xfeature_is_supervisor(int xfeature_nr)
137 {
138 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
139 }
140 
141 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
142 {
143 	unsigned int offs, i;
144 
145 	/*
146 	 * Non-compacted format and legacy features use the cached fixed
147 	 * offsets.
148 	 */
149 	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
150 	    xfeature <= XFEATURE_SSE)
151 		return xstate_offsets[xfeature];
152 
153 	/*
154 	 * Compacted format offsets depend on the actual content of the
155 	 * compacted xsave area which is determined by the xcomp_bv header
156 	 * field.
157 	 */
158 	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
159 	for_each_extended_xfeature(i, xcomp_bv) {
160 		if (xfeature_is_aligned64(i))
161 			offs = ALIGN(offs, 64);
162 		if (i == xfeature)
163 			break;
164 		offs += xstate_sizes[i];
165 	}
166 	return offs;
167 }
168 
169 /*
170  * Enable the extended processor state save/restore feature.
171  * Called once per CPU onlining.
172  */
173 void fpu__init_cpu_xstate(void)
174 {
175 	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
176 		return;
177 
178 	cr4_set_bits(X86_CR4_OSXSAVE);
179 
180 	/*
181 	 * Must happen after CR4 setup and before xsetbv() to allow KVM
182 	 * lazy passthrough.  Write independent of the dynamic state static
183 	 * key as that does not work on the boot CPU. This also ensures
184 	 * that any stale state is wiped out from XFD. Reset the per CPU
185 	 * xfd cache too.
186 	 */
187 	if (cpu_feature_enabled(X86_FEATURE_XFD))
188 		xfd_set_state(init_fpstate.xfd);
189 
190 	/*
191 	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
192 	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
193 	 * states can be set here.
194 	 */
195 	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
196 
197 	/*
198 	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
199 	 */
200 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
201 		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
202 				     xfeatures_mask_independent());
203 	}
204 }
205 
206 static bool xfeature_enabled(enum xfeature xfeature)
207 {
208 	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
209 }
210 
211 /*
212  * Record the offsets and sizes of various xstates contained
213  * in the XSAVE state memory layout.
214  */
215 static void __init setup_xstate_cache(void)
216 {
217 	u32 eax, ebx, ecx, edx, i;
218 	/* start at the beginning of the "extended state" */
219 	unsigned int last_good_offset = offsetof(struct xregs_state,
220 						 extended_state_area);
221 	/*
222 	 * The FP xstates and SSE xstates are legacy states. They are always
223 	 * in the fixed offsets in the xsave area in either compacted form
224 	 * or standard form.
225 	 */
226 	xstate_offsets[XFEATURE_FP]	= 0;
227 	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
228 						   xmm_space);
229 
230 	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
231 	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
232 						       xmm_space);
233 
234 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
235 		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
236 
237 		xstate_sizes[i] = eax;
238 		xstate_flags[i] = ecx;
239 
240 		/*
241 		 * If an xfeature is supervisor state, the offset in EBX is
242 		 * invalid, leave it to -1.
243 		 */
244 		if (xfeature_is_supervisor(i))
245 			continue;
246 
247 		xstate_offsets[i] = ebx;
248 
249 		/*
250 		 * In our xstate size checks, we assume that the highest-numbered
251 		 * xstate feature has the highest offset in the buffer.  Ensure
252 		 * it does.
253 		 */
254 		WARN_ONCE(last_good_offset > xstate_offsets[i],
255 			  "x86/fpu: misordered xstate at %d\n", last_good_offset);
256 
257 		last_good_offset = xstate_offsets[i];
258 	}
259 }
260 
261 static void __init print_xstate_feature(u64 xstate_mask)
262 {
263 	const char *feature_name;
264 
265 	if (cpu_has_xfeatures(xstate_mask, &feature_name))
266 		pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
267 }
268 
269 /*
270  * Print out all the supported xstate features:
271  */
272 static void __init print_xstate_features(void)
273 {
274 	print_xstate_feature(XFEATURE_MASK_FP);
275 	print_xstate_feature(XFEATURE_MASK_SSE);
276 	print_xstate_feature(XFEATURE_MASK_YMM);
277 	print_xstate_feature(XFEATURE_MASK_BNDREGS);
278 	print_xstate_feature(XFEATURE_MASK_BNDCSR);
279 	print_xstate_feature(XFEATURE_MASK_OPMASK);
280 	print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
281 	print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
282 	print_xstate_feature(XFEATURE_MASK_PKRU);
283 	print_xstate_feature(XFEATURE_MASK_PASID);
284 	print_xstate_feature(XFEATURE_MASK_CET_USER);
285 	print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
286 	print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
287 }
288 
/*
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * Warns (via WARN_ON) when @nr is outside the range
 * [FIRST_EXTENDED_XFEATURE, XFEATURE_MAX). It only warns; execution
 * continues afterwards.
 */
#define CHECK_XFEATURE(nr) do {		\
	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON(nr >= XFEATURE_MAX);	\
} while (0)
297 
298 /*
299  * Print out xstate component offsets and sizes
300  */
301 static void __init print_xstate_offset_size(void)
302 {
303 	int i;
304 
305 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
306 		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
307 			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
308 			i, xstate_sizes[i]);
309 	}
310 }
311 
312 /*
313  * This function is called only during boot time when x86 caps are not set
314  * up and alternative can not be used yet.
315  */
316 static __init void os_xrstor_booting(struct xregs_state *xstate)
317 {
318 	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
319 	u32 lmask = mask;
320 	u32 hmask = mask >> 32;
321 	int err;
322 
323 	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
324 		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
325 	else
326 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
327 
328 	/*
329 	 * We should never fault when copying from a kernel buffer, and the FPU
330 	 * state we set at boot time should be valid.
331 	 */
332 	WARN_ON_FPU(err);
333 }
334 
/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * The list is compared against XFEATURE_MASK_USER_SUPPORTED |
 * XFEATURE_MASK_SUPERVISOR_SUPPORTED by the BUILD_BUG_ON() in
 * setup_init_fpu_buf().
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM	 |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_XTILE)
355 
/*
 * Set up the xstate image representing the init state (init_fpstate).
 *
 * Does nothing without XSAVE. Otherwise it logs the supported features,
 * initializes the xcomp_bv header field of init_fpstate, loads that image
 * into the registers and finally stores the legacy FP/SSE state back with
 * FXSAVE.
 */
static void __init setup_init_fpu_buf(void)
{
	/* Build-time guard: every supported feature must be listed as handled */
	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
		     XFEATURES_INIT_FPSTATE_HANDLED);

	if (!boot_cpu_has(X86_FEATURE_XSAVE))
		return;

	print_xstate_features();

	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

	/*
	 * Init all the features state with header.xfeatures being 0x0
	 */
	os_xrstor_booting(&init_fpstate.regs.xsave);

	/*
	 * All components are now in init state. Read the state back so
	 * that init_fpstate contains all non-zero init state. This only
	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
	 * those use the init optimization which skips writing data for
	 * components in init state.
	 *
	 * XSAVE could be used, but that would require to reshuffle the
	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
	 * compaction. But doing so is a pointless exercise because most
	 * components have an all zeros init state except for the legacy
	 * ones (FP and SSE). Those can be saved with FXSAVE into the
	 * legacy area. Adding new features requires to ensure that init
	 * state is all zeroes or if not to add the necessary handling
	 * here.
	 */
	fxsave(&init_fpstate.regs.fxsave);
}
395 
396 int xfeature_size(int xfeature_nr)
397 {
398 	u32 eax, ebx, ecx, edx;
399 
400 	CHECK_XFEATURE(xfeature_nr);
401 	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
402 	return eax;
403 }
404 
405 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
406 static int validate_user_xstate_header(const struct xstate_header *hdr,
407 				       struct fpstate *fpstate)
408 {
409 	/* No unknown or supervisor features may be set */
410 	if (hdr->xfeatures & ~fpstate->user_xfeatures)
411 		return -EINVAL;
412 
413 	/* Userspace must use the uncompacted format */
414 	if (hdr->xcomp_bv)
415 		return -EINVAL;
416 
417 	/*
418 	 * If 'reserved' is shrunken to add a new field, make sure to validate
419 	 * that new field here!
420 	 */
421 	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
422 
423 	/* No reserved bits may be set */
424 	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
425 		return -EINVAL;
426 
427 	return 0;
428 }
429 
430 static void __init __xstate_dump_leaves(void)
431 {
432 	int i;
433 	u32 eax, ebx, ecx, edx;
434 	static int should_dump = 1;
435 
436 	if (!should_dump)
437 		return;
438 	should_dump = 0;
439 	/*
440 	 * Dump out a few leaves past the ones that we support
441 	 * just in case there are some goodies up there
442 	 */
443 	for (i = 0; i < XFEATURE_MAX + 10; i++) {
444 		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
445 		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
446 			XSTATE_CPUID, i, eax, ebx, ecx, edx);
447 	}
448 }
449 
/*
 * WARN_ONCE() on condition @x with an "XSAVE consistency problem: "
 * prefix and, when the condition is true, dump the CPUID leaves via
 * __xstate_dump_leaves().
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {					\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)
455 
/*
 * Compare the CPU-reported size @sz of xfeature @nr with
 * sizeof(@__struct). On mismatch, warn once and dump the CPUID leaves.
 *
 * Note: the statement expression always evaluates to true, so a size
 * mismatch is reported but not turned into a failure.
 */
#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE(sz != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})
464 
465 
466 /**
467  * check_xtile_data_against_struct - Check tile data state size.
468  *
469  * Calculate the state size by multiplying the single tile size which is
470  * recorded in a C struct, and the number of tiles that the CPU informs.
471  * Compare the provided size with the calculation.
472  *
473  * @size:	The tile data state size
474  *
475  * Returns:	0 on success, -EINVAL on mismatch.
476  */
477 static int __init check_xtile_data_against_struct(int size)
478 {
479 	u32 max_palid, palid, state_size;
480 	u32 eax, ebx, ecx, edx;
481 	u16 max_tile;
482 
483 	/*
484 	 * Check the maximum palette id:
485 	 *   eax: the highest numbered palette subleaf.
486 	 */
487 	cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
488 
489 	/*
490 	 * Cross-check each tile size and find the maximum number of
491 	 * supported tiles.
492 	 */
493 	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
494 		u16 tile_size, max;
495 
496 		/*
497 		 * Check the tile size info:
498 		 *   eax[31:16]:  bytes per title
499 		 *   ebx[31:16]:  the max names (or max number of tiles)
500 		 */
501 		cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
502 		tile_size = eax >> 16;
503 		max = ebx >> 16;
504 
505 		if (tile_size != sizeof(struct xtile_data)) {
506 			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
507 			       __stringify(XFEATURE_XTILE_DATA),
508 			       sizeof(struct xtile_data), tile_size);
509 			__xstate_dump_leaves();
510 			return -EINVAL;
511 		}
512 
513 		if (max > max_tile)
514 			max_tile = max;
515 	}
516 
517 	state_size = sizeof(struct xtile_data) * max_tile;
518 	if (size != state_size) {
519 		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
520 		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
521 		__xstate_dump_leaves();
522 		return -EINVAL;
523 	}
524 	return 0;
525 }
526 
527 /*
528  * We have a C struct for each 'xstate'.  We need to ensure
529  * that our software representation matches what the CPU
530  * tells us about the state's size.
531  */
532 static bool __init check_xstate_against_struct(int nr)
533 {
534 	/*
535 	 * Ask the CPU for the size of the state.
536 	 */
537 	int sz = xfeature_size(nr);
538 
539 	/*
540 	 * Match each CPU state with the corresponding software
541 	 * structure.
542 	 */
543 	switch (nr) {
544 	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
545 	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
546 	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
547 	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
548 	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
549 	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
550 	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
551 	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
552 	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
553 	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
554 	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
555 	default:
556 		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
557 		return false;
558 	}
559 
560 	return true;
561 }
562 
563 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
564 {
565 	unsigned int topmost = fls64(xfeatures) -  1;
566 	unsigned int offset = xstate_offsets[topmost];
567 
568 	if (topmost <= XFEATURE_SSE)
569 		return sizeof(struct xregs_state);
570 
571 	if (compacted)
572 		offset = xfeature_get_offset(xfeatures, topmost);
573 	return offset + xstate_sizes[topmost];
574 }
575 
576 /*
577  * This essentially double-checks what the cpu told us about
578  * how large the XSAVE buffer needs to be.  We are recalculating
579  * it to be safe.
580  *
581  * Independent XSAVE features allocate their own buffers and are not
582  * covered by these checks. Only the size of the buffer for task->fpu
583  * is checked here.
584  */
585 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
586 {
587 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
588 	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
589 	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
590 	int i;
591 
592 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
593 		if (!check_xstate_against_struct(i))
594 			return false;
595 		/*
596 		 * Supervisor state components can be managed only by
597 		 * XSAVES.
598 		 */
599 		if (!xsaves && xfeature_is_supervisor(i)) {
600 			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
601 			return false;
602 		}
603 	}
604 	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
605 	XSTATE_WARN_ON(size != kernel_size,
606 		       "size %u != kernel_size %u\n", size, kernel_size);
607 	return size == kernel_size;
608 }
609 
610 /*
611  * Get total size of enabled xstates in XCR0 | IA32_XSS.
612  *
613  * Note the SDM's wording here.  "sub-function 0" only enumerates
614  * the size of the *user* states.  If we use it to size a buffer
615  * that we use 'XSAVES' on, we could potentially overflow the
616  * buffer because 'XSAVES' saves system states too.
617  *
618  * This also takes compaction into account. So this works for
619  * XSAVEC as well.
620  */
621 static unsigned int __init get_compacted_size(void)
622 {
623 	unsigned int eax, ebx, ecx, edx;
624 	/*
625 	 * - CPUID function 0DH, sub-function 1:
626 	 *    EBX enumerates the size (in bytes) required by
627 	 *    the XSAVES instruction for an XSAVE area
628 	 *    containing all the state components
629 	 *    corresponding to bits currently set in
630 	 *    XCR0 | IA32_XSS.
631 	 *
632 	 * When XSAVES is not available but XSAVEC is (virt), then there
633 	 * are no supervisor states, but XSAVEC still uses compacted
634 	 * format.
635 	 */
636 	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
637 	return ebx;
638 }
639 
640 /*
641  * Get the total size of the enabled xstates without the independent supervisor
642  * features.
643  */
644 static unsigned int __init get_xsave_compacted_size(void)
645 {
646 	u64 mask = xfeatures_mask_independent();
647 	unsigned int size;
648 
649 	if (!mask)
650 		return get_compacted_size();
651 
652 	/* Disable independent features. */
653 	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
654 
655 	/*
656 	 * Ask the hardware what size is required of the buffer.
657 	 * This is the size required for the task->fpu buffer.
658 	 */
659 	size = get_compacted_size();
660 
661 	/* Re-enable independent features so XSAVES will work on them again. */
662 	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
663 
664 	return size;
665 }
666 
667 static unsigned int __init get_xsave_size_user(void)
668 {
669 	unsigned int eax, ebx, ecx, edx;
670 	/*
671 	 * - CPUID function 0DH, sub-function 0:
672 	 *    EBX enumerates the size (in bytes) required by
673 	 *    the XSAVE instruction for an XSAVE area
674 	 *    containing all the *user* state components
675 	 *    corresponding to bits currently set in XCR0.
676 	 */
677 	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
678 	return ebx;
679 }
680 
681 static int __init init_xstate_size(void)
682 {
683 	/* Recompute the context size for enabled features: */
684 	unsigned int user_size, kernel_size, kernel_default_size;
685 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
686 
687 	/* Uncompacted user space size */
688 	user_size = get_xsave_size_user();
689 
690 	/*
691 	 * XSAVES kernel size includes supervisor states and uses compacted
692 	 * format. XSAVEC uses compacted format, but does not save
693 	 * supervisor states.
694 	 *
695 	 * XSAVE[OPT] do not support supervisor states so kernel and user
696 	 * size is identical.
697 	 */
698 	if (compacted)
699 		kernel_size = get_xsave_compacted_size();
700 	else
701 		kernel_size = user_size;
702 
703 	kernel_default_size =
704 		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
705 
706 	if (!paranoid_xstate_size_valid(kernel_size))
707 		return -EINVAL;
708 
709 	fpu_kernel_cfg.max_size = kernel_size;
710 	fpu_user_cfg.max_size = user_size;
711 
712 	fpu_kernel_cfg.default_size = kernel_default_size;
713 	fpu_user_cfg.default_size =
714 		xstate_calculate_size(fpu_user_cfg.default_features, false);
715 
716 	return 0;
717 }
718 
719 /*
720  * We enabled the XSAVE hardware, but something went wrong and
721  * we can not use it.  Disable it.
722  */
723 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
724 {
725 	fpu_kernel_cfg.max_features = 0;
726 	cr4_clear_bits(X86_CR4_OSXSAVE);
727 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
728 
729 	/* Restore the legacy size.*/
730 	fpu_kernel_cfg.max_size = legacy_size;
731 	fpu_kernel_cfg.default_size = legacy_size;
732 	fpu_user_cfg.max_size = legacy_size;
733 	fpu_user_cfg.default_size = legacy_size;
734 
735 	/*
736 	 * Prevent enabling the static branch which enables writes to the
737 	 * XFD MSR.
738 	 */
739 	init_fpstate.xfd = 0;
740 
741 	fpstate_reset(&current->thread.fpu);
742 }
743 
/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 *
 * Enumerates the xfeatures via CPUID, filters them down to what the
 * kernel supports, sets up the feature masks and buffer sizes in
 * fpu_kernel_cfg/fpu_user_cfg and prepares init_fpstate. If any step
 * fails, XSAVE is disabled again and all sizes fall back to
 * @legacy_size.
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
	unsigned int eax, ebx, ecx, edx;
	u64 xfeatures;
	int err;
	int i;

	if (!boot_cpu_has(X86_FEATURE_FPU)) {
		pr_info("x86/fpu: No FPU detected\n");
		return;
	}

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		pr_info("x86/fpu: x87 FPU will use %s\n",
			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
		return;
	}

	/* XSAVE is advertised, but the xstate CPUID leaf is out of range */
	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
		WARN_ON_FPU(1);
		return;
	}

	/*
	 * Find user xstates supported by the processor.
	 */
	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

	/*
	 * Find supervisor xstates supported by the processor.
	 */
	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
		/*
		 * This indicates that something really unexpected happened
		 * with the enumeration.  Disable XSAVE and try to continue
		 * booting without it.  This is too early to BUG().
		 */
		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
		       fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
					      XFEATURE_MASK_INDEPENDENT;

	/*
	 * Clear XSAVE features that are disabled in the normal CPUID.
	 */
	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
		unsigned short cid = xsave_cpuid_features[i];

		/* Careful: X86_FEATURE_FPU is 0! */
		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
	}

	/* Without XFD, the dynamically enabled user features are dropped */
	if (!cpu_feature_enabled(X86_FEATURE_XFD))
		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Supervisor features are only kept when XSAVES is available */
	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
	else
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
					XFEATURE_MASK_SUPERVISOR_SUPPORTED;

	/* The user view is the kernel view restricted to user features */
	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

	/* Clean out dynamic features from default */
	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Store it for paranoia check at the end */
	xfeatures = fpu_kernel_cfg.max_features;

	/*
	 * Initialize the default XFD state in init_fpstate and enable the
	 * dynamic sizing mechanism if dynamic states are available.  The
	 * static key cannot be enabled here because this runs before
	 * jump_label_init(). This is delayed to an initcall.
	 */
	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

	/* Set up compaction feature bit */
	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
	    cpu_feature_enabled(X86_FEATURE_XSAVES))
		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

	/* Enable xstate instructions to be able to continue with initialization: */
	fpu__init_cpu_xstate();

	/* Cache size, offset and flags for initialization */
	setup_xstate_cache();

	err = init_xstate_size();
	if (err)
		goto out_disable;

	/* Reset the state for the current task */
	fpstate_reset(&current->thread.fpu);

	/*
	 * Update info used for ptrace frames; use standard-format size and no
	 * supervisor xstates:
	 */
	update_regset_xstate_info(fpu_user_cfg.max_size,
				  fpu_user_cfg.max_features);

	/*
	 * init_fpstate excludes dynamic states as they are large but init
	 * state is zero.
	 */
	init_fpstate.size		= fpu_kernel_cfg.default_size;
	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;

	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
			sizeof(init_fpstate.regs), init_fpstate.size);
		goto out_disable;
	}

	setup_init_fpu_buf();

	/*
	 * Paranoia check whether something in the setup modified the
	 * xfeatures mask.
	 */
	if (xfeatures != fpu_kernel_cfg.max_features) {
		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
		       xfeatures, fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * CPU capabilities initialization runs before FPU init. So
	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
	 * functional, set the feature bit so depending code works.
	 */
	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

	print_xstate_offset_size();
	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
		fpu_kernel_cfg.max_features,
		fpu_kernel_cfg.max_size,
		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
	return;

out_disable:
	/* something went wrong, try to boot without any XSAVE support */
	fpu__init_disable_system_xstate(legacy_size);
}
906 
907 /*
908  * Restore minimal FPU state after suspend:
909  */
910 void fpu__resume_cpu(void)
911 {
912 	/*
913 	 * Restore XCR0 on xsave capable CPUs:
914 	 */
915 	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
916 		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
917 
918 	/*
919 	 * Restore IA32_XSS. The same CPUID bit enumerates support
920 	 * of XSAVES and MSR_IA32_XSS.
921 	 */
922 	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
923 		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
924 				     xfeatures_mask_independent());
925 	}
926 
927 	if (fpu_state_size_dynamic())
928 		wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
929 }
930 
931 /*
932  * Given an xstate feature nr, calculate where in the xsave
933  * buffer the state is.  Callers should ensure that the buffer
934  * is valid.
935  */
936 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
937 {
938 	u64 xcomp_bv = xsave->header.xcomp_bv;
939 
940 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
941 		return NULL;
942 
943 	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
944 		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
945 			return NULL;
946 	}
947 
948 	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
949 }
950 
951 /*
952  * Given the xsave area and a state inside, this function returns the
953  * address of the state.
954  *
955  * This is the API that is called to get xstate address in either
956  * standard format or compacted format of xsave area.
957  *
958  * Note that if there is no data for the field in the xsave buffer
959  * this will return NULL.
960  *
961  * Inputs:
962  *	xstate: the thread's storage area for all FPU data
963  *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
964  *	XFEATURE_SSE, etc...)
965  * Output:
966  *	address of the state in the xsave area, or NULL if the
967  *	field is not present in the xsave buffer.
968  */
969 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
970 {
971 	/*
972 	 * Do we even *have* xsave state?
973 	 */
974 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
975 		return NULL;
976 
977 	/*
978 	 * We should not ever be requesting features that we
979 	 * have not enabled.
980 	 */
981 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
982 		return NULL;
983 
984 	/*
985 	 * This assumes the last 'xsave*' instruction to
986 	 * have requested that 'xfeature_nr' be saved.
987 	 * If it did not, we might be seeing and old value
988 	 * of the field in the buffer.
989 	 *
990 	 * This can happen because the last 'xsave' did not
991 	 * request that this feature be saved (unlikely)
992 	 * or because the "init optimization" caused it
993 	 * to not be saved.
994 	 */
995 	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
996 		return NULL;
997 
998 	return __raw_xsave_addr(xsave, xfeature_nr);
999 }
1000 EXPORT_SYMBOL_GPL(get_xsave_addr);
1001 
1002 /*
1003  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
1004  * The xsave buffer should be in standard format, not compacted (e.g. user mode
1005  * signal frames).
1006  */
1007 void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
1008 {
1009 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1010 		return NULL;
1011 
1012 	return (void __user *)xsave + xstate_offsets[xfeature_nr];
1013 }
1014 
1015 #ifdef CONFIG_ARCH_HAS_PKEYS
1016 
1017 /*
1018  * This will go out and modify PKRU register to set the access
1019  * rights for @pkey to @init_val.
1020  */
1021 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1022 			      unsigned long init_val)
1023 {
1024 	u32 old_pkru, new_pkru_bits = 0;
1025 	int pkey_shift;
1026 
1027 	/*
1028 	 * This check implies XSAVE support.  OSPKE only gets
1029 	 * set if we enable XSAVE and we enable PKU in XCR0.
1030 	 */
1031 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1032 		return -EINVAL;
1033 
1034 	/*
1035 	 * This code should only be called with valid 'pkey'
1036 	 * values originating from in-kernel users.  Complain
1037 	 * if a bad value is observed.
1038 	 */
1039 	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1040 		return -EINVAL;
1041 
1042 	/* Set the bits we need in PKRU:  */
1043 	if (init_val & PKEY_DISABLE_ACCESS)
1044 		new_pkru_bits |= PKRU_AD_BIT;
1045 	if (init_val & PKEY_DISABLE_WRITE)
1046 		new_pkru_bits |= PKRU_WD_BIT;
1047 
1048 	/* Shift the bits in to the correct place in PKRU for pkey: */
1049 	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1050 	new_pkru_bits <<= pkey_shift;
1051 
1052 	/* Get old PKRU and mask off any old bits in place: */
1053 	old_pkru = read_pkru();
1054 	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1055 
1056 	/* Write old part along with new part: */
1057 	write_pkru(old_pkru | new_pkru_bits);
1058 
1059 	return 0;
1060 }
#endif /* CONFIG_ARCH_HAS_PKEYS */
1062 
1063 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1064 			 void *init_xstate, unsigned int size)
1065 {
1066 	membuf_write(to, from_xstate ? xstate : init_xstate, size);
1067 }
1068 
/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @fpstate:	The fpstate buffer from which to copy
 * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:	The PKRU value to store in the PKRU component
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * Components which are not marked present in the resulting feature mask
 * are filled from init_fpstate (legacy area) or zeroed (extended area).
 * Whatever is left of @to after the copy is zeroed as well.
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
			       u64 xfeatures, u32 pkru_val,
			       enum xstate_copy_mode copy_mode)
{
	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
	struct xregs_state *xinit = &init_fpstate.regs.xsave;
	struct xregs_state *xsave = &fpstate->regs.xsave;
	struct xstate_header header;
	unsigned int zerofrom;
	u64 mask;
	int i;

	/* Build the header for the user buffer; only xfeatures is non-zero */
	memset(&header, 0, sizeof(header));
	header.xfeatures = xsave->header.xfeatures;

	/* Mask out the feature bits depending on copy mode */
	switch (copy_mode) {
	case XSTATE_COPY_FP:
		header.xfeatures &= XFEATURE_MASK_FP;
		break;

	case XSTATE_COPY_FX:
		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
		break;

	case XSTATE_COPY_XSAVE:
		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
		break;
	}

	/* Copy FP state up to MXCSR */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
		     &xinit->i387, off_mxcsr);

	/* Copy MXCSR when SSE or YMM are set in the feature mask */
	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
		     MXCSR_AND_FLAGS_SIZE);

	/* Copy the remaining FP state */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
		     sizeof(xsave->i387.st_space));

	/* Copy the SSE state - shared with YMM, but independently managed */
	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
		     sizeof(xsave->i387.xmm_space));

	/* The FP and FX modes end here; only XSAVE mode emits the rest */
	if (copy_mode != XSTATE_COPY_XSAVE)
		goto out;

	/* Zero the padding area */
	membuf_zero(&to, sizeof(xsave->i387.padding));

	/* Copy xsave->i387.sw_reserved */
	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

	/* Copy the user space relevant state of @xsave->header */
	membuf_write(&to, &header, sizeof(header));

	/* End of what has been written so far in the standard format layout */
	zerofrom = offsetof(struct xregs_state, extended_state_area);

	/*
	 * This 'mask' indicates which states to copy from fpstate.
	 * Those extended states that are not present in fpstate are
	 * either disabled or initialized:
	 *
	 * In non-compacted format, disabled features still occupy
	 * state space but there is no state to copy from in the
	 * compacted init_fpstate. The gap tracking will zero these
	 * states.
	 *
	 * The extended features have an all zeroes init state. Thus,
	 * remove them from 'mask' to zero those features in the user
	 * buffer instead of retrieving them from init_fpstate.
	 */
	mask = header.xfeatures;

	for_each_extended_xfeature(i, mask) {
		/*
		 * If there was a feature or alignment gap, zero the space
		 * in the destination buffer.
		 */
		if (zerofrom < xstate_offsets[i])
			membuf_zero(&to, xstate_offsets[i] - zerofrom);

		if (i == XFEATURE_PKRU) {
			struct pkru_state pkru = {0};
			/*
			 * PKRU is not necessarily up to date in the
			 * XSAVE buffer. Use the provided value.
			 */
			pkru.pkru = pkru_val;
			membuf_write(&to, &pkru, sizeof(pkru));
		} else {
			membuf_write(&to,
				     __raw_xsave_addr(xsave, i),
				     xstate_sizes[i]);
		}
		/*
		 * Keep track of the last copied state in the non-compacted
		 * target buffer for gap zeroing.
		 */
		zerofrom = xstate_offsets[i] + xstate_sizes[i];
	}

out:
	/* Zero the remainder of the destination buffer, if any */
	if (to.left)
		membuf_zero(&to, to.left);
}
1194 
1195 /**
1196  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1197  * @to:		membuf descriptor
1198  * @tsk:	The task from which to copy the saved xstate
1199  * @copy_mode:	The requested copy mode
1200  *
1201  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1202  * format, i.e. from the kernel internal hardware dependent storage format
1203  * to the requested @mode. UABI XSTATE is always uncompacted!
1204  *
1205  * It supports partial copy but @to.pos always starts from zero.
1206  */
1207 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1208 			     enum xstate_copy_mode copy_mode)
1209 {
1210 	__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1211 				  tsk->thread.fpu.fpstate->user_xfeatures,
1212 				  tsk->thread.pkru, copy_mode);
1213 }
1214 
1215 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1216 			    const void *kbuf, const void __user *ubuf)
1217 {
1218 	if (kbuf) {
1219 		memcpy(dst, kbuf + offset, size);
1220 	} else {
1221 		if (copy_from_user(dst, ubuf + offset, size))
1222 			return -EFAULT;
1223 	}
1224 	return 0;
1225 }
1226 
1227 
/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:	The fpstate buffer to copy to
 * @kbuf:	The UABI format buffer, if it comes from the kernel
 * @ubuf:	The UABI format buffer, if it comes from userspace
 * @pkru:	The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.	When called from sigreturn the PKRU register will be restored from
 *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *	@fpstate is sufficient to cover this case, but the caller will also
 *	pass a pointer to the thread_struct's pkru field in @pkru and updating
 *	it is harmless.
 * 2.	When called from ptrace the PKRU register will be restored from the
 *	thread_struct's pkru field. A pointer to that is passed in @pkru.
 *	The kernel will restore it manually, so the XRSTOR behavior that resets
 *	the PKRU register to the hardware init value (0) if the corresponding
 *	xfeatures bit is not set is emulated here.
 * 3.	When called from KVM the PKRU register will be restored from the vcpu's
 *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *	bit is not set.
 *
 * Return: 0 on success, -EFAULT if reading from the source buffer failed,
 * -EINVAL if the header does not pass validate_user_xstate_header() or
 * if MXCSR has bits set outside of mxcsr_feature_mask.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
			       const void __user *ubuf, u32 *pkru)
{
	struct xregs_state *xsave = &fpstate->regs.xsave;
	unsigned int offset, size;
	struct xstate_header hdr;
	u64 mask;
	int i;

	/* Fetch and validate the header before touching any component */
	offset = offsetof(struct xregs_state, header);
	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
		return -EFAULT;

	if (validate_user_xstate_header(&hdr, fpstate))
		return -EINVAL;

	/* Validate MXCSR when any of the related features is in use */
	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
	if (hdr.xfeatures & mask) {
		/* mxcsr[0] is MXCSR, mxcsr[1] ends up in mxcsr_mask */
		u32 mxcsr[2];

		offset = offsetof(struct fxregs_state, mxcsr);
		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
			return -EFAULT;

		/* Reserved bits in MXCSR must be zero. */
		if (mxcsr[0] & ~mxcsr_feature_mask)
			return -EINVAL;

		/* SSE and YMM require MXCSR even when FP is not in use. */
		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
			xsave->i387.mxcsr = mxcsr[0];
			xsave->i387.mxcsr_mask = mxcsr[1];
		}
	}

	/*
	 * Copy every component which the incoming header marks as present
	 * from its standard format offset in the source buffer to its
	 * location in the kernel buffer.
	 */
	for (i = 0; i < XFEATURE_MAX; i++) {
		mask = BIT_ULL(i);

		if (hdr.xfeatures & mask) {
			void *dst = __raw_xsave_addr(xsave, i);

			offset = xstate_offsets[i];
			size = xstate_sizes[i];

			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
				return -EFAULT;
		}
	}

	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
		struct pkru_state *xpkru;

		/*
		 * NOTE: @pkru is dereferenced without a NULL check here, so
		 * callers must pass a valid pointer whenever the PKRU bit is
		 * set in the incoming header.
		 */
		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
		*pkru = xpkru->pkru;
	} else {
		/*
		 * KVM may pass NULL here to indicate that it does not need
		 * PKRU updated.
		 */
		if (pkru)
			*pkru = 0;
	}

	/*
	 * The state that came in from userspace was user-state only.
	 * Mask all the user states out of 'xfeatures':
	 */
	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

	/*
	 * Add back in the features that came in from userspace:
	 */
	xsave->header.xfeatures |= hdr.xfeatures;

	return 0;
}
1333 
/*
 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
 * format and copy to the target thread. Used by ptrace and KVM.
 *
 * The user space buffer argument of copy_uabi_to_xstate() is passed as
 * NULL, so all data is read from @kbuf.  The return value is handed
 * through unchanged.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}
1342 
/*
 * Convert from a sigreturn standard-format user-space buffer to kernel
 * XSAVE[S] format and copy to the target thread. This is called from the
 * sigreturn() and rt_sigreturn() system calls.
 *
 * The kernel buffer argument of copy_uabi_to_xstate() is passed as NULL,
 * so all data is read from @ubuf.  The destination is @tsk's current
 * fpstate and the PKRU value is stored in @tsk->thread.pkru.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
				      const void __user *ubuf)
{
	return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}
1353 
1354 static bool validate_independent_components(u64 mask)
1355 {
1356 	u64 xchk;
1357 
1358 	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1359 		return false;
1360 
1361 	xchk = ~xfeatures_mask_independent();
1362 
1363 	if (WARN_ON_ONCE(!mask || mask & xchk))
1364 		return false;
1365 
1366 	return true;
1367 }
1368 
1369 /**
1370  * xsaves - Save selected components to a kernel xstate buffer
1371  * @xstate:	Pointer to the buffer
1372  * @mask:	Feature mask to select the components to save
1373  *
1374  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1375  * XSAVES does not write the full xstate header. Before first use the
1376  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1377  * can #GP.
1378  *
1379  * The feature mask must be a subset of the independent features.
1380  */
1381 void xsaves(struct xregs_state *xstate, u64 mask)
1382 {
1383 	int err;
1384 
1385 	if (!validate_independent_components(mask))
1386 		return;
1387 
1388 	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1389 	WARN_ON_ONCE(err);
1390 }
1391 
1392 /**
1393  * xrstors - Restore selected components from a kernel xstate buffer
1394  * @xstate:	Pointer to the buffer
1395  * @mask:	Feature mask to select the components to restore
1396  *
1397  * The @xstate buffer must be 64 byte aligned and correctly initialized
1398  * otherwise XRSTORS from that buffer can #GP.
1399  *
1400  * Proper usage is to restore the state which was saved with
1401  * xsaves() into @xstate.
1402  *
1403  * The feature mask must be a subset of the independent features.
1404  */
1405 void xrstors(struct xregs_state *xstate, u64 mask)
1406 {
1407 	int err;
1408 
1409 	if (!validate_independent_components(mask))
1410 		return;
1411 
1412 	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1413 	WARN_ON_ONCE(err);
1414 }
1415 
1416 #if IS_ENABLED(CONFIG_KVM)
1417 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1418 {
1419 	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1420 
1421 	if (addr)
1422 		memset(addr, 0, xstate_sizes[xfeature]);
1423 }
1424 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1425 #endif
1426 
1427 #ifdef CONFIG_X86_64
1428 
1429 #ifdef CONFIG_X86_DEBUG_FPU
1430 /*
1431  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1432  * can safely operate on the @fpstate buffer.
1433  */
1434 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1435 {
1436 	u64 xfd = __this_cpu_read(xfd_state);
1437 
1438 	if (fpstate->xfd == xfd)
1439 		return true;
1440 
1441 	 /*
1442 	  * The XFD MSR does not match fpstate->xfd. That's invalid when
1443 	  * the passed in fpstate is current's fpstate.
1444 	  */
1445 	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1446 		return false;
1447 
1448 	/*
1449 	 * XRSTOR(S) from init_fpstate are always correct as it will just
1450 	 * bring all components into init state and not read from the
1451 	 * buffer. XSAVE(S) raises #PF after init.
1452 	 */
1453 	if (fpstate == &init_fpstate)
1454 		return rstor;
1455 
1456 	/*
1457 	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1458 	 * XRSTORS(S): fpu_swap_kvm_fpstate()
1459 	 */
1460 
1461 	/*
1462 	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1463 	 * the buffer area for XFD-disabled state components.
1464 	 */
1465 	mask &= ~xfd;
1466 
1467 	/*
1468 	 * Remove features which are valid in fpstate. They
1469 	 * have space allocated in fpstate.
1470 	 */
1471 	mask &= ~fpstate->xfeatures;
1472 
1473 	/*
1474 	 * Any remaining state components in 'mask' might be written
1475 	 * by XSAVE/XRSTOR. Fail validation it found.
1476 	 */
1477 	return !mask;
1478 }
1479 
/*
 * Debug helper: warn once if xstate_op_valid() rejects an XSAVE*
 * (@rstor == false) or XRSTOR* (@rstor == true) operation with @mask
 * on @fpstate.
 */
void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
}
1484 #endif /* CONFIG_X86_DEBUG_FPU */
1485 
1486 static int __init xfd_update_static_branch(void)
1487 {
1488 	/*
1489 	 * If init_fpstate.xfd has bits set then dynamic features are
1490 	 * available and the dynamic sizing must be enabled.
1491 	 */
1492 	if (init_fpstate.xfd)
1493 		static_branch_enable(&__fpu_state_size_dynamic);
1494 	return 0;
1495 }
1496 arch_initcall(xfd_update_static_branch)
1497 
1498 void fpstate_free(struct fpu *fpu)
1499 {
1500 	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1501 		vfree(fpu->fpstate);
1502 }
1503 
/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:	A bitmap of xstate features which extend the enabled features
 *		of that task
 * @ksize:	The required size for the kernel buffer
 * @usize:	The required size for user space buffers
 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
 *
 * A new zeroed fpstate is allocated and set up with the feature masks of
 * the old one extended by @xfeatures. It then replaces the old fpstate of
 * current (or of @guest_fpu). The old fpstate is freed if it was vmalloc'ed.
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
			   unsigned int usize, struct fpu_guest *guest_fpu)
{
	struct fpu *fpu = &current->thread.fpu;
	struct fpstate *curfps, *newfps = NULL;
	unsigned int fpsize;
	bool in_use;

	/* Register buffer size plus the 64 byte aligned offset of ->regs */
	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

	newfps = vzalloc(fpsize);
	if (!newfps)
		return -ENOMEM;
	newfps->size = ksize;
	newfps->user_size = usize;
	/* Marks the buffer as one which has to be vfree()'d later on */
	newfps->is_valloc = true;

	/*
	 * When a guest FPU is supplied, use @guest_fpu->fpstate
	 * as reference independent whether it is in use or not.
	 */
	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

	/* Determine whether @curfps is the active fpstate */
	in_use = fpu->fpstate == curfps;

	if (guest_fpu) {
		/* Carry the guest related properties over to the new buffer */
		newfps->is_guest = true;
		newfps->is_confidential = curfps->is_confidential;
		newfps->in_use = curfps->in_use;
		guest_fpu->xfeatures |= xfeatures;
		guest_fpu->uabi_size = usize;
	}

	fpregs_lock();
	/*
	 * If @curfps is in use, ensure that the current state is in the
	 * registers before swapping fpstate as that might invalidate it
	 * due to layout changes.
	 */
	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
		fpregs_restore_userregs();

	/* Extend the feature masks and clear the new features in XFD */
	newfps->xfeatures = curfps->xfeatures | xfeatures;
	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
	newfps->xfd = curfps->xfd & ~xfeatures;

	/* Do the final updates within the locked region */
	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

	if (guest_fpu) {
		guest_fpu->fpstate = newfps;
		/* If curfps is active, update the FPU fpstate pointer */
		if (in_use)
			fpu->fpstate = newfps;
	} else {
		fpu->fpstate = newfps;
	}

	if (in_use)
		xfd_update_state(fpu->fpstate);
	fpregs_unlock();

	/* Only free valloc'ed state */
	if (curfps && curfps->is_valloc)
		vfree(curfps);

	return 0;
}
1588 
1589 static int validate_sigaltstack(unsigned int usize)
1590 {
1591 	struct task_struct *thread, *leader = current->group_leader;
1592 	unsigned long framesize = get_sigframe_size();
1593 
1594 	lockdep_assert_held(&current->sighand->siglock);
1595 
1596 	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
1597 	framesize -= fpu_user_cfg.max_size;
1598 	framesize += usize;
1599 	for_each_thread(leader, thread) {
1600 		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1601 			return -ENOSPC;
1602 	}
1603 	return 0;
1604 }
1605 
/*
 * Extend the permissions of current's thread group by @requested.
 *
 * @permitted:	The features which are permitted already
 * @requested:	The features which are requested in addition
 * @guest:	True to update the guest permissions, false for the host
 *
 * Computes the kernel and user state sizes for the combined feature set
 * and stores them together with the new permission mask in the group
 * leader's fpu. For the host the new user state size is checked against
 * the alternate signal stacks first.
 *
 * Returns 0 on success (or if nothing had to be done), otherwise the
 * error code of validate_sigaltstack().
 */
static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
	/*
	 * This deliberately does not exclude !XSAVES as we still might
	 * decide to optionally context switch XCR0 or talk the silicon
	 * vendors into extending XFD for the pre AMX states, especially
	 * AVX512.
	 */
	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
	struct fpu *fpu = &current->group_leader->thread.fpu;
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	u64 mask;
	int ret = 0;

	/* Check whether fully enabled */
	if ((permitted & requested) == requested)
		return 0;

	/* Calculate the resulting kernel state size */
	mask = permitted | requested;
	/* Take supervisor states into account on the host */
	if (!guest)
		mask |= xfeatures_mask_supervisor();
	ksize = xstate_calculate_size(mask, compacted);

	/* Calculate the resulting user state size */
	mask &= XFEATURE_MASK_USER_SUPPORTED;
	usize = xstate_calculate_size(mask, false);

	/* Only the host path is checked against the alternate signal stacks */
	if (!guest) {
		ret = validate_sigaltstack(usize);
		if (ret)
			return ret;
	}

	perm = guest ? &fpu->guest_perm : &fpu->perm;
	/*
	 * Note that @mask has been reduced to XFEATURE_MASK_USER_SUPPORTED
	 * above, so that is what gets stored as the permission mask.
	 */
	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
	WRITE_ONCE(perm->__state_perm, mask);
	/* Protected by sighand lock */
	perm->__state_size = ksize;
	perm->__user_state_size = usize;
	return ret;
}
1650 
/*
 * Permissions array to map facilities with more than one component.
 *
 * Indexed by the facility number which is handed to
 * xstate_request_perm(). The value is the feature mask which is requested
 * for that facility. Entries which are not listed here are zero and such
 * requests are rejected with -EOPNOTSUPP.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};
1657 
1658 static int xstate_request_perm(unsigned long idx, bool guest)
1659 {
1660 	u64 permitted, requested;
1661 	int ret;
1662 
1663 	if (idx >= XFEATURE_MAX)
1664 		return -EINVAL;
1665 
1666 	/*
1667 	 * Look up the facility mask which can require more than
1668 	 * one xstate component.
1669 	 */
1670 	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1671 	requested = xstate_prctl_req[idx];
1672 	if (!requested)
1673 		return -EOPNOTSUPP;
1674 
1675 	if ((fpu_user_cfg.max_features & requested) != requested)
1676 		return -EOPNOTSUPP;
1677 
1678 	/* Lockless quick check */
1679 	permitted = xstate_get_group_perm(guest);
1680 	if ((permitted & requested) == requested)
1681 		return 0;
1682 
1683 	/* Protect against concurrent modifications */
1684 	spin_lock_irq(&current->sighand->siglock);
1685 	permitted = xstate_get_group_perm(guest);
1686 
1687 	/* First vCPU allocation locks the permissions. */
1688 	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1689 		ret = -EBUSY;
1690 	else
1691 		ret = __xstate_request_perm(permitted, requested, guest);
1692 	spin_unlock_irq(&current->sighand->siglock);
1693 	return ret;
1694 }
1695 
/*
 * Enable the dynamic features which are flagged in @xfd_err for current
 * (@guest_fpu == NULL) or for the given guest FPU container.
 *
 * Returns 0 on success and also if @xfd_err contains no dynamic user
 * feature (for the host an error message is printed once in that case),
 * -EPERM if the features are not permitted for the thread group, and
 * -EFAULT if the reallocation of the fpstate failed.
 */
int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
	/* Only dynamic user features are of interest here */
	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
	struct fpu_state_perm *perm;
	unsigned int ksize, usize;
	struct fpu *fpu;

	if (!xfd_event) {
		if (!guest_fpu)
			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
		return 0;
	}

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);

	/* If not permitted let it die */
	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
		spin_unlock_irq(&current->sighand->siglock);
		return -EPERM;
	}

	/* The permissions and the related sizes live in the group leader */
	fpu = &current->group_leader->thread.fpu;
	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
	ksize = perm->__state_size;
	usize = perm->__user_state_size;

	/*
	 * The feature is permitted. State size is sufficient.  Dropping
	 * the lock is safe here even if more features are added from
	 * another task, the retrieved buffer sizes are valid for the
	 * currently requested feature(s).
	 */
	spin_unlock_irq(&current->sighand->siglock);

	/*
	 * Try to allocate a new fpstate. If that fails there is no way
	 * out.
	 */
	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
		return -EFAULT;
	return 0;
}
1739 
/*
 * Host variant of __xfd_enable_feature(): no guest FPU container is
 * passed, so the request applies to current's own fpstate. The return
 * value is handed through unchanged.
 */
int xfd_enable_feature(u64 xfd_err)
{
	return __xfd_enable_feature(xfd_err, NULL);
}
1744 
1745 #else /* CONFIG_X86_64 */
/*
 * Stub for builds without CONFIG_X86_64: permission requests are always
 * refused with -EPERM.
 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	return -EPERM;
}
1750 #endif  /* !CONFIG_X86_64 */
1751 
/*
 * Exported accessor for the guest permissions: returns what
 * xstate_get_group_perm() reports when asked for the guest side.
 */
u64 xstate_get_guest_group_perm(void)
{
	return xstate_get_group_perm(true);
}
EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1757 
1758 /**
1759  * fpu_xstate_prctl - xstate permission operations
1760  * @option:	A subfunction of arch_prctl()
1761  * @arg2:	option argument
1762  * Return:	0 if successful; otherwise, an error code
1763  *
1764  * Option arguments:
1765  *
1766  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1767  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1768  * ARCH_REQ_XCOMP_PERM: Facility number requested
1769  *
1770  * For facilities which require more than one XSTATE component, the request
1771  * must be the highest state component number related to that facility,
1772  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1773  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1774  */
1775 long fpu_xstate_prctl(int option, unsigned long arg2)
1776 {
1777 	u64 __user *uptr = (u64 __user *)arg2;
1778 	u64 permitted, supported;
1779 	unsigned long idx = arg2;
1780 	bool guest = false;
1781 
1782 	switch (option) {
1783 	case ARCH_GET_XCOMP_SUPP:
1784 		supported = fpu_user_cfg.max_features |	fpu_user_cfg.legacy_features;
1785 		return put_user(supported, uptr);
1786 
1787 	case ARCH_GET_XCOMP_PERM:
1788 		/*
1789 		 * Lockless snapshot as it can also change right after the
1790 		 * dropping the lock.
1791 		 */
1792 		permitted = xstate_get_host_group_perm();
1793 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1794 		return put_user(permitted, uptr);
1795 
1796 	case ARCH_GET_XCOMP_GUEST_PERM:
1797 		permitted = xstate_get_guest_group_perm();
1798 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1799 		return put_user(permitted, uptr);
1800 
1801 	case ARCH_REQ_XCOMP_GUEST_PERM:
1802 		guest = true;
1803 		fallthrough;
1804 
1805 	case ARCH_REQ_XCOMP_PERM:
1806 		if (!IS_ENABLED(CONFIG_X86_64))
1807 			return -EOPNOTSUPP;
1808 
1809 		return xstate_request_perm(idx, guest);
1810 
1811 	default:
1812 		return -EINVAL;
1813 	}
1814 }
1815 
1816 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1817 /*
1818  * Report the amount of time elapsed in millisecond since last AVX512
1819  * use in the task.
1820  */
1821 static void avx512_status(struct seq_file *m, struct task_struct *task)
1822 {
1823 	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1824 	long delta;
1825 
1826 	if (!timestamp) {
1827 		/*
1828 		 * Report -1 if no AVX512 usage
1829 		 */
1830 		delta = -1;
1831 	} else {
1832 		delta = (long)(jiffies - timestamp);
1833 		/*
1834 		 * Cap to LONG_MAX if time difference > LONG_MAX
1835 		 */
1836 		if (delta < 0)
1837 			delta = LONG_MAX;
1838 		delta = jiffies_to_msecs(delta);
1839 	}
1840 
1841 	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1842 	seq_putc(m, '\n');
1843 }
1844 
1845 /*
1846  * Report architecture specific information
1847  */
1848 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1849 			struct pid *pid, struct task_struct *task)
1850 {
1851 	/*
1852 	 * Report AVX512 state if the processor and build option supported.
1853 	 */
1854 	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1855 		avx512_status(m, task);
1856 
1857 	return 0;
1858 }
1859 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
1860 
1861 #ifdef CONFIG_COREDUMP
/* Name of the xsave layout ELF note; its size includes the trailing NUL */
static const char owner_name[] = "LINUX";
1863 
1864 /*
1865  * Dump type, size, offset and flag values for every xfeature that is present.
1866  */
1867 static int dump_xsave_layout_desc(struct coredump_params *cprm)
1868 {
1869 	int num_records = 0;
1870 	int i;
1871 
1872 	for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1873 		struct x86_xfeat_component xc = {
1874 			.type   = i,
1875 			.size   = xstate_sizes[i],
1876 			.offset = xstate_offsets[i],
1877 			/* reserved for future use */
1878 			.flags  = 0,
1879 		};
1880 
1881 		if (!dump_emit(cprm, &xc, sizeof(xc)))
1882 			return 0;
1883 
1884 		num_records++;
1885 	}
1886 	return num_records;
1887 }
1888 
1889 static u32 get_xsave_desc_size(void)
1890 {
1891 	u32 cnt = 0;
1892 	u32 i;
1893 
1894 	for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1895 		cnt++;
1896 
1897 	return cnt * (sizeof(struct x86_xfeat_component));
1898 }
1899 
1900 int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1901 {
1902 	int num_records = 0;
1903 	struct elf_note en;
1904 
1905 	if (!fpu_user_cfg.max_features)
1906 		return 0;
1907 
1908 	en.n_namesz = sizeof(owner_name);
1909 	en.n_descsz = get_xsave_desc_size();
1910 	en.n_type = NT_X86_XSAVE_LAYOUT;
1911 
1912 	if (!dump_emit(cprm, &en, sizeof(en)))
1913 		return 1;
1914 	if (!dump_emit(cprm, owner_name, en.n_namesz))
1915 		return 1;
1916 	if (!dump_align(cprm, 4))
1917 		return 1;
1918 
1919 	num_records = dump_xsave_layout_desc(cprm);
1920 	if (!num_records)
1921 		return 1;
1922 
1923 	/* Total size should be equal to the number of records */
1924 	if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1925 		return 1;
1926 
1927 	return 0;
1928 }
1929 
1930 int elf_coredump_extra_notes_size(void)
1931 {
1932 	int size;
1933 
1934 	if (!fpu_user_cfg.max_features)
1935 		return 0;
1936 
1937 	/* .note header */
1938 	size  = sizeof(struct elf_note);
1939 	/*  Name plus alignment to 4 bytes */
1940 	size += roundup(sizeof(owner_name), 4);
1941 	size += get_xsave_desc_size();
1942 
1943 	return size;
1944 }
1945 #endif /* CONFIG_COREDUMP */
1946