xref: /linux/arch/x86/kernel/fpu/xstate.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/kvm_types.h>
12 #include <linux/nospec.h>
13 #include <linux/pkeys.h>
14 #include <linux/seq_file.h>
15 #include <linux/proc_fs.h>
16 #include <linux/vmalloc.h>
17 #include <linux/coredump.h>
18 #include <linux/sort.h>
19 
20 #include <asm/fpu/api.h>
21 #include <asm/fpu/regset.h>
22 #include <asm/fpu/signal.h>
23 #include <asm/fpu/xcr.h>
24 
25 #include <asm/cpuid/api.h>
26 #include <asm/msr.h>
27 #include <asm/tlbflush.h>
28 #include <asm/prctl.h>
29 #include <asm/elf.h>
30 
31 #include <uapi/asm/elf.h>
32 
33 #include "context.h"
34 #include "internal.h"
35 #include "legacy.h"
36 #include "xstate.h"
37 
38 #define for_each_extended_xfeature(bit, mask)				\
39 	(bit) = FIRST_EXTENDED_XFEATURE;				\
40 	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
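
/*
 * Note: this macro expands to an assignment statement followed by a
 * for-loop, so it cannot be used as the sole body of an un-braced
 * if/else. Typical usage (illustrative):
 *
 *	unsigned int nr;
 *
 *	for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features)
 *		pr_info("xfeature %u enabled\n", nr);
 */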
41 
42 /*
43  * Although we spell it out here, the Processor Trace
44  * xfeature is completely unused.  Linux uses other mechanisms
45  * to save/restore PT state.
46  */
47 static const char *xfeature_names[] =
48 {
49 	"x87 floating point registers",
50 	"SSE registers",
51 	"AVX registers",
52 	"MPX bounds registers",
53 	"MPX CSR",
54 	"AVX-512 opmask",
55 	"AVX-512 Hi256",
56 	"AVX-512 ZMM_Hi256",
57 	"Processor Trace (unused)",
58 	"Protection Keys User registers",
59 	"PASID state",
60 	"Control-flow User registers",
61 	"Control-flow Kernel registers (KVM only)",
62 	"unknown xstate feature",
63 	"unknown xstate feature",
64 	"unknown xstate feature",
65 	"unknown xstate feature",
66 	"AMX Tile config",
67 	"AMX Tile data",
68 	"APX registers",
69 	"unknown xstate feature",
70 };
71 
72 static unsigned short xsave_cpuid_features[] __initdata = {
73 	[XFEATURE_FP]				= X86_FEATURE_FPU,
74 	[XFEATURE_SSE]				= X86_FEATURE_XMM,
75 	[XFEATURE_YMM]				= X86_FEATURE_AVX,
76 	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
77 	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
78 	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
79 	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
80 	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
81 	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
82 	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
83 	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
84 	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
85 	[XFEATURE_CET_KERNEL]			= X86_FEATURE_SHSTK,
86 	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
87 	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
88 	[XFEATURE_APX]				= X86_FEATURE_APX,
89 };
90 
91 static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
92 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
93 static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
94 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
95 static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
96 
97 /*
98  * Ordering of xstate components in uncompacted format:  The xfeature
99  * number does not necessarily indicate its position in the XSAVE buffer.
100  * This array defines the traversal order of xstate features.
101  */
102 static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init =
103 	{ [ 0 ... XFEATURE_MAX - 1] = -1};
104 
105 static inline unsigned int next_xfeature_order(unsigned int i, u64 mask)
106 {
107 	for (; xfeature_uncompact_order[i] != -1; i++) {
108 		if (mask & BIT_ULL(xfeature_uncompact_order[i]))
109 			break;
110 	}
111 
112 	return i;
113 }
114 
115 /* Iterate xstate features in uncompacted order: */
116 #define for_each_extended_xfeature_in_order(i, mask)	\
117 	for (i = 0;					\
118 	     i = next_xfeature_order(i, mask),		\
119 	     xfeature_uncompact_order[i] != -1;		\
120 	     i++)
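
/*
 * Note: the iterator 'i' indexes into xfeature_uncompact_order[], not
 * the xfeature numbers themselves, and the walk stops at the first -1
 * sentinel. Callers read xfeature_uncompact_order[i] to get the actual
 * feature number, as the users below do.
 */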
121 
122 #define XSTATE_FLAG_SUPERVISOR	BIT(0)
123 #define XSTATE_FLAG_ALIGNED64	BIT(1)
124 
125 /*
126  * Return whether the system supports a given xfeature.
127  *
128  * Also return the name of the (most advanced) feature that the caller requested:
129  * Also return the name of the most advanced feature that the caller requested.
130 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
131 {
132 	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
133 
134 	if (unlikely(feature_name)) {
135 		long xfeature_idx, max_idx;
136 		u64 xfeatures_print;
137 		/*
138 		 * Use fls64() so that we can print the most advanced
139 		 * feature that was requested but is missing. If a driver
140 		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
141 		 * missing AVX feature - this is the most informative message
142 		 * to users:
143 		 */
144 		if (xfeatures_missing)
145 			xfeatures_print = xfeatures_missing;
146 		else
147 			xfeatures_print = xfeatures_needed;
148 
149 		xfeature_idx = fls64(xfeatures_print)-1;
150 		max_idx = ARRAY_SIZE(xfeature_names)-1;
151 		xfeature_idx = min(xfeature_idx, max_idx);
152 
153 		*feature_name = xfeature_names[xfeature_idx];
154 	}
155 
156 	if (xfeatures_missing)
157 		return 0;
158 
159 	return 1;
160 }
161 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
162 
163 static bool xfeature_is_aligned64(int xfeature_nr)
164 {
165 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
166 }
167 
168 static bool xfeature_is_supervisor(int xfeature_nr)
169 {
170 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
171 }
172 
173 static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
174 {
175 	unsigned int offs, i;
176 
177 	/*
178 	 * Non-compacted format and legacy features use the cached fixed
179 	 * offsets.
180 	 */
181 	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
182 	    xfeature <= XFEATURE_SSE)
183 		return xstate_offsets[xfeature];
184 
185 	/*
186 	 * Compacted format offsets depend on the actual content of the
187 	 * compacted xsave area which is determined by the xcomp_bv header
188 	 * field.
189 	 */
190 	offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
191 	for_each_extended_xfeature(i, xcomp_bv) {
192 		if (xfeature_is_aligned64(i))
193 			offs = ALIGN(offs, 64);
194 		if (i == xfeature)
195 			break;
196 		offs += xstate_sizes[i];
197 	}
198 	return offs;
199 }
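
/*
 * Worked example with illustrative numbers: for xcomp_bv = FP | SSE |
 * YMM | XTILE_DATA, the extended area starts at FXSAVE_SIZE (512) +
 * XSAVE_HDR_SIZE (64) = 576. YMM occupies 256 bytes, so XTILE_DATA,
 * which is 64-byte aligned, lands at ALIGN(576 + 256, 64) = 832.
 */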
200 
201 /*
202  * Enable the extended processor state save/restore feature.
203  * Called once per CPU onlining.
204  */
205 void fpu__init_cpu_xstate(void)
206 {
207 	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
208 		return;
209 
210 	cr4_set_bits(X86_CR4_OSXSAVE);
211 
212 	/*
213 	 * Must happen after CR4 setup and before xsetbv() to allow KVM
214 	 * lazy passthrough.  The write is done independent of the dynamic
215 	 * state static key, as that does not work on the boot CPU. This
216 	 * also ensures that any stale XFD state is wiped out. Reset the
217 	 * per-CPU xfd cache too.
218 	 */
219 	if (cpu_feature_enabled(X86_FEATURE_XFD))
220 		xfd_set_state(init_fpstate.xfd);
221 
222 	/*
223 	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
224 	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
225 	 * states can be set here.
226 	 */
227 	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
228 
229 	/*
230 	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
231 	 */
232 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
233 		wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() |
234 				     xfeatures_mask_independent());
235 	}
236 }
237 
238 static bool xfeature_enabled(enum xfeature xfeature)
239 {
240 	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
241 }
242 
243 static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2)
244 {
245 	return  xstate_offsets[*(unsigned int *)xfeature1] -
246 		xstate_offsets[*(unsigned int *)xfeature2];
247 }
248 
249 /*
250  * Record the offsets and sizes of various xstates contained
251  * in the XSAVE state memory layout. Also, create an ordered
252  * list of xfeatures for handling out-of-order offsets.
253  */
254 static void __init setup_xstate_cache(void)
255 {
256 	u32 eax, ebx, ecx, edx, xfeature, i = 0;
257 	/*
258 	 * The FP xstates and SSE xstates are legacy states. They are always
259 	 * in the fixed offsets in the xsave area in either compacted form
260 	 * or standard form.
261 	 */
262 	xstate_offsets[XFEATURE_FP]	= 0;
263 	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
264 						   xmm_space);
265 
266 	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
267 	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
268 						       xmm_space);
269 
270 	for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) {
271 		cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx);
272 
273 		xstate_sizes[xfeature] = eax;
274 		xstate_flags[xfeature] = ecx;
275 
276 		/*
277 		 * If an xfeature is supervisor state, the offset in EBX is
278 		 * invalid, so leave it as -1.
279 		 */
280 		if (xfeature_is_supervisor(xfeature))
281 			continue;
282 
283 		xstate_offsets[xfeature] = ebx;
284 
285 		/* Populate the list of xfeatures before sorting */
286 		xfeature_uncompact_order[i++] = xfeature;
287 	}
288 
289 	/*
290 	 * Sort xfeatures by their offsets to support out-of-order
291 	 * offsets in the uncompacted format.
292 	 */
293 	sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL);
294 }
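
/*
 * The sort matters because CPUID may enumerate a higher-numbered
 * xfeature at a lower uncompacted offset than a lower-numbered one;
 * walking by feature number alone would then not traverse the buffer
 * front to back.
 */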
295 
296 /*
297  * Print out all the supported xstate features:
298  */
299 static void __init print_xstate_features(void)
300 {
301 	int i;
302 
303 	for (i = 0; i < XFEATURE_MAX; i++) {
304 		u64 mask = BIT_ULL(i);
305 		const char *name;
306 
307 		if (cpu_has_xfeatures(mask, &name))
308 			pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
309 	}
310 }
311 
312 /*
313  * This check is important because it is easy to confuse an
314  * XFEATURE_* feature number with an XFEATURE_MASK_* bitmask.
315  */
316 #define CHECK_XFEATURE(nr) do {		\
317 	WARN_ON(nr < FIRST_EXTENDED_XFEATURE);	\
318 	WARN_ON(nr >= XFEATURE_MAX);	\
319 } while (0)
320 
321 /*
322  * Print out xstate component offsets and sizes
323  */
324 static void __init print_xstate_offset_size(void)
325 {
326 	int i;
327 
328 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
329 		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
330 			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
331 			i, xstate_sizes[i]);
332 	}
333 }
334 
335 /*
336  * This function is called only during boot, when the x86 capability
337  * bits are not yet set up and alternatives cannot be used yet.
338  */
339 static __init void os_xrstor_booting(struct xregs_state *xstate)
340 {
341 	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
342 	u32 lmask = mask;
343 	u32 hmask = mask >> 32;
344 	int err;
345 
346 	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
347 		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
348 	else
349 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
350 
351 	/*
352 	 * We should never fault when copying from a kernel buffer, and the FPU
353 	 * state we set at boot time should be valid.
354 	 */
355 	WARN_ON_FPU(err);
356 }
357 
358 /*
359  * All supported features have either an all-zeros init state or are
360  * handled individually in setup_init_fpu_buf(). This is an explicit
361  * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
362  * newly added supported features at build time and make people
363  * actually look at the init state for the new feature.
364  */
365 #define XFEATURES_INIT_FPSTATE_HANDLED		\
366 	(XFEATURE_MASK_FP |			\
367 	 XFEATURE_MASK_SSE |			\
368 	 XFEATURE_MASK_YMM |			\
369 	 XFEATURE_MASK_OPMASK |			\
370 	 XFEATURE_MASK_ZMM_Hi256 |		\
371 	 XFEATURE_MASK_Hi16_ZMM	 |		\
372 	 XFEATURE_MASK_PKRU |			\
373 	 XFEATURE_MASK_BNDREGS |		\
374 	 XFEATURE_MASK_BNDCSR |			\
375 	 XFEATURE_MASK_PASID |			\
376 	 XFEATURE_MASK_CET_USER |		\
377 	 XFEATURE_MASK_CET_KERNEL |		\
378 	 XFEATURE_MASK_XTILE |			\
379 	 XFEATURE_MASK_APX)
380 
381 /*
382  * setup the xstate image representing the init state
383  */
384 static void __init setup_init_fpu_buf(void)
385 {
386 	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
387 		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
388 		     XFEATURES_INIT_FPSTATE_HANDLED);
389 
390 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
391 		return;
392 
393 	print_xstate_features();
394 
395 	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
396 
397 	/*
398 	 * Init all the feature states with header.xfeatures being 0x0
399 	 */
400 	os_xrstor_booting(&init_fpstate.regs.xsave);
401 
402 	/*
403 	 * All components are now in init state. Read the state back so
404 	 * that init_fpstate contains all non-zero init state. This only
405 	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
406 	 * those use the init optimization which skips writing data for
407 	 * components in init state.
408 	 *
409 	 * XSAVE could be used, but that would require reshuffling the
410 	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
411 	 * compaction. Doing so would be a pointless exercise because most
412 	 * components have an all-zeros init state except for the legacy
413 	 * ones (FP and SSE). Those can be saved with FXSAVE into the
414 	 * legacy area. Adding new features requires ensuring that their
415 	 * init state is all zeroes, or adding the necessary handling
416 	 * here if it is not.
417 	 */
418 	fxsave(&init_fpstate.regs.fxsave);
419 }
420 
421 int xfeature_size(int xfeature_nr)
422 {
423 	u32 eax, ebx, ecx, edx;
424 
425 	CHECK_XFEATURE(xfeature_nr);
426 	cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
427 	return eax;
428 }
429 
430 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
431 static int validate_user_xstate_header(const struct xstate_header *hdr,
432 				       struct fpstate *fpstate)
433 {
434 	/* No unknown or supervisor features may be set */
435 	if (hdr->xfeatures & ~fpstate->user_xfeatures)
436 		return -EINVAL;
437 
438 	/* Userspace must use the uncompacted format */
439 	if (hdr->xcomp_bv)
440 		return -EINVAL;
441 
442 	/*
443 	 * If 'reserved' is shrunken to add a new field, make sure to validate
444 	 * that new field here!
445 	 */
446 	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
447 
448 	/* No reserved bits may be set */
449 	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
450 		return -EINVAL;
451 
452 	return 0;
453 }
454 
455 static void __init __xstate_dump_leaves(void)
456 {
457 	int i;
458 	u32 eax, ebx, ecx, edx;
459 	static int should_dump = 1;
460 
461 	if (!should_dump)
462 		return;
463 	should_dump = 0;
464 	/*
465 	 * Dump out a few leaves past the ones that we support
466 	 * just in case there are some goodies up there
467 	 */
468 	for (i = 0; i < XFEATURE_MAX + 10; i++) {
469 		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
470 		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
471 			CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
472 	}
473 }
474 
475 #define XSTATE_WARN_ON(x, fmt, ...) do {					\
476 	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
477 		__xstate_dump_leaves();						\
478 	}									\
479 } while (0)
480 
481 #define XCHECK_SZ(sz, nr, __struct) ({					\
482 	if (WARN_ONCE(sz != sizeof(__struct),				\
483 	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
484 	    xfeature_names[nr], sizeof(__struct), sz)) {		\
485 		__xstate_dump_leaves();					\
486 	}								\
487 	true;								\
488 })
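
/*
 * Note: XCHECK_SZ evaluates to true even when the sizes mismatch; it
 * warns and dumps the CPUID leaves rather than failing the caller's
 * check directly.
 */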
489 
490 
491 /**
492  * check_xtile_data_against_struct - Check tile data state size.
493  *
494  * Calculate the state size by multiplying the single tile size, which
495  * is recorded in a C struct, by the number of tiles that the CPU
496  * reports. Compare the provided size with the calculation.
497  *
498  * @size:	The tile data state size
499  *
500  * Returns:	0 on success, -EINVAL on mismatch.
501  */
502 static int __init check_xtile_data_against_struct(int size)
503 {
504 	u32 max_palid, palid, state_size;
505 	u32 eax, ebx, ecx, edx;
506 	u16 max_tile;
507 
508 	/*
509 	 * Check the maximum palette id:
510 	 *   eax: the highest numbered palette subleaf.
511 	 */
512 	cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
513 
514 	/*
515 	 * Cross-check each tile size and find the maximum number of
516 	 * supported tiles.
517 	 */
518 	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
519 		u16 tile_size, max;
520 
521 		/*
522 		 * Check the tile size info:
523 		 *   eax[31:16]:  bytes per tile
524 		 *   ebx[31:16]:  the max names (or max number of tiles)
525 		 */
526 		cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &ecx, &edx);
527 		tile_size = eax >> 16;
528 		max = ebx >> 16;
529 
530 		if (tile_size != sizeof(struct xtile_data)) {
531 			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
532 			       __stringify(XFEATURE_XTILE_DATA),
533 			       sizeof(struct xtile_data), tile_size);
534 			__xstate_dump_leaves();
535 			return -EINVAL;
536 		}
537 
538 		if (max > max_tile)
539 			max_tile = max;
540 	}
541 
542 	state_size = sizeof(struct xtile_data) * max_tile;
543 	if (size != state_size) {
544 		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
545 		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
546 		__xstate_dump_leaves();
547 		return -EINVAL;
548 	}
549 	return 0;
550 }
551 
552 /*
553  * We have a C struct for each 'xstate'.  We need to ensure
554  * that our software representation matches what the CPU
555  * tells us about the state's size.
556  */
557 static bool __init check_xstate_against_struct(int nr)
558 {
559 	/*
560 	 * Ask the CPU for the size of the state.
561 	 */
562 	int sz = xfeature_size(nr);
563 
564 	/*
565 	 * Match each CPU state with the corresponding software
566 	 * structure.
567 	 */
568 	switch (nr) {
569 	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
570 	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
571 	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
572 	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
573 	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
574 	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
575 	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
576 	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
577 	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
578 	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
579 	case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state);
580 	case XFEATURE_APX:        return XCHECK_SZ(sz, nr, struct apx_state);
581 	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
582 	default:
583 		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
584 		return false;
585 	}
586 
587 	return true;
588 }
589 
590 static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
591 {
592 	unsigned int topmost = fls64(xfeatures) -  1;
593 	unsigned int offset, i;
594 
595 	if (topmost <= XFEATURE_SSE)
596 		return sizeof(struct xregs_state);
597 
598 	if (compacted) {
599 		offset = xfeature_get_offset(xfeatures, topmost);
600 	} else {
601 		/* Walk through the xfeature order to pick the last */
602 		for_each_extended_xfeature_in_order(i, xfeatures)
603 			topmost = xfeature_uncompact_order[i];
604 		offset = xstate_offsets[topmost];
605 	}
606 
607 	return offset + xstate_sizes[topmost];
608 }
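
/*
 * Illustrative example, continuing the compacted-layout numbers above:
 * if the topmost enabled feature is XTILE_DATA at compacted offset 832
 * with a size of 8192 bytes, the result is 832 + 8192 = 9024 bytes.
 */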
609 
610 /*
611  * This essentially double-checks what the CPU told us about
612  * how large the XSAVE buffer needs to be.  We are recalculating
613  * it to be safe.
614  *
615  * Independent XSAVE features allocate their own buffers and are not
616  * covered by these checks. Only the size of the buffer for task->fpu
617  * is checked here.
618  */
619 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
620 {
621 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
622 	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
623 	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
624 	int i;
625 
626 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
627 		if (!check_xstate_against_struct(i))
628 			return false;
629 		/*
630 		 * Supervisor state components can be managed only by
631 		 * XSAVES.
632 		 */
633 		if (!xsaves && xfeature_is_supervisor(i)) {
634 			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
635 			return false;
636 		}
637 	}
638 	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
639 	XSTATE_WARN_ON(size != kernel_size,
640 		       "size %u != kernel_size %u\n", size, kernel_size);
641 	return size == kernel_size;
642 }
643 
644 /*
645  * Get total size of enabled xstates in XCR0 | IA32_XSS.
646  *
647  * Note the SDM's wording here.  "sub-function 0" only enumerates
648  * the size of the *user* states.  If we use it to size a buffer
649  * that we use 'XSAVES' on, we could potentially overflow the
650  * buffer because 'XSAVES' saves system states too.
651  *
652  * This also takes compaction into account. So this works for
653  * XSAVEC as well.
654  */
655 static unsigned int __init get_compacted_size(void)
656 {
657 	unsigned int eax, ebx, ecx, edx;
658 	/*
659 	 * - CPUID function 0DH, sub-function 1:
660 	 *    EBX enumerates the size (in bytes) required by
661 	 *    the XSAVES instruction for an XSAVE area
662 	 *    containing all the state components
663 	 *    corresponding to bits currently set in
664 	 *    XCR0 | IA32_XSS.
665 	 *
666 	 * When XSAVES is not available but XSAVEC is (virt), then there
667 	 * are no supervisor states, but XSAVEC still uses compacted
668 	 * format.
669 	 */
670 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
671 	return ebx;
672 }
673 
674 /*
675  * Get the total size of the enabled xstates without the independent supervisor
676  * features.
677  */
678 static unsigned int __init get_xsave_compacted_size(void)
679 {
680 	u64 mask = xfeatures_mask_independent();
681 	unsigned int size;
682 
683 	if (!mask)
684 		return get_compacted_size();
685 
686 	/* Disable independent features. */
687 	wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor());
688 
689 	/*
690 	 * Ask the hardware what size is required of the buffer.
691 	 * This is the size required for the task->fpu buffer.
692 	 */
693 	size = get_compacted_size();
694 
695 	/* Re-enable independent features so XSAVES will work on them again. */
696 	wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
697 
698 	return size;
699 }
700 
701 static unsigned int __init get_xsave_size_user(void)
702 {
703 	unsigned int eax, ebx, ecx, edx;
704 	/*
705 	 * - CPUID function 0DH, sub-function 0:
706 	 *    EBX enumerates the size (in bytes) required by
707 	 *    the XSAVE instruction for an XSAVE area
708 	 *    containing all the *user* state components
709 	 *    corresponding to bits currently set in XCR0.
710 	 */
711 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
712 	return ebx;
713 }
714 
715 static int __init init_xstate_size(void)
716 {
717 	/* Recompute the context size for enabled features: */
718 	unsigned int user_size, kernel_size, kernel_default_size;
719 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
720 
721 	/* Uncompacted user space size */
722 	user_size = get_xsave_size_user();
723 
724 	/*
725 	 * XSAVES kernel size includes supervisor states and uses compacted
726 	 * format. XSAVEC uses compacted format, but does not save
727 	 * supervisor states.
728 	 *
729 	 * XSAVE[OPT] do not support supervisor states so kernel and user
730 	 * size is identical.
731 	 */
732 	if (compacted)
733 		kernel_size = get_xsave_compacted_size();
734 	else
735 		kernel_size = user_size;
736 
737 	kernel_default_size =
738 		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
739 
740 	if (!paranoid_xstate_size_valid(kernel_size))
741 		return -EINVAL;
742 
743 	fpu_kernel_cfg.max_size = kernel_size;
744 	fpu_user_cfg.max_size = user_size;
745 
746 	fpu_kernel_cfg.default_size = kernel_default_size;
747 	fpu_user_cfg.default_size =
748 		xstate_calculate_size(fpu_user_cfg.default_features, false);
749 
750 	guest_default_cfg.size =
751 		xstate_calculate_size(guest_default_cfg.features, compacted);
752 
753 	return 0;
754 }
755 
756 /*
757  * We enabled the XSAVE hardware, but something went wrong and
758  * we can not use it.  Disable it.
759  */
760 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
761 {
762 	pr_info("x86/fpu: XSAVE disabled\n");
763 
764 	fpu_kernel_cfg.max_features = 0;
765 	cr4_clear_bits(X86_CR4_OSXSAVE);
766 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
767 
768 	/* Restore the legacy size. */
769 	fpu_kernel_cfg.max_size = legacy_size;
770 	fpu_kernel_cfg.default_size = legacy_size;
771 	fpu_user_cfg.max_size = legacy_size;
772 	fpu_user_cfg.default_size = legacy_size;
773 	guest_default_cfg.size = legacy_size;
774 
775 	/*
776 	 * Prevent enabling the static branch which enables writes to the
777 	 * XFD MSR.
778 	 */
779 	init_fpstate.xfd = 0;
780 
781 	fpstate_reset(x86_task_fpu(current));
782 }
783 
784 static u64 __init host_default_mask(void)
785 {
786 	/*
787 	 * Exclude dynamic features (require userspace opt-in) and features
788 	 * that are supported only for KVM guests.
789 	 */
790 	return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR);
791 }
792 
793 static u64 __init guest_default_mask(void)
794 {
795 	/*
796 	 * Exclude dynamic features, which require userspace opt-in even
797 	 * for KVM guests.
798 	 */
799 	return ~(u64)XFEATURE_MASK_USER_DYNAMIC;
800 }
801 
802 /*
803  * Enable and initialize the xsave feature.
804  * Called once per system bootup.
805  */
806 void __init fpu__init_system_xstate(unsigned int legacy_size)
807 {
808 	unsigned int eax, ebx, ecx, edx;
809 	u64 xfeatures;
810 	int err;
811 	int i;
812 
813 	if (!boot_cpu_has(X86_FEATURE_FPU)) {
814 		pr_info("x86/fpu: No FPU detected\n");
815 		return;
816 	}
817 
818 	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
819 		pr_info("x86/fpu: x87 FPU will use %s\n",
820 			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
821 		return;
822 	}
823 
824 	/*
825 	 * Find user xstates supported by the processor.
826 	 */
827 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
828 	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
829 
830 	/*
831 	 * Find supervisor xstates supported by the processor.
832 	 */
833 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
834 	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
835 
836 	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
837 		/*
838 		 * This indicates that something really unexpected happened
839 		 * with the enumeration.  Disable XSAVE and try to continue
840 		 * booting without it.  This is too early to BUG().
841 		 */
842 		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
843 		       fpu_kernel_cfg.max_features);
844 		goto out_disable;
845 	}
846 
847 	if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX &&
848 	    fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) {
849 		/*
850 		 * This is a problematic CPU configuration where two
851 		 * conflicting state components are both enumerated.
852 		 */
853 		pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n",
854 		       fpu_kernel_cfg.max_features);
855 		goto out_disable;
856 	}
857 
858 	fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
859 					      XFEATURE_MASK_INDEPENDENT;
860 
861 	/*
862 	 * Clear XSAVE features that are disabled in the normal CPUID.
863 	 */
864 	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
865 		unsigned short cid = xsave_cpuid_features[i];
866 
867 		/* Careful: X86_FEATURE_FPU is 0! */
868 		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
869 			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
870 	}
871 
872 	if (!cpu_feature_enabled(X86_FEATURE_XFD))
873 		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
874 
875 	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
876 		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
877 	else
878 		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
879 					XFEATURE_MASK_SUPERVISOR_SUPPORTED;
880 
881 	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
882 	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
883 
884 	/*
885 	 * Now, given maximum feature set, determine default values by
886 	 * applying default masks.
887 	 */
888 	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask();
889 	fpu_user_cfg.default_features   = fpu_user_cfg.max_features & host_default_mask();
890 	guest_default_cfg.features      = fpu_kernel_cfg.max_features & guest_default_mask();
891 
892 	/* Store it for paranoia check at the end */
893 	xfeatures = fpu_kernel_cfg.max_features;
894 
895 	/*
896 	 * Initialize the default XFD state in init_fpstate and enable the
897 	 * dynamic sizing mechanism if dynamic states are available.  The
898 	 * static key cannot be enabled here because this runs before
899 	 * jump_label_init(). This is delayed to an initcall.
900 	 */
901 	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
902 
903 	/* Set up compaction feature bit */
904 	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
905 	    cpu_feature_enabled(X86_FEATURE_XSAVES))
906 		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
907 
908 	/* Enable xstate instructions to be able to continue with initialization: */
909 	fpu__init_cpu_xstate();
910 
911 	/* Cache size, offset and flags for initialization */
912 	setup_xstate_cache();
913 
914 	err = init_xstate_size();
915 	if (err)
916 		goto out_disable;
917 
918 	/*
919 	 * Update info used for ptrace frames; use standard-format size and no
920 	 * supervisor xstates:
921 	 */
922 	update_regset_xstate_info(fpu_user_cfg.max_size,
923 				  fpu_user_cfg.max_features);
924 
925 	/*
926 	 * init_fpstate excludes dynamic states as they are large but init
927 	 * state is zero.
928 	 */
929 	init_fpstate.size		= fpu_kernel_cfg.default_size;
930 	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;
931 
932 	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
933 		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n",
934 			sizeof(init_fpstate.regs), init_fpstate.size);
935 		goto out_disable;
936 	}
937 
938 	setup_init_fpu_buf();
939 
940 	/*
941 	 * Paranoia check whether something in the setup modified the
942 	 * xfeatures mask.
943 	 */
944 	if (xfeatures != fpu_kernel_cfg.max_features) {
945 		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n",
946 		       xfeatures, fpu_kernel_cfg.max_features);
947 		goto out_disable;
948 	}
949 
950 	/*
951 	 * CPU capabilities initialization runs before FPU init. So
952 	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
953 	 * functional, set the feature bit so depending code works.
954 	 */
955 	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
956 
957 	print_xstate_offset_size();
958 	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
959 		fpu_kernel_cfg.max_features,
960 		fpu_kernel_cfg.max_size,
961 		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
962 	return;
963 
964 out_disable:
965 	/* something went wrong, try to boot without any XSAVE support */
966 	fpu__init_disable_system_xstate(legacy_size);
967 }
968 
969 /*
970  * Restore minimal FPU state after suspend:
971  */
972 void fpu__resume_cpu(void)
973 {
974 	/*
975 	 * Restore XCR0 on xsave capable CPUs:
976 	 */
977 	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
978 		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
979 
980 	/*
981 	 * Restore IA32_XSS. The same CPUID bit enumerates support
982 	 * of XSAVES and MSR_IA32_XSS.
983 	 */
984 	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
985 		wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
986 				     xfeatures_mask_independent());
987 	}
988 
989 	if (fpu_state_size_dynamic())
990 		wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd);
991 }
992 
993 /*
994  * Given an xstate feature nr, calculate where in the xsave
995  * buffer the state is.  Callers should ensure that the buffer
996  * is valid.
997  */
998 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
999 {
1000 	u64 xcomp_bv = xsave->header.xcomp_bv;
1001 
1002 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1003 		return NULL;
1004 
1005 	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
1006 		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
1007 			return NULL;
1008 	}
1009 
1010 	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
1011 }
1012 
1013 /*
1014  * Given the xsave area and a state inside, this function returns the
1015  * address of the state.
1016  *
1017  * This is the API that is called to get the address of a given xstate
1018  * in either the standard or the compacted format of the xsave area.
1019  *
1020  * Note that if there is no data for the field in the xsave buffer
1021  * this will return NULL.
1022  *
1023  * Inputs:
1024  *	xstate: the thread's storage area for all FPU data
1025  *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
1026  *	XFEATURE_SSE, etc...)
1027  * Output:
1028  *	address of the state in the xsave area, or NULL if the
1029  *	field is not present in the xsave buffer.
1030  */
1031 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
1032 {
1033 	/*
1034 	 * Do we even *have* xsave state?
1035 	 */
1036 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
1037 		return NULL;
1038 
1039 	/*
1040 	 * We should not ever be requesting features that we
1041 	 * have not enabled.
1042 	 */
1043 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1044 		return NULL;
1045 
1046 	/*
1047 	 * This assumes that the last 'xsave*' instruction
1048 	 * requested that 'xfeature_nr' be saved.
1049 	 * If it did not, we might be seeing an old value
1050 	 * of the field in the buffer.
1051 	 *
1052 	 * This can happen because the last 'xsave' did not
1053 	 * request that this feature be saved (unlikely)
1054 	 * or because the "init optimization" caused it
1055 	 * to not be saved.
1056 	 */
1057 	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
1058 		return NULL;
1059 
1060 	return __raw_xsave_addr(xsave, xfeature_nr);
1061 }
1062 EXPORT_SYMBOL_FOR_KVM(get_xsave_addr);
1063 
1064 /*
1065  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
1066  * The xsave buffer should be in standard format, not compacted (e.g. user mode
1067  * signal frames).
1068  */
1069 void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
1070 {
1071 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1072 		return NULL;
1073 
1074 	return (void __user *)xsave + xstate_offsets[xfeature_nr];
1075 }
1076 
1077 #ifdef CONFIG_ARCH_HAS_PKEYS
1078 
1079 /*
1080  * This will go out and modify PKRU register to set the access
1081  * rights for @pkey to @init_val.
1082  */
1083 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1084 			      unsigned long init_val)
1085 {
1086 	u32 old_pkru, new_pkru_bits = 0;
1087 	int pkey_shift;
1088 
1089 	/*
1090 	 * This check implies XSAVE support.  OSPKE only gets
1091 	 * set if we enable XSAVE and we enable PKU in XCR0.
1092 	 */
1093 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1094 		return -EINVAL;
1095 
1096 	/*
1097 	 * This code should only be called with valid 'pkey'
1098 	 * values originating from in-kernel users.  Complain
1099 	 * if a bad value is observed.
1100 	 */
1101 	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1102 		return -EINVAL;
1103 
1104 	/* Set the bits we need in PKRU:  */
1105 	if (init_val & PKEY_DISABLE_ACCESS)
1106 		new_pkru_bits |= PKRU_AD_BIT;
1107 	if (init_val & PKEY_DISABLE_WRITE)
1108 		new_pkru_bits |= PKRU_WD_BIT;
1109 
1110 	/* Shift the bits in to the correct place in PKRU for pkey: */
1111 	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1112 	new_pkru_bits <<= pkey_shift;
1113 
1114 	/* Get old PKRU and mask off any old bits in place: */
1115 	old_pkru = read_pkru();
1116 	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1117 
1118 	/* Write old part along with new part: */
1119 	write_pkru(old_pkru | new_pkru_bits);
1120 
1121 	return 0;
1122 }
1123 #endif /* CONFIG_ARCH_HAS_PKEYS */
1124 
1125 static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1126 			 void *init_xstate, unsigned int size)
1127 {
1128 	membuf_write(to, from_xstate ? xstate : init_xstate, size);
1129 }
1130 
1131 /**
1132  * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1133  * @to:		membuf descriptor
1134  * @fpstate:	The fpstate buffer from which to copy
1135  * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
1136  * @pkru_val:	The PKRU value to store in the PKRU component
1137  * @copy_mode:	The requested copy mode
1138  *
1139  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1140  * format, i.e. from the kernel internal hardware dependent storage format
1141  * to the requested @mode. UABI XSTATE is always uncompacted!
1142  *
1143  * It supports partial copy but @to.pos always starts from zero.
1144  */
1145 void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1146 			       u64 xfeatures, u32 pkru_val,
1147 			       enum xstate_copy_mode copy_mode)
1148 {
1149 	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1150 	struct xregs_state *xinit = &init_fpstate.regs.xsave;
1151 	struct xregs_state *xsave = &fpstate->regs.xsave;
1152 	unsigned int zerofrom, i, xfeature;
1153 	struct xstate_header header;
1154 	u64 mask;
1155 
1156 	memset(&header, 0, sizeof(header));
1157 	header.xfeatures = xsave->header.xfeatures;
1158 
1159 	/* Mask out the feature bits depending on copy mode */
1160 	switch (copy_mode) {
1161 	case XSTATE_COPY_FP:
1162 		header.xfeatures &= XFEATURE_MASK_FP;
1163 		break;
1164 
1165 	case XSTATE_COPY_FX:
1166 		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1167 		break;
1168 
1169 	case XSTATE_COPY_XSAVE:
1170 		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1171 		break;
1172 	}
1173 
1174 	/* Copy FP state up to MXCSR */
1175 	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1176 		     &xinit->i387, off_mxcsr);
1177 
1178 	/* Copy MXCSR when SSE or YMM are set in the feature mask */
1179 	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1180 		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1181 		     MXCSR_AND_FLAGS_SIZE);
1182 
1183 	/* Copy the remaining FP state */
1184 	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1185 		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
1186 		     sizeof(xsave->i387.st_space));
1187 
1188 	/* Copy the SSE state - shared with YMM, but independently managed */
1189 	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1190 		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1191 		     sizeof(xsave->i387.xmm_space));
1192 
1193 	if (copy_mode != XSTATE_COPY_XSAVE)
1194 		goto out;
1195 
1196 	/* Zero the padding area */
1197 	membuf_zero(&to, sizeof(xsave->i387.padding));
1198 
1199 	/* Copy xsave->i387.sw_reserved */
1200 	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1201 
1202 	/* Copy the user space relevant state of @xsave->header */
1203 	membuf_write(&to, &header, sizeof(header));
1204 
1205 	zerofrom = offsetof(struct xregs_state, extended_state_area);
1206 
1207 	/*
1208 	 * This 'mask' indicates which states to copy from fpstate.
1209 	 * Those extended states that are not present in fpstate are
1210 	 * either disabled or initialized:
1211 	 *
1212 	 * In non-compacted format, disabled features still occupy
1213 	 * state space but there is no state to copy from in the
1214 	 * compacted init_fpstate. The gap tracking will zero these
1215 	 * states.
1216 	 *
1217 	 * The extended features have an all zeroes init state. Thus,
1218 	 * remove them from 'mask' to zero those features in the user
1219 	 * buffer instead of retrieving them from init_fpstate.
1220 	 */
1221 	mask = header.xfeatures;
1222 
1223 	for_each_extended_xfeature_in_order(i, mask) {
1224 		xfeature = xfeature_uncompact_order[i];
1225 		/*
1226 		 * If there was a feature or alignment gap, zero the space
1227 		 * in the destination buffer.
1228 		 */
1229 		if (zerofrom < xstate_offsets[xfeature])
1230 			membuf_zero(&to, xstate_offsets[xfeature] - zerofrom);
1231 
1232 		if (xfeature == XFEATURE_PKRU) {
1233 			struct pkru_state pkru = {0};
1234 			/*
1235 			 * PKRU is not necessarily up to date in the
1236 			 * XSAVE buffer. Use the provided value.
1237 			 */
1238 			pkru.pkru = pkru_val;
1239 			membuf_write(&to, &pkru, sizeof(pkru));
1240 		} else {
1241 			membuf_write(&to,
1242 				     __raw_xsave_addr(xsave, xfeature),
1243 				     xstate_sizes[xfeature]);
1244 		}
1245 		/*
1246 		 * Keep track of the last copied state in the non-compacted
1247 		 * target buffer for gap zeroing.
1248 		 */
1249 		zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature];
1250 	}
1251 
1252 out:
1253 	if (to.left)
1254 		membuf_zero(&to, to.left);
1255 }
1256 
1257 /**
1258  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1259  * @to:		membuf descriptor
1260  * @tsk:	The task from which to copy the saved xstate
1261  * @copy_mode:	The requested copy mode
1262  *
1263  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1264  * format, i.e. from the kernel internal hardware dependent storage format
1265  * to the requested @mode. UABI XSTATE is always uncompacted!
1266  *
1267  * It supports partial copy but @to.pos always starts from zero.
1268  */
1269 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1270 			     enum xstate_copy_mode copy_mode)
1271 {
1272 	__copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate,
1273 				  x86_task_fpu(tsk)->fpstate->user_xfeatures,
1274 				  tsk->thread.pkru, copy_mode);
1275 }
1276 
1277 static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1278 			    const void *kbuf, const void __user *ubuf)
1279 {
1280 	if (kbuf) {
1281 		memcpy(dst, kbuf + offset, size);
1282 	} else {
1283 		if (copy_from_user(dst, ubuf + offset, size))
1284 			return -EFAULT;
1285 	}
1286 	return 0;
1287 }
1288 
1289 
1290 /**
1291  * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1292  * @fpstate:	The fpstate buffer to copy to
1293  * @kbuf:	The UABI format buffer, if it comes from the kernel
1294  * @ubuf:	The UABI format buffer, if it comes from userspace
1295  * @pkru:	The location to write the PKRU value to
1296  *
1297  * Converts from the UABI format into the kernel internal hardware
1298  * dependent format.
1299  *
1300  * This function ultimately has three different callers with distinct PKRU
1301  * behavior.
1302  * 1.	When called from sigreturn the PKRU register will be restored from
1303  *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1304  *	@fpstate is sufficient to cover this case, but the caller will also
1305  *	pass a pointer to the thread_struct's pkru field in @pkru and updating
1306  *	it is harmless.
1307  * 2.	When called from ptrace the PKRU register will be restored from the
1308  *	thread_struct's pkru field. A pointer to that is passed in @pkru.
1309  *	The kernel will restore it manually, so the XRSTOR behavior that resets
1310  *	the PKRU register to the hardware init value (0) if the corresponding
1311  *	xfeatures bit is not set is emulated here.
1312  * 3.	When called from KVM the PKRU register will be restored from the vcpu's
1313  *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1314  *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
1315  *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1316  *	bit is not set.
1317  */
1318 static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1319 			       const void __user *ubuf, u32 *pkru)
1320 {
1321 	struct xregs_state *xsave = &fpstate->regs.xsave;
1322 	unsigned int offset, size;
1323 	struct xstate_header hdr;
1324 	u64 mask;
1325 	int i;
1326 
1327 	offset = offsetof(struct xregs_state, header);
1328 	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1329 		return -EFAULT;
1330 
1331 	if (validate_user_xstate_header(&hdr, fpstate))
1332 		return -EINVAL;
1333 
1334 	/* Validate MXCSR when any of the related features is in use */
1335 	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1336 	if (hdr.xfeatures & mask) {
1337 		u32 mxcsr[2];
1338 
1339 		offset = offsetof(struct fxregs_state, mxcsr);
1340 		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1341 			return -EFAULT;
1342 
1343 		/* Reserved bits in MXCSR must be zero. */
1344 		if (mxcsr[0] & ~mxcsr_feature_mask)
1345 			return -EINVAL;
1346 
1347 		/* SSE and YMM require MXCSR even when FP is not in use. */
1348 		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1349 			xsave->i387.mxcsr = mxcsr[0];
1350 			xsave->i387.mxcsr_mask = mxcsr[1];
1351 		}
1352 	}
1353 
1354 	for (i = 0; i < XFEATURE_MAX; i++) {
1355 		mask = BIT_ULL(i);
1356 
1357 		if (hdr.xfeatures & mask) {
1358 			void *dst = __raw_xsave_addr(xsave, i);
1359 
1360 			offset = xstate_offsets[i];
1361 			size = xstate_sizes[i];
1362 
1363 			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1364 				return -EFAULT;
1365 		}
1366 	}
1367 
1368 	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1369 		struct pkru_state *xpkru;
1370 
1371 		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1372 		*pkru = xpkru->pkru;
1373 	} else {
1374 		/*
1375 		 * KVM may pass NULL here to indicate that it does not need
1376 		 * PKRU updated.
1377 		 */
1378 		if (pkru)
1379 			*pkru = 0;
1380 	}
1381 
1382 	/*
1383 	 * The state that came in from userspace was user-state only.
1384 	 * Mask all the user states out of 'xfeatures':
1385 	 */
1386 	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1387 
1388 	/*
1389 	 * Add back in the features that came in from userspace:
1390 	 */
1391 	xsave->header.xfeatures |= hdr.xfeatures;
1392 
1393 	return 0;
1394 }
1395 
1396 /*
1397  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1398  * format and copy to the target thread. Used by ptrace and KVM.
1399  */
1400 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1401 {
1402 	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1403 }
1404 
1405 /*
1406  * Convert from a sigreturn standard-format user-space buffer to kernel
1407  * XSAVE[S] format and copy to the target thread. This is called from the
1408  * sigreturn() and rt_sigreturn() system calls.
1409  */
1410 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1411 				      const void __user *ubuf)
1412 {
1413 	return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru);
1414 }
1415 
1416 static bool validate_independent_components(u64 mask)
1417 {
1418 	u64 xchk;
1419 
1420 	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1421 		return false;
1422 
1423 	xchk = ~xfeatures_mask_independent();
1424 
1425 	if (WARN_ON_ONCE(!mask || mask & xchk))
1426 		return false;
1427 
1428 	return true;
1429 }
1430 
1431 /**
1432  * xsaves - Save selected components to a kernel xstate buffer
1433  * @xstate:	Pointer to the buffer
1434  * @mask:	Feature mask to select the components to save
1435  *
1436  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1437  * XSAVES does not write the full xstate header. Before first use the
1438  * buffer should be zeroed; otherwise a subsequent XRSTORS from that
1439  * buffer can #GP.
1440  *
1441  * The feature mask must be a subset of the independent features.
1442  */
1443 void xsaves(struct xregs_state *xstate, u64 mask)
1444 {
1445 	int err;
1446 
1447 	if (!validate_independent_components(mask))
1448 		return;
1449 
1450 	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1451 	WARN_ON_ONCE(err);
1452 }
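
/*
 * Illustrative use, modelled on the perf subsystem which manages the
 * independent architectural LBR state ('lbr_buf' is a hypothetical
 * 64-byte-aligned, zeroed buffer):
 *
 *	xsaves(&lbr_buf->xsave, XFEATURE_MASK_LBR);
 */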
1453 
1454 /**
1455  * xrstors - Restore selected components from a kernel xstate buffer
1456  * @xstate:	Pointer to the buffer
1457  * @mask:	Feature mask to select the components to restore
1458  *
1459  * The @xstate buffer must be 64 byte aligned and correctly initialized;
1460  * otherwise XRSTORS from that buffer can #GP.
1461  *
1462  * Proper usage is to restore the state which was saved with
1463  * xsaves() into @xstate.
1464  *
1465  * The feature mask must be a subset of the independent features.
1466  */
1467 void xrstors(struct xregs_state *xstate, u64 mask)
1468 {
1469 	int err;
1470 
1471 	if (!validate_independent_components(mask))
1472 		return;
1473 
1474 	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1475 	WARN_ON_ONCE(err);
1476 }
1477 
1478 #if IS_ENABLED(CONFIG_KVM)
1479 void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature)
1480 {
1481 	void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature);
1482 
1483 	if (addr)
1484 		memset(addr, 0, xstate_sizes[xfeature]);
1485 }
1486 EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component);
1487 #endif
1488 
1489 #ifdef CONFIG_X86_64
1490 
1491 #ifdef CONFIG_X86_DEBUG_FPU
1492 /*
1493  * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1494  * can safely operate on the @fpstate buffer.
1495  */
1496 static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1497 {
1498 	u64 xfd = __this_cpu_read(xfd_state);
1499 
1500 	if (fpstate->xfd == xfd)
1501 		return true;
1502 
1503 	/*
1504 	 * The XFD MSR does not match fpstate->xfd. That's invalid when
1505 	 * the passed in fpstate is current's fpstate.
1506 	 */
1507 	if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd)
1508 		return false;
1509 
1510 	/*
1511 	 * XRSTOR(S) from init_fpstate are always correct as it will just
1512 	 * bring all components into init state and not read from the
1513 	 * buffer. XSAVE(S) raises #PF after init.
1514 	 */
1515 	if (fpstate == &init_fpstate)
1516 		return rstor;
1517 
1518 	/*
1519 	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1520 	 * XRSTORS(S): fpu_swap_kvm_fpstate()
1521 	 */
1522 
1523 	/*
1524 	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1525 	 * the buffer area for XFD-disabled state components.
1526 	 */
1527 	mask &= ~xfd;
1528 
1529 	/*
1530 	 * Remove features which are valid in fpstate. They
1531 	 * have space allocated in fpstate.
1532 	 */
1533 	mask &= ~fpstate->xfeatures;
1534 
1535 	/*
1536 	 * Any remaining state components in 'mask' might be written
1537 	 * by XSAVE/XRSTOR. Fail validation if any are found.
1538 	 */
1539 	return !mask;
1540 }
1541 
1542 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1543 {
1544 	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1545 }
1546 #endif /* CONFIG_X86_DEBUG_FPU */
1547 
1548 static int __init xfd_update_static_branch(void)
1549 {
1550 	/*
1551 	 * If init_fpstate.xfd has bits set then dynamic features are
1552 	 * available and the dynamic sizing must be enabled.
1553 	 */
1554 	if (init_fpstate.xfd)
1555 		static_branch_enable(&__fpu_state_size_dynamic);
1556 	return 0;
1557 }
1558 arch_initcall(xfd_update_static_branch)
1559 
1560 void fpstate_free(struct fpu *fpu)
1561 {
1562 	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1563 		vfree(fpu->fpstate);
1564 }
1565 
1566 /**
1567  * fpstate_realloc - Reallocate struct fpstate for the requested new features
1568  *
1569  * @xfeatures:	A bitmap of xstate features which extend the enabled features
1570  *		of that task
1571  * @ksize:	The required size for the kernel buffer
1572  * @usize:	The required size for user space buffers
1573  * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
1574  *
1575  * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1576  * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1577  * with large states are likely to live longer.
1578  *
1579  * Returns: 0 on success, -ENOMEM on allocation error.
1580  */
1581 static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1582 			   unsigned int usize, struct fpu_guest *guest_fpu)
1583 {
1584 	struct fpu *fpu = x86_task_fpu(current);
1585 	struct fpstate *curfps, *newfps = NULL;
1586 	unsigned int fpsize;
1587 	bool in_use;
1588 
1589 	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1590 
1591 	newfps = vzalloc(fpsize);
1592 	if (!newfps)
1593 		return -ENOMEM;
1594 	newfps->size = ksize;
1595 	newfps->user_size = usize;
1596 	newfps->is_valloc = true;
1597 
1598 	/*
1599 	 * When a guest FPU is supplied, use @guest_fpu->fpstate
1600 	 * as the reference, independent of whether it is in use or not.
1601 	 */
1602 	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1603 
1604 	/* Determine whether @curfps is the active fpstate */
1605 	in_use = fpu->fpstate == curfps;
1606 
1607 	if (guest_fpu) {
1608 		newfps->is_guest = true;
1609 		newfps->is_confidential = curfps->is_confidential;
1610 		newfps->in_use = curfps->in_use;
1611 		guest_fpu->xfeatures |= xfeatures;
1612 		guest_fpu->uabi_size = usize;
1613 	}
1614 
1615 	fpregs_lock();
1616 	/*
1617 	 * If @curfps is in use, ensure that the current state is in the
1618 	 * registers before swapping fpstate as that might invalidate it
1619 	 * due to layout changes.
1620 	 */
1621 	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1622 		fpregs_restore_userregs();
1623 
1624 	newfps->xfeatures = curfps->xfeatures | xfeatures;
1625 	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1626 	newfps->xfd = curfps->xfd & ~xfeatures;
1627 
1628 	/* Do the final updates within the locked region */
1629 	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1630 
1631 	if (guest_fpu) {
1632 		guest_fpu->fpstate = newfps;
1633 		/* If curfps is active, update the FPU fpstate pointer */
1634 		if (in_use)
1635 			fpu->fpstate = newfps;
1636 	} else {
1637 		fpu->fpstate = newfps;
1638 	}
1639 
1640 	if (in_use)
1641 		xfd_update_state(fpu->fpstate);
1642 	fpregs_unlock();
1643 
1644 	/* Only free valloc'ed state */
1645 	if (curfps && curfps->is_valloc)
1646 		vfree(curfps);
1647 
1648 	return 0;
1649 }
1650 
1651 static int validate_sigaltstack(unsigned int usize)
1652 {
1653 	struct task_struct *thread, *leader = current->group_leader;
1654 	unsigned long framesize = get_sigframe_size();
1655 
1656 	lockdep_assert_held(&current->sighand->siglock);
1657 
1658 	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
1659 	framesize -= fpu_user_cfg.max_size;
1660 	framesize += usize;
1661 	for_each_thread(leader, thread) {
1662 		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1663 			return -ENOSPC;
1664 	}
1665 	return 0;
1666 }
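
/*
 * Rationale: enabling more xstate grows the signal frame, and a thread
 * whose configured sigaltstack is smaller than the new frame size
 * would overflow it, so the permission request is refused instead.
 */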
1667 
1668 static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1669 {
1670 	/*
1671 	 * This deliberately does not exclude !XSAVES as we still might
1672 	 * decide to optionally context switch XCR0 or talk the silicon
1673 	 * vendors into extending XFD for the pre AMX states, especially
1674 	 * AVX512.
1675 	 */
1676 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1677 	struct fpu *fpu = x86_task_fpu(current->group_leader);
1678 	struct fpu_state_perm *perm;
1679 	unsigned int ksize, usize;
1680 	u64 mask;
1681 	int ret = 0;
1682 
1683 	/* Check whether fully enabled */
1684 	if ((permitted & requested) == requested)
1685 		return 0;
1686 
1687 	/*
1688 	 * Calculate the resulting kernel state size.  Note, @permitted also
1689 	 * contains supervisor xfeatures even though supervisor xfeatures are
1690 	 * always permitted for kernel and guest FPUs, and never permitted
1691 	 * for user FPUs.
1692 	 */
1693 	mask = permitted | requested;
1694 	ksize = xstate_calculate_size(mask, compacted);
1695 
1696 	/*
1697 	 * Calculate the resulting user state size.  Take care not to clobber
1698 	 * the supervisor xfeatures in the new mask!
1699 	 */
1700 	usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false);
1701 
1702 	if (!guest) {
1703 		ret = validate_sigaltstack(usize);
1704 		if (ret)
1705 			return ret;
1706 	}
1707 
1708 	perm = guest ? &fpu->guest_perm : &fpu->perm;
1709 	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1710 	WRITE_ONCE(perm->__state_perm, mask);
1711 	/* Protected by sighand lock */
1712 	perm->__state_size = ksize;
1713 	perm->__user_state_size = usize;
1714 	return ret;
1715 }
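
/*
 * Illustrative example: adding AMX tile data (XFEATURE_XTILE_DATA, an
 * 8192 byte component on current hardware) to @mask grows ksize by that
 * amount in the (possibly compacted) kernel layout, while usize is
 * always computed from the uncompacted layout and covers only the
 * xfeatures in XFEATURE_MASK_USER_SUPPORTED.
 */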
1716 
1717 /*
1718  * Permission request masks for facilities which consist of more than one component
1719  */
1720 static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1721 	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1722 };
1723 
1724 static int xstate_request_perm(unsigned long idx, bool guest)
1725 {
1726 	u64 permitted, requested;
1727 	int ret;
1728 
1729 	if (idx >= XFEATURE_MAX)
1730 		return -EINVAL;
1731 
1732 	/*
1733 	 * Look up the facility mask which can require more than
1734 	 * one xstate component.
1735 	 */
1736 	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1737 	requested = xstate_prctl_req[idx];
1738 	if (!requested)
1739 		return -EOPNOTSUPP;
1740 
1741 	if ((fpu_user_cfg.max_features & requested) != requested)
1742 		return -EOPNOTSUPP;
1743 
1744 	/* Lockless quick check */
1745 	permitted = xstate_get_group_perm(guest);
1746 	if ((permitted & requested) == requested)
1747 		return 0;
1748 
1749 	/* Protect against concurrent modifications */
1750 	spin_lock_irq(&current->sighand->siglock);
1751 	permitted = xstate_get_group_perm(guest);
1752 
1753 	/* First vCPU allocation locks the permissions. */
1754 	if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1755 		ret = -EBUSY;
1756 	else
1757 		ret = __xstate_request_perm(permitted, requested, guest);
1758 	spin_unlock_irq(&current->sighand->siglock);
1759 	return ret;
1760 }
1761 
1762 int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1763 {
1764 	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1765 	struct fpu_state_perm *perm;
1766 	unsigned int ksize, usize;
1767 	struct fpu *fpu;
1768 
1769 	if (!xfd_event) {
1770 		if (!guest_fpu)
1771 			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1772 		return 0;
1773 	}
1774 
1775 	/* Protect against concurrent modifications */
1776 	spin_lock_irq(&current->sighand->siglock);
1777 
1778 	/* If not permitted, let it die */
1779 	if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1780 		spin_unlock_irq(&current->sighand->siglock);
1781 		return -EPERM;
1782 	}
1783 
1784 	fpu = x86_task_fpu(current->group_leader);
1785 	perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1786 	ksize = perm->__state_size;
1787 	usize = perm->__user_state_size;
1788 
1789 	/*
1790 	 * The feature is permitted and the state size is sufficient.  Dropping
1791 	 * the lock is safe here: even if more features are added from another
1792 	 * task, the retrieved buffer sizes remain valid for the currently
1793 	 * requested feature(s).
1794 	 */
1795 	spin_unlock_irq(&current->sighand->siglock);
1796 
1797 	/*
1798 	 * Try to allocate a new fpstate. If that fails there is no way
1799 	 * out.
1800 	 */
1801 	if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1802 		return -EFAULT;
1803 	return 0;
1804 }
1805 
1806 int xfd_enable_feature(u64 xfd_err)
1807 {
1808 	return __xfd_enable_feature(xfd_err, NULL);
1809 }
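
/*
 * Sketch of how this path is reached (modeled on the #NM exception
 * handler in arch/x86/kernel/traps.c; details may differ): touching a
 * disarmed dynamic feature raises #NM with the culprit recorded in
 * MSR_IA32_XFD_ERR, and the handler roughly does:
 *
 *	rdmsrq(MSR_IA32_XFD_ERR, xfd_err);
 *	if (xfd_err) {
 *		wrmsrq(MSR_IA32_XFD_ERR, 0);
 *		if (xfd_enable_feature(xfd_err))
 *			force_sig_fault(SIGILL, ...);
 *	}
 */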
1810 
1811 #else /* CONFIG_X86_64 */
1812 static inline int xstate_request_perm(unsigned long idx, bool guest)
1813 {
1814 	return -EPERM;
1815 }
1816 #endif  /* !CONFIG_X86_64 */
1817 
1818 u64 xstate_get_guest_group_perm(void)
1819 {
1820 	return xstate_get_group_perm(true);
1821 }
1822 EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm);
1823 
1824 /**
1825  * fpu_xstate_prctl - xstate permission operations
1826  * @option:	A subfunction of arch_prctl()
1827  * @arg2:	option argument
1828  * Return:	0 if successful; otherwise, an error code
1829  *
1830  * Option arguments:
1831  *
1832  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1833  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1834  * ARCH_REQ_XCOMP_PERM: Facility number requested
1835  *
1836  * For facilities which require more than one XSTATE component, the request
1837  * must be the highest state component number related to that facility,
1838  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1839  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1840  */
1841 long fpu_xstate_prctl(int option, unsigned long arg2)
1842 {
1843 	u64 __user *uptr = (u64 __user *)arg2;
1844 	u64 permitted, supported;
1845 	unsigned long idx = arg2;
1846 	bool guest = false;
1847 
1848 	switch (option) {
1849 	case ARCH_GET_XCOMP_SUPP:
1850 		supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1851 		return put_user(supported, uptr);
1852 
1853 	case ARCH_GET_XCOMP_PERM:
1854 		/*
1855 		 * Lockless snapshot, as the value can change again right
1856 		 * after dropping the lock.
1857 		 */
1858 		permitted = xstate_get_host_group_perm();
1859 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1860 		return put_user(permitted, uptr);
1861 
1862 	case ARCH_GET_XCOMP_GUEST_PERM:
1863 		permitted = xstate_get_guest_group_perm();
1864 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1865 		return put_user(permitted, uptr);
1866 
1867 	case ARCH_REQ_XCOMP_GUEST_PERM:
1868 		guest = true;
1869 		fallthrough;
1870 
1871 	case ARCH_REQ_XCOMP_PERM:
1872 		if (!IS_ENABLED(CONFIG_X86_64))
1873 			return -EOPNOTSUPP;
1874 
1875 		return xstate_request_perm(idx, guest);
1876 
1877 	default:
1878 		return -EINVAL;
1879 	}
1880 }
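
/*
 * Userspace usage sketch (constants from <asm/prctl.h>; component 18 is
 * XFEATURE_XTILE_DATA; error handling omitted): query the supported
 * mask, then request AMX tile data permission before touching tile
 * registers:
 *
 *	unsigned long long supp = 0;
 *
 *	syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supp);
 *	if (supp & (1ULL << 18))
 *		syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, 18);
 */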
1881 
1882 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1883 /*
1884  * Report the time elapsed, in milliseconds, since the task last used
1885  * AVX-512. Report -1 if AVX-512 was never used.
1886  */
1887 static void avx512_status(struct seq_file *m, struct task_struct *task)
1888 {
1889 	unsigned long timestamp;
1890 	long delta = -1;
1891 
1892 	/* AVX-512 usage is not tracked for kernel threads. Don't report anything. */
1893 	if (task->flags & (PF_KTHREAD | PF_USER_WORKER))
1894 		return;
1895 
1896 	timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp);
1897 
1898 	if (timestamp) {
1899 		delta = (long)(jiffies - timestamp);
1900 		/*
1901 		 * Cap to LONG_MAX if time difference > LONG_MAX
1902 		 */
1903 		if (delta < 0)
1904 			delta = LONG_MAX;
1905 		delta = jiffies_to_msecs(delta);
1906 	}
1907 
1908 	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1909 	seq_putc(m, '\n');
1910 }
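
/*
 * Resulting /proc/<pid>/arch_status line (value illustrative):
 *
 *	AVX512_elapsed_ms:	120
 *
 * A value of -1 means the task has never used AVX-512.
 */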
1911 
1912 /*
1913  * Report architecture specific information
1914  */
1915 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1916 			struct pid *pid, struct task_struct *task)
1917 {
1918 	/*
1919 	 * Report AVX-512 state if the processor and the kernel build support it.
1920 	 */
1921 	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1922 		avx512_status(m, task);
1923 
1924 	return 0;
1925 }
1926 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
1927 
1928 #ifdef CONFIG_COREDUMP
1929 static const char owner_name[] = "LINUX";
1930 
1931 /*
1932  * Dump type, size, offset and flag values for every xfeature that is present.
1933  */
1934 static int dump_xsave_layout_desc(struct coredump_params *cprm)
1935 {
1936 	int num_records = 0;
1937 	int i;
1938 
1939 	for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1940 		struct x86_xfeat_component xc = {
1941 			.type   = i,
1942 			.size   = xstate_sizes[i],
1943 			.offset = xstate_offsets[i],
1944 			/* reserved for future use */
1945 			.flags  = 0,
1946 		};
1947 
1948 		if (!dump_emit(cprm, &xc, sizeof(xc)))
1949 			return 0;
1950 
1951 		num_records++;
1952 	}
1953 	return num_records;
1954 }
1955 
1956 static u32 get_xsave_desc_size(void)
1957 {
1958 	u32 cnt = 0;
1959 	u32 i;
1960 
1961 	for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1962 		cnt++;
1963 
1964 	return cnt * (sizeof(struct x86_xfeat_component));
1965 }
1966 
1967 int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1968 {
1969 	int num_records = 0;
1970 	struct elf_note en;
1971 
1972 	if (!fpu_user_cfg.max_features)
1973 		return 0;
1974 
1975 	en.n_namesz = sizeof(owner_name);
1976 	en.n_descsz = get_xsave_desc_size();
1977 	en.n_type = NT_X86_XSAVE_LAYOUT;
1978 
1979 	if (!dump_emit(cprm, &en, sizeof(en)))
1980 		return 1;
1981 	if (!dump_emit(cprm, owner_name, en.n_namesz))
1982 		return 1;
1983 	if (!dump_align(cprm, 4))
1984 		return 1;
1985 
1986 	num_records = dump_xsave_layout_desc(cprm);
1987 	if (!num_records)
1988 		return 1;
1989 
1990 	/* The emitted records must exactly fill the advertised desc size */
1991 	if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1992 		return 1;
1993 
1994 	return 0;
1995 }
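
/*
 * Resulting note layout in the core file (n_namesz == 6 for "LINUX"
 * including the NUL terminator):
 *
 *	struct elf_note  { n_namesz, n_descsz, n_type = NT_X86_XSAVE_LAYOUT }
 *	"LINUX\0"        plus two bytes of padding to a 4 byte boundary
 *	one struct x86_xfeat_component per extended xfeature
 */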
1996 
1997 int elf_coredump_extra_notes_size(void)
1998 {
1999 	int size;
2000 
2001 	if (!fpu_user_cfg.max_features)
2002 		return 0;
2003 
2004 	/* .note header */
2005 	size  = sizeof(struct elf_note);
2006 	/*  Name plus alignment to 4 bytes */
2007 	size += roundup(sizeof(owner_name), 4);
2008 	size += get_xsave_desc_size();
2009 
2010 	return size;
2011 }
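
/*
 * Size arithmetic, illustrated (assuming the usual 12 byte struct
 * elf_note and 16 byte struct x86_xfeat_component): with three extended
 * xfeatures present the note occupies
 *
 *	12 + roundup(sizeof("LINUX"), 4) + 3 * 16 = 12 + 8 + 48 = 68 bytes
 */
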
2012 #endif /* CONFIG_COREDUMP */
2013